In [2]:
import numpy as np
import scipy as sc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import re
import os

Download the SEED-IV dataset from here : https://www.kaggle.com/datasets/phhasian0710/seed-iv

## Explaining the files structure:
### The "eeg_raw_data" folder:
   * Contains 3 inner folders named 1, 2, 3, corresponding to the 3 sessions.
      * Each .mat file inside those folders belongs to one of the 15 subjects (named {SubjectName}_{Date}.mat) and bundles several arrays:
         * The .mat file contains the EEG signals recorded during 24 trials for 62 channels
   * Each of the 24 trials in each session folder (1, 2 or 3) has a label, and the labels are the same across all subjects 

**Each class has 18 trials per subject (6 in each of the 3 sessions), so the data is perfectly balanced**

session [1-3]
   * subject [1-15]
      * trial [1-24]
         * channel [1-62]

### Now that we know this, we can calculate the dataset size:
3 sessions * 15 subjects * 24 trials * 62 channels = 66960 raw EEG signals (before windowing)

**Label Mapping**:
- Neutral: 0
- Sad: 1
- Fear: 2
- Happy: 3

In [3]:
# Emotion label of every trial in the raw encoding listed above
# (0 = neutral, 1 = sad, 2 = fear, 3 = happy).
# Rows are the 3 sessions, columns are the 24 trials of that session;
# the same labels apply to every subject.
labels = np.array([
    [1,2,3,0,2,0,0,1,0,1,2,1,1,1,2,3,2,2,3,3,0,3,0,3],  # session 1
    [2,1,3,0,0,2,0,2,3,3,2,3,2,0,1,1,2,1,0,3,0,1,3,1],  # session 2
    [1,2,2,1,3,3,3,1,1,2,1,0,2,3,3,0,2,3,0,0,2,0,1,0]   # session 3
])

In [4]:
# Sanity check: expect one row per session and one column per trial.
labels.shape

(3, 24)

Mapping the sad and fear emotions to a single negative class. This will lead to an unbalanced dataset, which is a problem we will solve later.

In [5]:
# Collapse the four emotion classes into three sentiment classes:
#   neutral (0) -> -1,  happy (3) -> 0,  sad (1) and fear (2) -> 1
#
# The remap only runs while `labels` still holds the raw 0..3 encoding (recognised
# by the presence of a 2 or a 3), so re-running this cell is a no-op. Without the
# guard a second run would silently turn happy (0) into neutral (-1).
if np.isin(labels, (2, 3)).any():
    # Compare against a snapshot so an already-rewritten value is never remapped twice.
    rawLabels = labels.copy()
    labels[rawLabels == 0] = -1                       # neutral
    labels[rawLabels == 3] = 0                        # happy -> positive
    labels[(rawLabels == 1) | (rawLabels == 2)] = 1   # sad, fear -> negative

**Final label mapping**:
- Neutral: -1
- Positive (Happy): 0
- Negative (Sad , Fear): 1

In [6]:
# Display the remapped labels (-1 = neutral, 0 = positive, 1 = negative).
labels

array([[ 1,  1,  0, -1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  0,
         1,  1,  0,  0, -1,  0, -1,  0],
       [ 1,  1,  0, -1, -1,  1, -1,  1,  0,  0,  1,  0,  1, -1,  1,  1,
         1,  1, -1,  0, -1,  1,  0,  1],
       [ 1,  1,  1,  1,  0,  0,  0,  1,  1,  1,  1, -1,  1,  0,  0, -1,
         1,  0, -1, -1,  1, -1,  1, -1]])

We can't load more than one session at a time because of the resources it needs; if we try to load all the data at once, the computer will crash.

In [7]:
def loadSession(k):
    '''
    Load the raw recordings of every subject of one session.

    Parameters
    ----------
    k : int or str
        Name of the session folder under 'SEED-IV/eeg_raw_data/'.

    Returns
    -------
    list
        One entry per .mat file of the session, ordered by the numeric subject
        prefix of the file name. Each entry is the list of values stored in that
        file, in the order loadmat returns them, with loadmat's '__'-prefixed
        metadata entries dropped.
    '''
    sessionPath = f'SEED-IV/eeg_raw_data/{k}/'

    def subjectOrder(fileName):
        # Files are named '{subject}_{date}.mat'. Sort on the numeric prefix so that
        # '10_' comes after '9_'; names without such a prefix go last.
        prefix = re.match(r'(\d+)_', fileName)
        return (int(prefix.group(1)) if prefix else float('inf'), fileName)

    # os.listdir returns names in arbitrary order and may include non-data files,
    # so filter and sort explicitly to get a reproducible subject order.
    subjectFiles = sorted(
        (name for name in os.listdir(sessionPath) if name.endswith('.mat')),
        key=subjectOrder,
    )

    session = []
    for subjectFile in subjectFiles:
        mat = sc.io.loadmat(sessionPath + subjectFile)
        session.append([value for key, value in mat.items() if not key.startswith('__')])
    return session

In [8]:
# s1=loadSession(1)

**To index a channel by its name**

In [9]:
# Lookup table from channel name to its row position in the channel order sheet:
# reset_index() turns the row number into an 'index' column, and set_index()
# then makes the channel names the row labels.
channelsMapping = (
    pd.read_excel('SEED-IV/Channel Order.xlsx', header=None, names=['channels'])
    .reset_index()
    .set_index('channels')
)

In [10]:
def getChannel(channel):
    '''Return the value of the 'index' column of `channelsMapping` for the channel named `channel`.'''
    return channelsMapping.loc[channel, 'index']

### Let's play with the files a bit to understand it better.

In [11]:
def loadSubject(session,subject):
    '''
    Load the raw recordings of one subject in one session.

    This function is 1-based: `session` is the session folder name and `subject`
    is the numeric prefix of the '{subject}_{date}.mat' file name.

    Returns the list of values stored in the .mat file, in the order loadmat
    returns them, with loadmat's '__'-prefixed metadata entries dropped.

    Raises
    ------
    FileNotFoundError
        If no file in the session folder starts with '{subject}_'.
    '''
    sessionPath = f'SEED-IV/eeg_raw_data/{session}/'
    prefix = f'{subject}_'
    # sorted() keeps the choice deterministic should several files share the prefix.
    for file in sorted(os.listdir(sessionPath)):
        if file.startswith(prefix):
            subData = sc.io.loadmat(sessionPath + file)
            break
    else:
        # Without this branch a missing subject surfaced as an UnboundLocalError.
        raise FileNotFoundError(
            f"No file starting with '{prefix}' found in '{sessionPath}'"
        )
    return [v for k, v in subData.items() if not k.startswith('__')]

In [12]:
# --- Pre-processing configuration ---
origSamplingRate = 1000                      # Hz, passed to the filter design as the input rate
newSamplingRate = 200                        # Hz, target rate after down sampling
q = origSamplingRate // newSamplingRate      # down sampling step: keep every q-th sample
windowSize = 4                               # segment length in seconds
overlapSize = 0.1                            # fraction (0.1 = 10%) of a segment shared with the next one
noOfSamples = newSamplingRate * windowSize   # samples per segment: 200 * 4 = 800
bandpassWindow = (4, 50)                     # (low, high) cut-off frequencies in Hz

In [13]:
def downSample(trial):
    '''Keep every q-th sample of each channel in `trial` and return the result as an array.'''
    decimated = []
    for channel in trial:
        decimated.append(channel[::q])
    return np.array(decimated)

In [14]:
def segmentChannel(ch):
    '''
    Cut one channel into overlapping windows of `noOfSamples` samples.

    Consecutive windows start `newSamplingRate * windowSize * (1 - overlapSize)`
    samples apart (720 with the notebook defaults, i.e. 10% overlap). If the
    regular windows leave samples at the end uncovered, one extra window
    `ch[-noOfSamples:]` is appended so the whole signal is covered; its overlap
    with the previous window can therefore be anything from 10% up to almost 100%.

    Parameters
    ----------
    ch : 1-D array-like
        Samples of a single channel.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of windows, noOfSamples). A channel shorter than
        one window yields an empty array of shape (0, noOfSamples).

    Raises
    ------
    ValueError
        If the configured overlap leaves a step of less than one sample.
    '''
    ch = np.asarray(ch)
    # round() before int() so float error (e.g. 559.9999999) cannot shave a sample off the step.
    stepSize = int(round(newSamplingRate * windowSize * (1 - overlapSize)))
    if stepSize < 1:
        raise ValueError('overlapSize must leave a step of at least one sample')

    # No full window fits: previously this crashed on the unbound `end`.
    if len(ch) < noOfSamples:
        return np.empty((0, noOfSamples), dtype=ch.dtype)

    starts = list(range(0, len(ch) - noOfSamples + 1, stepSize))
    # The last regular window ends (exclusively) at starts[-1] + noOfSamples; anything
    # short of len(ch) means samples are left over, even a single one.
    if starts[-1] + noOfSamples < len(ch):
        starts.append(len(ch) - noOfSamples)
    return np.array([ch[start:start + noOfSamples] for start in starts])

In [15]:
def segmentTrial(trial):
    '''Apply segmentChannel to every channel of `trial` and return the results as a list.'''
    return list(map(segmentChannel, trial))

In [16]:
def preProcess(subData):
    '''
    Band-pass filter, down sample and segment every trial of one subject.

    Each trial is filtered with a 4th-order Butterworth band-pass whose cut-offs
    are `bandpassWindow` (designed for a sampling rate of `origSamplingRate`),
    then passed through downSample and segmentTrial.

    Parameters
    ----------
    subData : iterable
        The trials of one subject, e.g. the list returned by loadSubject.
        lfilter filters along the last axis of each trial.

    Returns
    -------
    list
        One entry per trial: the value segmentTrial returns for that trial.
    '''
    # NOTE: a plain (non f-string) docstring is required; an f-string is evaluated
    # and discarded, which left preProcess.__doc__ empty.
    b, a = sc.signal.butter(4, Wn=bandpassWindow, btype='bandpass', fs=origSamplingRate)
    # Process trial by trial so the full-rate filtered copy of one trial can be freed
    # before the next is filtered, instead of holding every intermediate for all trials.
    return [
        segmentTrial(downSample(sc.signal.lfilter(b, a, trial)))
        for trial in subData
    ]

In [17]:
# Raw recordings of subject 1 in session 1 (both arguments are 1-based).
s = loadSubject(1,1)

### Plotting the signal to show the effect of preprocessing
We will plot the signal of the first channel in the first trial of the first subject in the first session.

In [28]:
# Raw signal: first 10001 samples of s[0][0] (first trial, first channel).
px.line(s[0][0][:10001])

In [19]:
# Design the same 4th-order Butterworth band-pass that preProcess uses ...
b, a = sc.signal.butter(4, Wn=bandpassWindow, btype='bandpass', fs=origSamplingRate)
# ... and apply it to every trial of `s` (lfilter works along the last axis).
filteredSignal = [sc.signal.lfilter(b, a, trial) for trial in s]

#### After applying butterworth bandpass filter 

In [29]:
# Same slice as the raw plot, taken from the band-pass filtered signal.
px.line(filteredSignal[0][0][:10001])

#### After downsampling from 1000 to 200

In [21]:
# Down sample every filtered trial (keeps every q-th sample of each channel).
downSampledSignal = list(map(downSample, filteredSignal))

In [31]:
# First 2001 samples of the down sampled signal (same trial and channel as the plots above).
px.line(downSampledSignal[0][0][:2001])

Some plotting for comparisons

Comparing how the EEG signals of different subjects look for the same video (same label)

In [23]:
# Trial indexes of the first session (row 0 of `labels`) whose label is 0, i.e. positive.
posIndex = np.where(labels[0] == 0)[0]

In [24]:
# Raw recordings of subjects 1, 2 and 3 from session 1.
s1, s2, s3 = (loadSubject(1, subjectId) for subjectId in (1, 2, 3))

In [25]:
# Filter, down sample and segment each of the three subjects.
p1, p2, p3 = map(preProcess, (s1, s2, s3))

In [26]:
# One stacked panel per subject, each showing the first segment of the PZ channel
# for the third positive trial of session 1.
pzChannel = getChannel('PZ')
trialIndex = posIndex[2]

fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=("Subject 1", "Subject 2", "Subject 3"),
)
for rowNumber, processed in enumerate((p1, p2, p3), start=1):
    fig.add_trace(
        go.Scatter(
            y=processed[trialIndex][pzChannel][0],
            mode="lines",
            name=f"Subject {rowNumber}",
        ),
        row=rowNumber, col=1
    )
fig.update_layout(
    title_text="A segment of the EEG PZ Channel Across 3 subject given the same trial (same movie and same label)",
    height=700,
    showlegend=False
)
# Only the bottom panel carries the x-axis label.
fig.update_xaxes(title_text="Sample Number", row=3, col=1)

fig.show()