In [1]:
# Load the EEG recordings from the CSV export into a dataframe.
import pandas as pd
df = pd.read_csv('data/EEG_data.csv')


# The recordings of subject 2.0 were flagged as possibly corrupt, so keep
# only the rows that belong to the other subjects (original row labels are
# preserved; the index is not reset).
df = df.loc[df['SubjectID'] != 2.0]

In [8]:
# Build a two-level dictionary for easy access: ID[subject][video].
# Example: the entry for the student with index 5 watching the video with
# index 2 is ID[5][2].
#
# Every (subject, video) recording is reduced to a single row by taking the
# mean of each column, so each entry is a Series of column means.
# To keep all raw samples instead, store `recording` rather than
# `recording.mean()` below.

ID = {}

# A single groupby pass replaces the repeated full-frame scans (and the
# chained .loc calls whose second mask was built from the unfiltered frame
# and only worked through implicit index alignment).
# sort=False yields groups in order of first appearance, i.e. the same key
# order that iterating over .unique() produced.
# Iterating a groupby yields sub-frames that still contain the key columns,
# so SubjectID and VideoID remain part of each mean Series.
for (sub_id, vid_id), recording in df.groupby(['SubjectID', 'VideoID'], sort=False):
    ID.setdefault(sub_id, {})[vid_id] = recording.mean()

{0.0: {0.0: SubjectID                  0.000000
VideoID                    0.000000
Attention                 55.256944
Mediation                 53.826389
Raw                       46.986111
Delta                 544315.097222
Theta                 124965.590278
Alpha1                 36693.701389
Alpha2                 25875.298611
Beta1                  20108.791667
Beta2                  40268.763889
Gamma1                 40729.284722
Gamma2                 16817.062500
predefinedlabel            0.000000
user-definedlabeln         0.000000
dtype: float64, 1.0: SubjectID                  0.000000
VideoID                    1.000000
Attention                 43.621429
Mediation                 48.621429
Raw                       28.800000
Delta                 739737.292857
Theta                 161064.228571
Alpha1                 34918.028571
Alpha2                 25078.935714
Beta1                  22157.307143
Beta2                  37410.728571
Gamma1                 36758.70

In [9]:
# Convert the nested dictionary into one pandas dataframe.
#
# Each subject contributes one frame whose rows are that subject's videos
# (orient='index' turns the inner dict keys, the video ids, into the row
# index). The row index of the combined frame is therefore the video id and
# repeats once per subject.
per_subject_frames = [
    pd.DataFrame.from_dict(videos, orient='index') for videos in ID.values()
]

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration; concatenate all pieces once instead.
# pd.concat raises on an empty list, so fall back to an empty frame, which
# is what the append loop produced when ID was empty.
average_data = pd.concat(per_subject_frames) if per_subject_frames else pd.DataFrame()

     SubjectID  VideoID  Attention  Mediation        Raw         Delta  \
0.0        0.0      0.0  55.256944  53.826389  46.986111  5.443151e+05   
1.0        0.0      1.0  43.621429  48.621429  28.800000  7.397373e+05   
2.0        0.0      2.0  43.978873  47.316901  13.154930  6.940781e+05   
3.0        0.0      3.0  51.057377  51.844262  34.713115  6.008237e+05   
4.0        0.0      4.0  55.224138  47.474138  30.008621  5.466280e+05   
5.0        0.0      5.0  49.813008  41.048780  38.804878  9.352090e+05   
6.0        0.0      6.0  33.586207  44.310345  44.224138  7.850926e+05   
7.0        0.0      7.0  48.000000  50.241071  28.267857  4.779900e+05   
8.0        0.0      8.0  55.548387  52.879032  29.564516  9.144155e+05   
9.0        0.0      9.0  49.934426  47.270492  37.286885  6.022494e+05   
0.0        1.0      0.0  52.457143  62.521429  39.578571  2.659105e+05   
1.0        1.0      1.0  40.098592  57.922535  25.802817  4.786495e+05   
2.0        1.0      2.0  34.721311  53

In [10]:
# Extract the EEG measurements used as model features.

# Single source of truth for the feature columns and their order.
EEG_FEATURES = ['Attention', 'Mediation', 'Raw', 'Delta', 'Theta', 'Alpha1',
                'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']

# Per-column Series, kept under their original names in case later cells
# refer to them individually.
(Attention, Mediation, Raw, Delta, Theta, Alpha1,
 Alpha2, Beta1, Beta2, Gamma1, Gamma2) = (average_data[name] for name in EEG_FEATURES)

# Feature matrix: one column selection instead of creating an empty frame
# and assigning the columns one by one. .copy() makes SET independent of
# average_data.
SET = average_data[EEG_FEATURES].copy()
SET.head()

     Attention  Mediation        Raw         Delta          Theta  \
0.0  55.256944  53.826389  46.986111  5.443151e+05  124965.590278   
1.0  43.621429  48.621429  28.800000  7.397373e+05  161064.228571   
2.0  43.978873  47.316901  13.154930  6.940781e+05  149816.873239   
3.0  51.057377  51.844262  34.713115  6.008237e+05  162653.360656   
4.0  55.224138  47.474138  30.008621  5.466280e+05  126893.948276   
5.0  49.813008  41.048780  38.804878  9.352090e+05  282487.512195   
6.0  33.586207  44.310345  44.224138  7.850926e+05  212861.879310   
7.0  48.000000  50.241071  28.267857  4.779900e+05  114145.276786   
8.0  55.548387  52.879032  29.564516  9.144155e+05  198926.048387   
9.0  49.934426  47.270492  37.286885  6.022494e+05  130773.500000   
0.0  52.457143  62.521429  39.578571  2.659105e+05   83320.414286   
1.0  40.098592  57.922535  25.802817  4.786495e+05  143226.676056   
2.0  34.721311  53.827869  54.934426  6.206911e+05  153239.770492   
3.0  15.905172  35.560345  35.8103

In [12]:
# Extract the labels: a one-column dataframe holding the VideoID of every
# row of average_data (same row index as average_data).

labels = average_data[['VideoID']].copy()
labels.head()

Unnamed: 0,VideoID
0.0,0.0
1.0,1.0
2.0,2.0
3.0,3.0
4.0,4.0


In [15]:
# Split the data into 80% training and 20% test rows.
from sklearn.model_selection import train_test_split as tts

# Fixed seed so the split, and therefore the reported accuracy, is the same
# on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = tts(
    SET, labels, train_size = 0.8, test_size = 0.2, random_state = SPLIT_SEED
)


# Choose a classifier.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

# Fit the model. The target is passed as a 1-D Series (the VideoID column),
# not as the one-column dataframe.
# NOTE(review): the target here is VideoID, so the model is asked to tell
# which video was watched; the data also has 'predefinedlabel' and
# 'user-definedlabeln' columns -- confirm VideoID is the intended target.
clf.fit(X_train, Y_train['VideoID'])
predictions = clf.predict(X_test) # predictions for the rows of X_test


# Observe the (in)accuracy: mean accuracy on the held-out rows, scored
# against the same 1-D target form that was used for fitting.
clf.score(X_test, Y_test['VideoID'])

0.05555555555555555