In [191]:
#load the EEG recordings from disk
import pandas as pd
df = pd.read_csv('data/EEG_data.csv')


#discard every row recorded for subject 2.0, whose data is possibly corrupt
is_suspect_subject = df.SubjectID == 2.0
df = df.drop(df.index[is_suspect_subject])

In [192]:
#write data into a two-dimensional dictionary for easy access
#example of use: to get data about student with index 5 watching video with index 2, use ID[5][2]

#Each (student, video) recording is reduced to a single row by taking the mean of every column,
#so ID[sub_id][vid_id] is a pandas Series of per-column means (SubjectID and VideoID included).
#To keep all of the rows instead, store `recording` rather than `recording.mean()`.

#A single groupby pass replaces re-filtering the whole dataframe for every student/video pair.
#sort=False keeps students and videos in order of first appearance, as df[...].unique() did.
#NOTE: groupby skips rows whose SubjectID or VideoID is missing (NaN).
ID = {}
for (sub_id, vid_id), recording in df.groupby(['SubjectID', 'VideoID'], sort=False):
    ID.setdefault(sub_id, {})[vid_id] = recording.mean()

In [268]:
#convert the dictionary to a pandas dataframe
#Result: one row of values per (student, video) entry of ID. Rows are indexed by video id,
#so index values repeat across students.

#Build every per-student frame first and concatenate once: DataFrame.append was removed in
#pandas 2.0, and appending inside a loop re-copies the accumulated frame on every iteration.
per_student_frames = [pd.DataFrame.from_dict(ID[sub_id], orient = 'index') for sub_id in ID]

#pd.concat raises on an empty list, so fall back to the empty frame the loop used to start from
average_data = pd.concat(per_student_frames) if per_student_frames else pd.DataFrame()

In [200]:
#extract eeg measurements

#Columns of average_data used as model features, in the order they appear in SET.
#'Mediation' is the column name exactly as it exists in average_data; do not "correct" it here.
EEG_COLUMNS = ['Attention', 'Mediation', 'Raw', 'Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']

#Feature matrix: select all EEG columns in one step instead of filling an empty frame
#column by column; .copy() keeps SET independent of average_data.
SET = average_data[EEG_COLUMNS].copy()

#Per-signal Series, kept under their original names for any code that uses them directly
(Attention, Mediation, Raw, Delta, Theta, Alpha1,
 Alpha2, Beta1, Beta2, Gamma1, Gamma2) = (average_data[column] for column in EEG_COLUMNS)

SET.head()

Unnamed: 0,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2
0.0,55.256944,53.826389,46.986111,544315.097222,124965.590278,36693.701389,25875.298611,20108.791667,40268.763889,40729.284722,16817.0625
1.0,43.621429,48.621429,28.8,739737.292857,161064.228571,34918.028571,25078.935714,22157.307143,37410.728571,36758.7,14519.407143
2.0,43.978873,47.316901,13.15493,694078.084507,149816.873239,30493.873239,21667.591549,21888.338028,36446.43662,33908.873239,14545.84507
3.0,51.057377,51.844262,34.713115,600823.688525,162653.360656,33367.278689,26281.5,17224.278689,43706.52459,41438.213115,16558.631148
4.0,55.224138,47.474138,30.008621,546628.017241,126893.948276,23113.844828,17017.051724,15955.87931,36427.836207,36024.818966,14752.655172


In [178]:
#extract labels

#One-column dataframe holding the VideoID column of average_data;
#the double brackets keep the result a dataframe rather than a Series.
labels = average_data[['VideoID']].copy()
labels.head()

Unnamed: 0,VideoID
0.0,0.0
1.0,1.0
2.0,2.0
3.0,3.0
4.0,4.0


In [280]:
#split the data (80% train / 20% test)
from sklearn.model_selection import train_test_split as tts

#Fixed seed so the split, and therefore the score below, is identical on every run.
#Without it each execution shuffles differently and the reported accuracy is not reproducible.
SPLIT_SEED = 0
X_train, X_test, Y_train, Y_test = tts(SET, labels, train_size = 0.8, test_size = 0.2, random_state = SPLIT_SEED)


#choose a classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()


#fit the model
#the target is passed as the 1-D 'VideoID' column rather than the one-column dataframe
clf.fit(X_train, Y_train['VideoID'])
predictions = clf.predict(X_test) #predictions of Y_test elements


#observe the (in)accuracy
#score() is the fraction of test rows whose predicted VideoID matches the true one;
#the same 1-D label column is used here as in fit() above.
clf.score(X_test, Y_test['VideoID'])

0.05555555555555555