In [1]:
# import the libraries
import pandas as pd
import numpy as np
from scipy.io.arff import loadarff
from matplotlib import pyplot as plt

### Preprocess EEG Eye State Dataset

In [2]:
# EEG Eye State dataset: 14980 samples, 14 attributes, and a binary label in
# the last column.
#
# loadarff reads most ARFF files, including files with missing data, in which
# case the missing data points are represented as NaNs. That matters for data
# preprocessing in general, but the data used here has no missing values.
arff_path = 'EEG-Eye-State.arff'
EEG_Eye_State, meta = loadarff(arff_path)

In [3]:
# `meta` holds the header information of the ARFF file; displaying it lists the
# dataset name and each attribute's name and type, as shown below
meta

Dataset: EEG_DATA
	AF3's type is numeric
	F7's type is numeric
	F3's type is numeric
	FC5's type is numeric
	T7's type is numeric
	P7's type is numeric
	O1's type is numeric
	O2's type is numeric
	P8's type is numeric
	T8's type is numeric
	FC6's type is numeric
	F4's type is numeric
	F8's type is numeric
	AF4's type is numeric
	eyeDetection's type is nominal, range is ('0', '1')

In [4]:
# EEG_Eye_State holds the records of the ARFF file, accessible by attribute name.
# The elements of the nominal label column have the type numpy.bytes_, so every
# column is converted to float or int explicitly before it is put into the data
# matrix; otherwise the matrix could not be manipulated without errors.
attribute_names = meta.names()
feature_names, label_name = attribute_names[:-1], attribute_names[-1]

# Load the attributes as type float
feature_columns = [EEG_Eye_State[name].astype(float) for name in feature_names]

# Load the label as type int
label_column = EEG_Eye_State[label_name].astype(int)

# Stack all columns in one step instead of growing the matrix column by column.
# The sample and attribute counts come from the data itself, so nothing here is
# tied to this dataset's 14980 x 14 shape. Mixing the float attributes with the
# int label yields a float matrix, so the label is stored as 0.0 / 1.0.
Eye_State_data = np.column_stack(feature_columns + [label_column])

# Convert to pandas DataFrame for easier manipulation
df = pd.DataFrame(data=Eye_State_data, columns=attribute_names)

# First 10 samples
df.head(10)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0.0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0.0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0.0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0.0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0.0
5,4321.03,4004.62,4284.1,4153.33,4345.64,4587.18,4093.33,4616.92,4202.56,4232.82,4209.74,4281.03,4628.21,4389.74,0.0
6,4319.49,4001.03,4280.51,4151.79,4343.59,4584.62,4089.74,4615.9,4212.31,4226.67,4201.03,4269.74,4625.13,4378.46,0.0
7,4325.64,4006.67,4278.46,4143.08,4344.1,4583.08,4087.18,4614.87,4205.64,4230.26,4195.9,4266.67,4622.05,4380.51,0.0
8,4326.15,4010.77,4276.41,4139.49,4345.13,4584.1,4091.28,4608.21,4187.69,4229.74,4202.05,4273.85,4627.18,4389.74,0.0
9,4326.15,4011.28,4276.92,4142.05,4344.1,4582.56,4092.82,4608.72,4194.36,4228.72,4212.82,4277.95,4637.44,4393.33,0.0


In [5]:
# Seed for the shuffle so the fold assignment (and therefore every accuracy
# reported below) is reproducible after Restart Kernel -> Run All
SEED = 42
rng = np.random.default_rng(SEED)

# Data matrix with the rows in random order. A single permutation is already
# uniformly random, so repeating the shuffle adds nothing but run time.
# permutation() returns a shuffled copy, which leaves `df` itself in the
# original order shown above.
EEG_Matrix = rng.permutation(df.values)

# Attributes are every column but the last; the label is the last column
EEG_Data_Matrix = EEG_Matrix[:, :-1].astype('float')
EEG_Data_Labels = EEG_Matrix[:, -1].astype('int')

print("Dim(EEG_Data_Matrix) = ", EEG_Data_Matrix.shape)
print("Dim(EEG_Data_Labels) = ", EEG_Data_Labels.shape)

Dim(EEG_Data_Matrix) =  (14980, 14)
Dim(EEG_Data_Labels) =  (14980,)


### Training & Testing, using k = 10 cross validation

**Partitioning data and labels into folds, 1498 samples per fold**

In [6]:
# Number of cross-validation folds
N_FOLDS = 10

# Samples per fold, derived from the data (1498 for the 14980-sample dataset)
f = len(EEG_Data_Matrix) // N_FOLDS

# np.array() can only stack the folds into one regular array when they all
# have the same size, so fail early with a clear message otherwise
assert f * N_FOLDS == len(EEG_Data_Matrix), "number of samples must be divisible by N_FOLDS"

# Row range covered by each fold, in order
fold_slices = [slice(k * f, (k + 1) * f) for k in range(N_FOLDS)]

# Partitioning data into folds: shape (N_FOLDS, f, number of attributes)
EEG_Xfolds = np.array([EEG_Data_Matrix[rows] for rows in fold_slices])

# Partitioning labels into folds: shape (N_FOLDS, f)
EEG_Labels_folds = np.array([EEG_Data_Labels[rows] for rows in fold_slices])

# Store the accuracy of each cross validation iteration
EEG_SVM_accuracies = []

**SVM Training and Test Method**<br>
The arguments are the (1-based) numbers of the nine folds to train on, followed by the number of the fold to test on. The method fits sklearn's SVM classifier (`SVC`) on the training data and labels, then predicts the labels of the test fold and compares the predictions with the true test labels to compute the accuracy.

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Returns the accuracy on the test fold of an SVM trained on the training folds
def EEG_TrainTestSVM(f1, f2, f3, f4, f5, f6, f7, f8, f9, ftest):
    """Train an SVM on nine folds and score it on the remaining fold.

    Parameters
    ----------
    f1, f2, f3, f4, f5, f6, f7, f8, f9 : int
        1-based numbers of the folds (positions in EEG_Xfolds and
        EEG_Labels_folds, plus one) that are used for training.
    ftest : int
        1-based number of the fold that is used for testing.

    Returns
    -------
    float
        Accuracy of the classifier's predictions on the test fold, as
        computed by sklearn's accuracy_score.
    """
    train_folds = (f1, f2, f3, f4, f5, f6, f7, f8, f9)

    # Train Data and Labels: the chosen folds stacked on top of each other
    train_data = np.concatenate([EEG_Xfolds[k - 1] for k in train_folds])
    train_labels = np.concatenate([EEG_Labels_folds[k - 1] for k in train_folds])

    # Test Data and Labels
    test_data = EEG_Xfolds[ftest - 1]
    test_labels = EEG_Labels_folds[ftest - 1]

    # SVM Train
    # NOTE(review): the attributes are passed to the classifier unscaled --
    # confirm that this is intended.
    clf = SVC(gamma='auto')
    clf.fit(train_data, train_labels)

    # Test SVM: predict the whole test fold in one call, so the function works
    # for any fold size instead of assuming exactly 1498 test samples
    predictions = clf.predict(test_data)

    return accuracy_score(test_labels, predictions)

**Cross validation iterations**

In [8]:
# EEG_TrainTestSVM takes nine training folds and one test fold, so the cross
# validation below always runs over ten folds, numbered 1..10
fold_numbers = range(1, 11)

# Start from an empty list so that re-running this cell does not append a
# second set of accuracies to the results of an earlier run
EEG_SVM_accuracies.clear()

# Iterations 1..10: iteration 1 tests on fold 10, iteration 2 on fold 9, and so
# on down to fold 1; the other nine folds (in ascending order) are used for
# training each time
for test_fold in reversed(fold_numbers):
    train_folds = [fold for fold in fold_numbers if fold != test_fold]
    iteration_accuracy = EEG_TrainTestSVM(*train_folds, test_fold)
    EEG_SVM_accuracies.append(iteration_accuracy)


print(EEG_SVM_accuracies)

[0.5520694259012016, 0.5640854472630173, 0.5380507343124166, 0.5487316421895861, 0.5574098798397864, 0.5413885180240321, 0.5514018691588785, 0.5514018691588785, 0.5520694259012016, 0.5554072096128171]


**Printing mean SVM accuracy across all k = 10 cross validation iterations**

In [9]:
# Average the per-iteration accuracies collected during cross validation
mean_accuracy = np.mean(EEG_SVM_accuracies)
print("Mean accuracy = ", mean_accuracy)

Mean accuracy =  0.5512016021361816
