### Clinical BCI Challenge-WCCI2020
- [website link](https://sites.google.com/view/bci-comp-wcci/?fbclid=IwAR37WLQ_xNd5qsZvktZCT8XJerHhmVb_bU5HDu69CnO85DE3iF0fs57vQ6M)


- [Dataset Link](https://github.com/5anirban9/Clinical-Brain-Computer-Interfaces-Challenge-WCCI-2020-Glasgow)
 

In [2]:
import mne
from scipy.io import loadmat
import scipy
import sklearn
import numpy as np
import pandas as pd
import glob
from mne.decoding import CSP
import os

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [31]:
import warnings
warnings.filterwarnings('ignore') # to ignore warnings

In [5]:
# Single switch for MNE console chatter: it is the default `verbose` value of the loader
# functions defined further down and the global MNE log level set here.
verbose = False
mne.set_log_level(verbose)  # suppresses the large info outputs of MNE functions

In [6]:
# Evaluation metrics wrapped as scikit-learn scorers. Cohen's kappa is the one actually used.
_metrics = sklearn.metrics
kappa = _metrics.make_scorer(_metrics.cohen_kappa_score)  # kappa scorer
acc = _metrics.make_scorer(_metrics.accuracy_score)       # accuracy scorer

# `scorer` is what every cross-validation call below receives; point it at `acc`
# (or any other scorer) to switch the metric for the whole notebook.
scorer = kappa

In [7]:
# Number of parallel workers passed to the grid searches below:
# -1 uses every available core; set it to 1 if parallel runs cause memory issues.
n_jobs = -1

## Data Loading and Conversion to MNE Datatypes
[Mike Cohen Tutorials link for EEG Preprocessing](https://www.youtube.com/watch?v=uWB5tjhataY&list=PLn0OLiymPak2gDD-VDA90w9_iGDgOOb2o)

In [9]:
# IPython keeps its directory history in `_dh`; the first entry is the folder the kernel was
# started in, i.e. the folder holding this notebook. Outside IPython (for example when the
# code is exported to a plain script) `_dh` does not exist, so fall back to the current
# working directory instead of failing with a KeyError.
current_folder = globals().get('_dh', [os.getcwd()])[0]
data_path = os.path.join(current_folder, 'Data')

In [10]:
# glob returns paths in arbitrary, OS-dependent order. Sort every list so that positional
# indexing further down (evaluation_files[-2] / [-1] for subjects 9 and 10) is reproducible.
all_files        = sorted(glob.glob(data_path + '/*.mat'))
training_files   = sorted(glob.glob(data_path + '/*T.mat'))
evaluation_files = sorted(glob.glob(data_path + '/*E.mat'))
len(all_files), len(training_files), len(evaluation_files)     # if these return zero, then no file is loaded

(18, 8, 10)

In [11]:
def get_mne_epochs(filepath, verbose=verbose, t_start=2, fs=512, mode='train'):
    '''
    Read the EEG trials of one .mat file and convert them to an MNE-Python Epochs object.

    Trials are expected to cover the [0, 8] sec range. The first `t_start` seconds of every
    trial are dropped and the time axis is shifted so that t = 0 falls on the cue onset
    (3 seconds into the trial); with the defaults the output covers the [-1.0, 5.0] sec range.
    The epochs are then high-pass filtered at 1 Hz and baseline corrected. Details can be
    found in the preprocessing section of the attached document.

    Parameters
    ----------
    filepath : path of the .mat file; it must contain 'RawEEGData' and, for mode='train',
               'Labels'.
    verbose  : verbosity forwarded to mne.EpochsArray.
    t_start  : seconds to drop from the start of every trial (may be fractional).
    fs       : sampling frequency in Hz.
    mode     : 'train' attaches the class labels as events; any other value skips that step
               (used for the evaluation files).

    Returns
    -------
    mne.EpochsArray with 12 EEG channels on the standard 10-20 montage.
    '''
    cue_onset = 3.0                                   # cue position within the raw trial, in seconds
    mat_data = loadmat(filepath)                      # read .mat file
    # Slice indices must be integers, so cast explicitly; this also allows a fractional t_start.
    first_sample = int(round(fs * t_start))
    eeg_data = mat_data['RawEEGData'][:, :, first_sample:]
    event_id = {'left-hand': 1, 'right-hand': 2}
    channel_names = ['F3', 'FC3', 'C3', 'CP3', 'P3', 'FCz', 'CPz', 'F4', 'FC4', 'C4', 'CP4', 'P4']
    info = mne.create_info(ch_names=channel_names, sfreq=fs, ch_types='eeg')
    # tmin is derived from the sample actually cut at, so that t = 0 stays on the cue.
    epochs = mne.EpochsArray(eeg_data, info, verbose=verbose, tmin=first_sample / fs - cue_onset)
    epochs.set_montage('standard_1020')
    epochs.filter(1., None)                           # 1 Hz high-pass
    epochs.apply_baseline(baseline=(-.250, 0))        # baseline taken from the 250 ms before the cue

    if mode == 'train':  # labels are only read for training data
        epochs.event_id = event_id
        epochs.events[:, 2] = mat_data['Labels'].ravel()
    return epochs

def get_labels(filepath):
    '''Return the 'Labels' entry of the given .mat file flattened to a 1-D array.'''
    return np.ravel(loadmat(filepath)['Labels'])

In [32]:
# Sanity check on a single recording: load the first training file and look at the shapes.
first_training_file = training_files[0]
epochs = get_mne_epochs(first_training_file, verbose=verbose)
labels = get_labels(first_training_file)
data = epochs.get_data()
print('Shape of EEG Data: ', data.shape, '\t Shape of Labels: ', labels.shape)

Shape of EEG Data:  (80, 12, 3072) 	 Shape of Labels:  (80,)


## Let's Append Epochs

In [13]:
def get_mne_epochs_complete(files_paths, verbose=verbose, t_start=2, fs=512, mode='train'):
    '''
    Similar to get_mne_epochs, but stacks the trials of all given files (in the given order)
    into a single Epochs object.

    The first `t_start` seconds of every trial are dropped and the time axis is shifted so
    that t = 0 falls on the cue onset (3 seconds into the trial). The epochs are then
    high-pass filtered at 1 Hz and baseline corrected.

    Parameters
    ----------
    files_paths : iterable of .mat file paths; each file must contain 'RawEEGData' and, for
                  mode='train', 'Labels'.
    verbose     : verbosity forwarded to mne.EpochsArray.
    t_start     : seconds to drop from the start of every trial (may be fractional).
    fs          : sampling frequency in Hz.
    mode        : 'train' attaches the class labels as events; any other value skips that step
                  (used for the evaluation files).

    Returns
    -------
    mne.EpochsArray holding the trials of every file, 12 EEG channels, standard 10-20 montage.

    Raises
    ------
    ValueError if `files_paths` is empty.
    '''
    files_paths = list(files_paths)
    if not files_paths:
        raise ValueError('files_paths is empty: there is no .mat file to load')

    cue_onset = 3.0                                   # cue position within the raw trial, in seconds
    trials, labels = [], []
    for filepath in files_paths:
        mat_data = loadmat(filepath)                  # each file is read exactly once
        trials.append(mat_data['RawEEGData'])
        if mode == 'train':                           # 'Labels' is only accessed for training data
            labels.append(mat_data['Labels'].ravel())

    # Slice indices must be integers, so cast explicitly; this also allows a fractional t_start.
    first_sample = int(round(fs * t_start))
    eeg_data = np.concatenate(trials, axis=0)[:, :, first_sample:]
    event_id = {'left-hand': 1, 'right-hand': 2}
    channel_names = ['F3', 'FC3', 'C3', 'CP3', 'P3', 'FCz', 'CPz', 'F4', 'FC4', 'C4', 'CP4', 'P4']
    info = mne.create_info(ch_names=channel_names, sfreq=fs, ch_types='eeg')
    # tmin is derived from the sample actually cut at, so that t = 0 stays on the cue.
    epochs = mne.EpochsArray(eeg_data, info, verbose=verbose, tmin=first_sample / fs - cue_onset)
    epochs.set_montage('standard_1020')
    epochs.filter(1., None)                           # 1 Hz high-pass; callers apply a narrower band-pass afterwards
    epochs.apply_baseline(baseline=(-.250, 0))        # baseline taken from the 250 ms before the cue

    if mode == 'train':
        epochs.event_id = event_id
        epochs.events[:, 2] = np.concatenate(labels)
    return epochs

### Data Loading with Band Pass Filtering

In [14]:
# loading relevant files
# Every set gets the same 7-32 Hz band-pass on top of the loader's own preprocessing.
band_low, band_high = 7, 32

training_epochs_all = get_mne_epochs_complete(training_files).filter(band_low, band_high)  # for all training subjects

subject_9_file = evaluation_files[-2]    # second to last evaluation file
evaluation_epochs_9 = get_mne_epochs(subject_9_file, mode='eval').filter(band_low, band_high)    # for subject 9

subject_10_file = evaluation_files[-1]   # last evaluation file
evaluation_epochs_10 = get_mne_epochs(subject_10_file, mode='eval').filter(band_low, band_high)  # for subject 10

## Leave One Group Out CV

In [15]:
# group parameter for leave one group out cross validation in sklearn, each subject is given unique identifier
# One identifier (1.0 ... 8.0) per training subject, repeated once for each of the
# 80 trials in that subject's file, in file order.
subject_ids = np.linspace(1, 8, 8)
group_list = [subject for subject in subject_ids for _ in range(80)]
groups = np.array(group_list)

In [16]:
# Cross-validation splitter used by every score below: each fold holds out all trials that
# share one value of `groups` (i.e. one subject) and trains on the remaining ones.
cv = LeaveOneGroupOut()

## Let's try classification with PSDs as features
This is faster than applying CSP, and the results aren't bad either.

In [29]:
# from 0.5-4.5 sec
# NOTE(review): mne.time_frequency.psd_multitaper appears to be unavailable in recent MNE
# releases (Epochs.compute_psd is the replacement) - confirm against the pinned MNE version.
epochs = training_epochs_all.copy()
psds, freqs = mne.time_frequency.psd_multitaper(epochs, tmin=0.5, tmax=4.5, fmin=8, fmax=30, n_jobs=1)
psds = 10 * np.log10(psds)              # to convert powers to dB
labels = epochs.events[:, -1]
psds = psds.reshape(len(psds), -1)      # one flat feature vector (channels x frequencies) per trial
x_train = psds
y_train = labels

# Printed label -> classifier with default parameters; each one is scaled and scored the same way.
baseline_models = [
    ('KNN           : ', KNeighborsClassifier()),
    ('Log-Regression: ', LogisticRegression(max_iter=1000)),
    ('Linear SVM    : ', LinearSVC(random_state=0)),
    ('kernal SVM    : ', SVC(gamma='scale')),
    ('LDA           : ', lda()),
]

print('*'*10, 'Classification Scores Comparison with default Parameters' ,'*'*10)
for tag, classifier in baseline_models:
    fold_scores = cross_val_score(make_pipeline(StandardScaler(), classifier),
                                  x_train, y_train, cv=cv, scoring=scorer, groups=groups)
    print(tag, np.mean(fold_scores))

********** Classification Scores Comparison with default Parameters **********
KNN           :  0.2
Log-Regression:  0.3375
Linear SVM    :  0.3375
kernal SVM    :  0.33125
LDA           :  0.21250000000000002


In [30]:
# from 1.5-4.5 sec
epochs = training_epochs_all.copy()
psds, freqs = mne.time_frequency.psd_multitaper(epochs, tmin=1.5, tmax=4.5, fmin=8, fmax=30, n_jobs=1)
psds = 10 * np.log10(psds)              # to convert powers to dB
labels = epochs.events[:, -1]
psds = psds.reshape(len(psds), -1)      # one flat feature vector (channels x frequencies) per trial
x_train = psds
y_train = labels

# using all channels
# Printed label -> classifier with default parameters; each one is scaled and scored the same way.
baseline_models = [
    ('KNN           : ', KNeighborsClassifier()),
    ('Log-Regression: ', LogisticRegression(max_iter=1000)),
    ('Linear SVM    : ', LinearSVC(random_state=0)),
    ('kernal SVM    : ', SVC(gamma='scale')),
    ('LDA           : ', lda()),
]

print('*'*10, 'Classification Scores Comparison with default Parameters' ,'*'*10)
for tag, classifier in baseline_models:
    fold_scores = cross_val_score(make_pipeline(StandardScaler(), classifier),
                                  x_train, y_train, cv=cv, scoring=scorer, groups=groups)
    print(tag, np.mean(fold_scores))

********** Classification Scores Comparison with default Parameters **********
KNN           :  0.23750000000000004
Log-Regression:  0.359375
Linear SVM    :  0.33125
kernal SVM    :  0.35624999999999996
LDA           :  0.24375


# Grid Search 
from 1.5-4.5 sec window

In [20]:
# for training set
# Multitaper PSD of the training set over the 1.5-4.5 sec window, restricted to 8-30 Hz.
psd_settings = dict(tmin=1.5, tmax=4.5, fmin=8, fmax=30, n_jobs=1)
psds_train, _ = mne.time_frequency.psd_multitaper(training_epochs_all.copy(), **psd_settings)
psds_train = 10 * np.log10(psds_train)  # to convert powers to dB

In [21]:
# Flatten everything after the trial axis into one feature vector per trial;
# the targets are the class codes stored in the last column of the events array.
n_trials = psds_train.shape[0]
x_train = psds_train.reshape(n_trials, -1)
y_train = training_epochs_all.events[:, -1]

In [22]:
# Report the dimensions of the feature matrix and of the label vector.
for caption, array in (('Training Data Shape  : ', x_train),
                       ('Training Labels Shape: ', y_train)):
    print(caption, array.shape)

Training Data Shape  :  (640, 792)
Training Labels Shape:  (640,)


In [23]:
# k-nearest neighbours: tune the (even) neighbour count with leave-one-group-out CV.
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
param_grid_knn = {'kneighborsclassifier__n_neighbors': np.arange(2, 15, 2)}

grid_knn = GridSearchCV(knn_pipeline, param_grid=param_grid_knn,
                        cv=cv, scoring=scorer, n_jobs=n_jobs)
grid_knn.fit(x_train, y_train, groups=groups)

print('Maximum Cross Validation Score: {:.3f}'.format(grid_knn.best_score_))
print('Optimal Parameters: ', grid_knn.best_params_)

Maximum Cross Validation Score: 0.244
Optimal Parameters:  {'kneighborsclassifier__n_neighbors': 8}


In [24]:
# for logistic regression
param_grid_log_reg = {'logisticregression__C' : np.logspace(-4, 3, 16)}
grid_log_reg = GridSearchCV(make_pipeline(StandardScaler(), LogisticRegression()), 
                            param_grid=param_grid_log_reg, cv=cv, scoring=scorer) 
grid_log_reg.fit(x_train, y_train, groups=groups) 
print('Maximum Cross Validation Score: ',  round(grid_log_reg.best_score_,3))
print('Optimal Parameters: ', grid_log_reg.best_params_)

Maximum Cross Validation Score:  0.422
Optimal Parameters:  {'logisticregression__C': 0.0025118864315095794}


In [25]:
# for linear svm
param_grid_linear_svm =     {'linearsvc__C' : np.logspace(-4, 3, 15)}
grid_linear_svm = GridSearchCV(make_pipeline(StandardScaler(), LinearSVC()), 
                               param_grid=param_grid_linear_svm, cv=cv, scoring=scorer, n_jobs=n_jobs)
grid_linear_svm.fit(x_train, y_train, groups=groups) 
print('Maximum Cross Validation Score: ',  round(grid_linear_svm.best_score_,3))
print('Optimal Parameters: ', grid_linear_svm.best_params_)

Maximum Cross Validation Score:  0.434
Optimal Parameters:  {'linearsvc__C': 0.0031622776601683794}


In [26]:
# kernel svm
param_grid_kernel_svm = {'svc__C': np.logspace(-4, 3, 8),
                         'svc__gamma': np.logspace(-4, 2, 4) / x_train.shape[0]} 
                         
grid_kernel_svm = GridSearchCV(make_pipeline(StandardScaler(), SVC()), 
                            param_grid=param_grid_kernel_svm, cv=cv, scoring=scorer, n_jobs=n_jobs)
grid_kernel_svm.fit(x_train, y_train, groups=groups) 
print('Maximum Cross Validation Score: ',  round(grid_kernel_svm.best_score_,3))
print('Optimal Parameters: ', grid_kernel_svm.best_params_)

Maximum Cross Validation Score:  0.4
Optimal Parameters:  {'svc__C': 100.0, 'svc__gamma': 1.5625e-05}


In [27]:
# lda, auto shrinkage works pretty well
shrinkage = list(np.arange(0.0,1.01,0.05))
shrinkage.append('auto')

param_grid_lda = {'lineardiscriminantanalysis__shrinkage': shrinkage}   
grid_lda = GridSearchCV(make_pipeline(StandardScaler(), lda(solver='eigen')), 
                        param_grid=param_grid_lda, cv=cv, scoring=scorer, n_jobs=n_jobs)
grid_lda.fit(x_train, y_train, groups=groups) 
print('Maximum Cross Validation Score: ',  round(grid_lda.best_score_,3))
print('Optimal Parameters: ', grid_lda.best_params_)

Maximum Cross Validation Score:  0.434
Optimal Parameters:  {'lineardiscriminantanalysis__shrinkage': 0.5}


In [136]:
# predictions counts on evaluation subject 9
# Same features as for training: multitaper PSD over 1.5-4.5 sec and 8-30 Hz, in dB, flattened per trial.
psds_eval, _ = mne.time_frequency.psd_multitaper(evaluation_epochs_9.copy(), tmin=1.5, tmax=4.5, fmin=8, fmax=30, n_jobs=1)
psds_eval = 10 * np.log10(psds_eval)  # to convert powers to dB
x_eval = psds_eval.reshape(len(psds_eval), -1)

# Predict with the best estimator of every grid search.
preds_knn = grid_knn.predict(x_eval)
preds_log_reg = grid_log_reg.predict(x_eval)
preds_linear_svm = grid_linear_svm.predict(x_eval)
preds_kernel_svm = grid_kernel_svm.predict(x_eval)
preds_lda = grid_lda.predict(x_eval)

# The evaluation files carry no labels, so only the number of trials assigned to each class is shown.
print('*'*10, 'Predictions Counts on Subject 9' ,'*'*10)
for tag, preds in (('KNN          : ', preds_knn),
                   ('LogReg       : ', preds_log_reg),
                   ('LinearSVM    : ', preds_linear_svm),
                   ('KernelSVM    : ', preds_kernel_svm),
                   ('LDA          : ', preds_lda)):
    print(tag, 'Class 1 =', np.count_nonzero(preds == 1), 'Class 2 =', np.count_nonzero(preds == 2))

********** Predictions Counts on Evaluation set **********
KNN          :  Class 1 = 22 Class 2 = 18
LogReg       :  Class 1 = 32 Class 2 = 8
LinearSVM    :  Class 1 = 33 Class 2 = 7
KernelSVM    :  Class 1 = 30 Class 2 = 10
LDA          :  Class 1 = 34 Class 2 = 6
RandomForest :  Class 1 = 29 Class 2 = 11


In [137]:
# predictions counts on evaluation subject 10
# Same features as for training: multitaper PSD over 1.5-4.5 sec and 8-30 Hz, in dB, flattened per trial.
psds_eval, _ = mne.time_frequency.psd_multitaper(evaluation_epochs_10.copy(), tmin=1.5, tmax=4.5, fmin=8, fmax=30, n_jobs=1)
psds_eval = 10 * np.log10(psds_eval)  # to convert powers to dB
x_eval = psds_eval.reshape(len(psds_eval), -1)

# Predict with the best estimator of every grid search.
preds_knn = grid_knn.predict(x_eval)
preds_log_reg = grid_log_reg.predict(x_eval)
preds_linear_svm = grid_linear_svm.predict(x_eval)
preds_kernel_svm = grid_kernel_svm.predict(x_eval)
preds_lda = grid_lda.predict(x_eval)

# The evaluation files carry no labels, so only the number of trials assigned to each class is shown.
print('*'*10, 'Predictions Counts on Subject 10' ,'*'*10)
for tag, preds in (('KNN          : ', preds_knn),
                   ('LogReg       : ', preds_log_reg),
                   ('LinearSVM    : ', preds_linear_svm),
                   ('KernelSVM    : ', preds_kernel_svm),
                   ('LDA          : ', preds_lda)):
    print(tag, 'Class 1 =', np.count_nonzero(preds == 1), 'Class 2 =', np.count_nonzero(preds == 2))

********** Predictions Counts on Evaluation set **********
KNN          :  Class 1 = 22 Class 2 = 18
LogReg       :  Class 1 = 18 Class 2 = 22
LinearSVM    :  Class 1 = 15 Class 2 = 25
KernelSVM    :  Class 1 = 15 Class 2 = 25
LDA          :  Class 1 = 11 Class 2 = 29
RandomForest :  Class 1 = 21 Class 2 = 19


##  Results
Results look good for subject 10 and poor for subject 9. Linear SVM and LDA both perform well; I'd pick the linear SVM as the winner.