## Feature Extraction using Autoregressive and Statistical Histogram with difference of a DataFrame 
 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
import mne
mne.utils.use_log_level('error')

<mne.utils.use_log_level at 0x29da69a91d0>

In [75]:
import glob

path = r'Preprocessed_Data'
all_files = glob.glob(path + "/*_VR.fif")

cols = []
sub = 1
mean, median, std, mode,mean_std,skew,kurto = [],[],[],[],[],[],[]

for filename in all_files:
    raw =  mne.io.read_raw_fif(filename, preload=True,verbose='error');
    if 'CH 4' in raw.info['ch_names']:
        raw.pick_channels(["CH 4"])
      
        picks = mne.pick_types(raw.info, eeg=True, exclude=[])
        
        interval = 5
        start = 55
        stop = start + interval
        while(stop <= 165):
            start1, stop1 = raw.time_as_index([start, stop])
            data, times = raw[picks[:], start1:stop1]
            segment = pd.Series(data[0]).shift(17).dropna()
            
            df_mean = segment.mean()
            mean.append(df_mean)
            
            df_median = segment.median()
            median.append(df_median)
                       
            df_std = segment.std()
            std.append(df_std)
            
            df_skewness = segment.skew()
            skew.append(df_skewness)

            df_kurt = segment.kurt()
            kurto.append(df_kurt)
            
            df_mean_std = df_mean/df_std
            mean_std.append(df_mean_std)            
                       
            start += interval
            stop += interval

            cols.append(sub)
    sub += 1
df1 = pd.DataFrame(data={'Mean':mean, 'Median':median, 'StdDev':std,'Mean/StdDev':mean_std,'Skew':skew,'Kurt':kurto})    

df1.insert(loc=0, column='Subject', value=cols)

In [76]:
df1.head() #Features after AR and Statistical histogram

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
0,1,-0.047717,0.853077,60.153542,-0.000793,-0.372398,1.726107
1,1,-0.58698,-1.398961,32.779575,-0.017907,0.235663,0.688715
2,1,0.496268,-0.249225,21.813704,0.02275,-0.036059,-0.105357
3,1,0.025601,1.037389,25.891945,0.000989,-0.008202,0.784456
4,1,-0.22637,0.226183,20.323359,-0.011138,-0.213009,-0.280326


In [77]:
# Features after 1st difference
df2 = df1.diff().dropna()
df2.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
1,0.0,-0.539263,-2.252037,-27.373966,-0.017114,0.608062,-1.037391
2,0.0,1.083249,1.149736,-10.965871,0.040657,-0.271722,-0.794072
3,0.0,-0.470667,1.286614,4.078241,-0.021762,0.027857,0.889813
4,0.0,-0.251972,-0.811207,-5.568586,-0.012127,-0.204807,-1.064782
5,0.0,0.420738,1.846343,-1.360457,0.021388,0.048117,-0.651973


In [78]:
# Features after 2st difference
df3 = df2.diff().dropna()
df3.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
2,0.0,1.622512,3.401773,16.408095,0.057771,-0.879783,0.24332
3,0.0,-1.553916,0.136878,15.044112,-0.062419,0.299579,1.683885
4,0.0,0.218695,-2.097821,-9.646827,0.009634,-0.232663,-1.954595
5,0.0,0.67271,2.65755,4.208129,0.033516,0.252924,0.412809
6,0.0,-0.611649,-3.108729,-0.557181,-0.031435,-0.030764,1.097


In [79]:
df1.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)
df3.reset_index(drop=True, inplace=True)

In [80]:
# Cobnining all 3 different features
df = pd.concat([df1, df2, df3], axis=1)

In [81]:
df.dropna(inplace=True)
df.columns = df.columns = ['Subject', 'Mean', 'Median', 'StdDev', 'Mean/StdDev', 'Skew', 'Kurt',
       'Subject1', 'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Subject2', 'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2']
df.drop(['Subject1','Subject2'], axis=1, inplace=True)
df.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt,Mean1,Median1,StdDev1,Mean/StdDev1,Skew1,Kurt1,Mean2,Median2,StdDev2,Mean/StdDev2,Skew2,Kurt2
0,1,-0.047717,0.853077,60.153542,-0.000793,-0.372398,1.726107,-0.539263,-2.252037,-27.373966,-0.017114,0.608062,-1.037391,1.622512,3.401773,16.408095,0.057771,-0.879783,0.24332
1,1,-0.58698,-1.398961,32.779575,-0.017907,0.235663,0.688715,1.083249,1.149736,-10.965871,0.040657,-0.271722,-0.794072,-1.553916,0.136878,15.044112,-0.062419,0.299579,1.683885
2,1,0.496268,-0.249225,21.813704,0.02275,-0.036059,-0.105357,-0.470667,1.286614,4.078241,-0.021762,0.027857,0.889813,0.218695,-2.097821,-9.646827,0.009634,-0.232663,-1.954595
3,1,0.025601,1.037389,25.891945,0.000989,-0.008202,0.784456,-0.251972,-0.811207,-5.568586,-0.012127,-0.204807,-1.064782,0.67271,2.65755,4.208129,0.033516,0.252924,0.412809
4,1,-0.22637,0.226183,20.323359,-0.011138,-0.213009,-0.280326,0.420738,1.846343,-1.360457,0.021388,0.048117,-0.651973,-0.611649,-3.108729,-0.557181,-0.031435,-0.030764,1.097


In [82]:
df.columns

Index(['Subject', 'Mean', 'Median', 'StdDev', 'Mean/StdDev', 'Skew', 'Kurt',
       'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2'],
      dtype='object')

In [83]:
#import seaborn as sns
#sns_plot = sns.pairplot(df1.drop('Subject', axis=1))
#sns_plot.savefig('features_plot.png')

In [84]:
feature_list = ['Mean', 'Median', 'StdDev','Mean/StdDev','Skew','Kurt',
       'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2']

col = ['Kurt', 'Kurt1', 'Kurt2', 'Mean', 'Mean/StdDev', 'Mean/StdDev1',
       'Mean/StdDev2', 'Mean1', 'Mean2', 'Median', 'Median1', 'Median2',
       'Skew', 'Skew1', 'Skew2', 'StdDev', 'StdDev1', 'StdDev2', 'Subject',
       'Type']

In [85]:
import itertools

In [86]:
def distanceCalculation(df):
    subs = df['Subject'].unique()    # All subjects
    all_subs= list(itertools.combinations(subs, 2)) # All possible combination for all subjects

    distance_col = ['Subject',*feature_list, 'Type']
    intra_data = pd.DataFrame(columns = distance_col)

    #Intra Distance Computation (Same Person)
    for sub in subs:
        rows = df.loc[df['Subject'] == sub]
        each_comb = list(itertools.combinations(rows.index, 2))
        for i in range(len(each_comb)):
            comb = each_comb[i]
            fdr = absDistance(df, feature_list, comb[0], comb[1])
            intra_data = intra_data.append(pd.Series([sub,*fdr,0], index=distance_col), ignore_index=True)
    
    inter_data = pd.DataFrame(columns = distance_col)
    # Inter Distance Computation (Different Person) 
    all_rows=len(df)
    for sub_pair in all_subs: # Pairs of subjets
        sp1 = df.loc[df['Subject'] == sub_pair[0]].index
        sp2 = df.loc[df['Subject'] == sub_pair[1]].index
        for i in range(len(sp1)):
            for j in range(len(sp2)):
                fdr = absDistance(df, feature_list, sp1[i], sp2[j])
                inter_data = inter_data.append(pd.Series([sub_pair, *fdr, 1], index=distance_col), ignore_index=True)    
    return intra_data, inter_data

def absDistance(df, features, s1, s2):
    r=0
    fdr = [None]*len(features)
    for feature in features:
        f1 = df.iloc[s1][feature] 
        f2 = df.iloc[s2][feature] 
        Inter_dis = np.absolute(f1-f2) # absolute difference
        fdr[r] = Inter_dis
        r+=1
    return fdr

In [87]:
intra1, inter1 = distanceCalculation(df)

In [88]:
print("Intra length: "+str(len(intra1)))
print("Inter length: "+str(len(inter1)))

Intra length: 6658
Inter length: 195272


In [89]:
import random
6
#takes subframe and returns a more managble table for SVM
def get_SVM_Table(intra, inter):
    svmTable = pd.DataFrame()
    rands = random.sample(range(0, len(intra)), 6000)
    for rand in rands:
        svmTable = svmTable.append(intra.iloc[rand],ignore_index=True)
    
    rands = random.sample(range(0, len(inter)), 9000)
    for rand in rands:
        svmTable = svmTable.append(inter.iloc[rand],ignore_index=True)
    return svmTable

In [90]:
svm1 = get_SVM_Table(intra1,inter1)

# SVM

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [92]:

def svmTest(svm):
    X = svm.drop(['Subject','Type'], axis=1)
    y = svm["Type"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
    
    model = SVC(C=1.0, kernel = 'rbf', degree=3, gamma='auto')
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    confusionMatrix = confusion_matrix(y_test, prediction)
    print(confusionMatrix)
    print(classification_report(y_test, prediction))
    print("Accuracy: "+str(accuracy_score(y_test, prediction)))
    return confusionMatrix, model

In [93]:
cm, model = svmTest(svm1)

[[1159  647]
 [ 528 2166]]
              precision    recall  f1-score   support

         0.0       0.69      0.64      0.66      1806
         1.0       0.77      0.80      0.79      2694

   micro avg       0.74      0.74      0.74      4500
   macro avg       0.73      0.72      0.73      4500
weighted avg       0.74      0.74      0.74      4500

Accuracy: 0.7388888888888889


In [94]:
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]

sums = TN+TP+FN+FP

acc = (TN+TP)/sums

print('False Acceptance: '+str(FP/sums))
print('False Rejection: '+str(FN/sums))
print(acc)

False Acceptance: 0.14377777777777778
False Rejection: 0.11733333333333333
0.7388888888888889


In [24]:
def classify(test):
    if(model.predict([test]))==0:
        return True
    else:
        return False

##### Precision Score
TP – True Positives
FP – False Positives

Precision – Accuracy of positive predictions.
Precision = TP/(TP + FP)

##### Recall Score
FN – False Negatives

Recall (aka sensitivity or true positive rate): Fraction of positives That were correctly identified.
Recall = TP/(TP+FN)


##### F1 Score
F1 Score (aka F-Score or F-Measure) – A helpful metric for comparing two classifiers. F1 Score takes into account precision and the recall. It is created by finding the the harmonic mean of precision and recall.

F1 = 2 x (precision x recall)/(precision + recall)


In [78]:
#from sklearn.metrics import roc_auc_score
#fpr, tpr, thresholds = metrics.roc_curve(y_test, prediction, pos_label=2)


"""
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, prediction)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
""" 

'\n    from sklearn.metrics import roc_curve, auc\n    fpr, tpr, thresholds = roc_curve(y_test, prediction)\n    roc_auc = auc(fpr, tpr)\n\n    plt.figure()\n    plt.plot(fpr, tpr, color=\'darkorange\', lw=1, label=\'ROC curve (area = %0.2f)\' % roc_auc)\n    plt.plot([0, 1], [0, 1], color=\'navy\', lw=2, linestyle=\'--\')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\'False Positive Rate\')\n    plt.ylabel(\'True Positive Rate\')\n    plt.title(\'Receiver operating characteristic\')\n    plt.legend(loc="lower right")\n    plt.show()\n'