## Feature Extraction using Autoregressive and Statistical Histogram with difference of a DataFrame 
 

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
import mne
mne.utils.use_log_level('error')

<mne.utils.use_log_level at 0x1c16d893320>

In [20]:
import glob

path = r'Preprocessed_Data'
all_files = glob.glob(path + "/*_Non-VR.fif")

cols = []
sub = 1
mean, median, std, mode,mean_std,skew,kurto = [],[],[],[],[],[],[]

for filename in all_files:
    raw =  mne.io.read_raw_fif(filename, preload=True,verbose='error');
    if 'CH 4' in raw.info['ch_names']:
        raw.pick_channels(["CH 4"])
      
        picks = mne.pick_types(raw.info, eeg=True, exclude=[])
        
        interval = 5
        start = 60
        stop = start + interval
        while(stop <= 120):
            start1, stop1 = raw.time_as_index([start, stop])
            data, times = raw[picks[:], start1:stop1]
            segment = pd.Series(data[0]).shift(17).dropna()
            
            df_mean = segment.mean()
            mean.append(df_mean)
            
            df_median = segment.median()
            median.append(df_median)
                       
            df_std = segment.std()
            std.append(df_std)
            
            df_skewness = segment.skew()
            skew.append(df_skewness)

            df_kurt = segment.kurt()
            kurto.append(df_kurt)
            
            df_mean_std = df_mean/df_std
            mean_std.append(df_mean_std)            
                       
            start += interval
            stop += interval

            cols.append(sub)
    sub += 1
df1 = pd.DataFrame(data={'Mean':mean, 'Median':median, 'StdDev':std,'Mean/StdDev':mean_std,'Skew':skew,'Kurt':kurto})    

df1.insert(loc=0, column='Subject', value=cols)

In [21]:
df1.head() #Features after AR and Statistical histogram

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
0,1,-0.06447,-0.424835,8.052628,-0.008006,0.121501,0.428748
1,1,-0.075851,-0.145451,7.282104,-0.010416,-0.367209,2.917546
2,1,0.156003,0.017355,7.060631,0.022095,0.11509,0.23668
3,1,0.0739,0.251501,7.021932,0.010524,-0.01179,0.184013
4,1,-0.062443,-0.027827,6.402234,-0.009753,-0.264659,0.7637


In [22]:
# Features after 1st difference
df2 = df1.diff().dropna()
df2.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
1,0.0,-0.01138,0.279385,-0.770524,-0.00241,-0.488711,2.488798
2,0.0,0.231854,0.162806,-0.221472,0.032511,0.482299,-2.680867
3,0.0,-0.082103,0.234146,-0.038699,-0.011571,-0.12688,-0.052667
4,0.0,-0.136343,-0.279328,-0.619698,-0.020277,-0.25287,0.579687
5,0.0,0.048089,-0.121548,1.01595,0.007818,0.275939,-0.40408


In [23]:
# Features after 2st difference
df3 = df2.diff().dropna()
df3.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt
2,0.0,0.243234,-0.116579,0.549052,0.034921,0.97101,-5.169665
3,0.0,-0.313956,0.07134,0.182773,-0.044081,-0.609179,2.6282
4,0.0,-0.05424,-0.513474,-0.580999,-0.008707,-0.12599,0.632353
5,0.0,0.184432,0.157781,1.635648,0.028096,0.528809,-0.983767
6,0.0,0.047436,0.307619,-0.85884,0.004832,-0.374526,0.114555


In [24]:
df1.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)
df3.reset_index(drop=True, inplace=True)

In [25]:
# Cobnining all 3 different features
df = pd.concat([df1, df2, df3], axis=1)

In [26]:
df.dropna(inplace=True)
df.columns = df.columns = ['Subject', 'Mean', 'Median', 'StdDev', 'Mean/StdDev', 'Skew', 'Kurt',
       'Subject1', 'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Subject2', 'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2']
df.drop(['Subject1','Subject2'], axis=1, inplace=True)
df.head()

Unnamed: 0,Subject,Mean,Median,StdDev,Mean/StdDev,Skew,Kurt,Mean1,Median1,StdDev1,Mean/StdDev1,Skew1,Kurt1,Mean2,Median2,StdDev2,Mean/StdDev2,Skew2,Kurt2
0,1,-0.06447,-0.424835,8.052628,-0.008006,0.121501,0.428748,-0.01138,0.279385,-0.770524,-0.00241,-0.488711,2.488798,0.243234,-0.116579,0.549052,0.034921,0.97101,-5.169665
1,1,-0.075851,-0.145451,7.282104,-0.010416,-0.367209,2.917546,0.231854,0.162806,-0.221472,0.032511,0.482299,-2.680867,-0.313956,0.07134,0.182773,-0.044081,-0.609179,2.6282
2,1,0.156003,0.017355,7.060631,0.022095,0.11509,0.23668,-0.082103,0.234146,-0.038699,-0.011571,-0.12688,-0.052667,-0.05424,-0.513474,-0.580999,-0.008707,-0.12599,0.632353
3,1,0.0739,0.251501,7.021932,0.010524,-0.01179,0.184013,-0.136343,-0.279328,-0.619698,-0.020277,-0.25287,0.579687,0.184432,0.157781,1.635648,0.028096,0.528809,-0.983767
4,1,-0.062443,-0.027827,6.402234,-0.009753,-0.264659,0.7637,0.048089,-0.121548,1.01595,0.007818,0.275939,-0.40408,0.047436,0.307619,-0.85884,0.004832,-0.374526,0.114555


In [27]:
df.columns

Index(['Subject', 'Mean', 'Median', 'StdDev', 'Mean/StdDev', 'Skew', 'Kurt',
       'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2'],
      dtype='object')

In [28]:
#import seaborn as sns
#sns_plot = sns.pairplot(df1.drop('Subject', axis=1))
#sns_plot.savefig('features_plot.png')

In [29]:
feature_list = ['Mean', 'Median', 'StdDev','Mean/StdDev','Skew','Kurt',
       'Mean1', 'Median1', 'StdDev1', 'Mean/StdDev1', 'Skew1', 'Kurt1',
       'Mean2', 'Median2', 'StdDev2', 'Mean/StdDev2', 'Skew2', 'Kurt2']

col = ['Kurt', 'Kurt1', 'Kurt2', 'Mean', 'Mean/StdDev', 'Mean/StdDev1',
       'Mean/StdDev2', 'Mean1', 'Mean2', 'Median', 'Median1', 'Median2',
       'Skew', 'Skew1', 'Skew2', 'StdDev', 'StdDev1', 'StdDev2', 'Subject',
       'Type']

In [30]:
import itertools

In [31]:
def distanceCalculation(df):
    subs = df['Subject'].unique()    # All subjects
    all_subs= list(itertools.combinations(subs, 2)) # All possible combination for all subjects

    distance_col = ['Subject',*feature_list, 'Type']
    intra_data = pd.DataFrame(columns = distance_col)

    #Intra Distance Computation (Same Person)
    for sub in subs:
        rows = df.loc[df['Subject'] == sub]
        each_comb = list(itertools.combinations(rows.index, 2))
        for i in range(len(each_comb)):
            comb = each_comb[i]
            fdr = absDistance(df, feature_list, comb[0], comb[1])
            intra_data = intra_data.append(pd.Series([sub,*fdr,0], index=distance_col), ignore_index=True)
    
    inter_data = pd.DataFrame(columns = distance_col)
    # Inter Distance Computation (Different Person) 
    all_rows=len(df)
    for sub_pair in all_subs: # Pairs of subjets
        sp1 = df.loc[df['Subject'] == sub_pair[0]].index
        sp2 = df.loc[df['Subject'] == sub_pair[1]].index
        for i in range(len(sp1)):
            for j in range(len(sp2)):
                fdr = absDistance(df, feature_list, sp1[i], sp2[j])
                inter_data = inter_data.append(pd.Series([sub_pair, *fdr, 1], index=distance_col), ignore_index=True)    
    return intra_data, inter_data

def absDistance(df, features, s1, s2):
    r=0
    fdr = [None]*len(features)
    for feature in features:
        f1 = df.iloc[s1][feature] 
        f2 = df.iloc[s2][feature] 
        Inter_dis = np.absolute(f1-f2) # absolute difference
        fdr[r] = Inter_dis
        r+=1
    return fdr

In [32]:
intra1, inter1 = distanceCalculation(df)

In [33]:
print("Intra length: "+str(len(intra1)))
print("Inter length: "+str(len(inter1)))

Intra length: 1893
Inter length: 57792


In [36]:
import random
#6
#takes subframe and returns a more managble table for SVM
def get_SVM_Table(intra, inter):
    svmTable = pd.DataFrame()
    rands = random.sample(range(0, len(intra)), 1800)
    for rand in rands:
        svmTable = svmTable.append(intra.iloc[rand],ignore_index=True)
    
    rands = random.sample(range(0, len(inter)), 2700)
    for rand in rands:
        svmTable = svmTable.append(inter.iloc[rand],ignore_index=True)
    return svmTable

In [37]:
svm1 = get_SVM_Table(intra1,inter1)

# SVM

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [39]:

def svmTest(svm):
    X = svm.drop(['Subject','Type'], axis=1)
    y = svm["Type"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
    
    model = SVC(C=1.0, kernel = 'rbf', degree=3, gamma='auto')
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    confusionMatrix = confusion_matrix(y_test, prediction)
    print(confusionMatrix)
    print(classification_report(y_test, prediction))
    print("Accuracy: "+str(accuracy_score(y_test, prediction)))
    return confusionMatrix, model

In [40]:
cm, model = svmTest(svm1)

[[352 203]
 [125 670]]
              precision    recall  f1-score   support

         0.0       0.74      0.63      0.68       555
         1.0       0.77      0.84      0.80       795

   micro avg       0.76      0.76      0.76      1350
   macro avg       0.75      0.74      0.74      1350
weighted avg       0.76      0.76      0.75      1350

Accuracy: 0.7570370370370371


In [41]:
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]

sums = TN+TP+FN+FP

acc = (TN+TP)/sums

print('False Acceptance: '+str(FP/sums))
print('False Rejection: '+str(FN/sums))
print(acc)

False Acceptance: 0.15037037037037038
False Rejection: 0.09259259259259259
0.7570370370370371


In [42]:
def classify(test):
    if(model.predict([test]))==0:
        return True
    else:
        return False

##### Precision Score
TP – True Positives
FP – False Positives

Precision – Accuracy of positive predictions.
Precision = TP/(TP + FP)

##### Recall Score
FN – False Negatives

Recall (aka sensitivity or true positive rate): Fraction of positives That were correctly identified.
Recall = TP/(TP+FN)


##### F1 Score
F1 Score (aka F-Score or F-Measure) – A helpful metric for comparing two classifiers. F1 Score takes into account precision and the recall. It is created by finding the the harmonic mean of precision and recall.

F1 = 2 x (precision x recall)/(precision + recall)


In [78]:
#from sklearn.metrics import roc_auc_score
#fpr, tpr, thresholds = metrics.roc_curve(y_test, prediction, pos_label=2)


"""
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve(y_test, prediction)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
""" 

'\n    from sklearn.metrics import roc_curve, auc\n    fpr, tpr, thresholds = roc_curve(y_test, prediction)\n    roc_auc = auc(fpr, tpr)\n\n    plt.figure()\n    plt.plot(fpr, tpr, color=\'darkorange\', lw=1, label=\'ROC curve (area = %0.2f)\' % roc_auc)\n    plt.plot([0, 1], [0, 1], color=\'navy\', lw=2, linestyle=\'--\')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\'False Positive Rate\')\n    plt.ylabel(\'True Positive Rate\')\n    plt.title(\'Receiver operating characteristic\')\n    plt.legend(loc="lower right")\n    plt.show()\n'