In [1]:
### Implementation of XGBoost gradient boost classifier with hyper-parameter tuning.
### Data : All 7 folders.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import svm, tree
import xgboost
import os
import csv
from sklearn.model_selection import train_test_split
#------------------
from scipy import fft, arange, signal

In [3]:
folder_root = "../../data/"

In [4]:
def find_files(PATH):
    """
    Finds all the files in a particular directory. Return only .csv files.
    """
    files = []
    for r, d, f in os.walk(PATH):
        for file in f:
            if '.csv' in file:
                files.append(os.path.join(r, file).replace("\\","/"))
    return files

In [5]:
def list_dir_files(root):
    """
    Finds all the files in a nested directory of folders and files (.csv)
    """
    files = []
    friendly_name = []
    for x in os.listdir(root):
        subfolder = root + x
        if os.path.isdir(subfolder):
            onfo = find_files(subfolder)
            if(len(onfo) > 0 and len(x.split("_")) > 1):##Removes test folder and empty folders
                files.append(onfo)
                fn = x.split("/")[-1]
                friendly_name.append(fn)
    return files, friendly_name

In [6]:
files, friendly_name = list_dir_files(folder_root)

In [7]:
def remove_meta_data(PATH):
    """
    Return:
    1. Changes in Electric potential based on Unix timestamp from
        the 5 channels of the Emotiv headset. 2 channels from the Frontal Lobe, 
        1 channel from the parietal lobe, and 2 from temporal lobe.
    2. Pandas Dataframe of the data reflected from (1).
    """
    reader = csv.reader(open(PATH, "rt"), delimiter='\t')
    i = 0
    one_file_data = []
    for line in reader:
        if(i > 0):
            one_file_data.append(line)
        i += 1
    one_file_data = np.array(one_file_data)
    columns = one_file_data[0][0].split(",")[3:8]
    row_data = []
    for rows in one_file_data[1:]:
        rd = rows[0].split(",")[3:8]
        rdt = []
        for x in rd:
            rdt.append(float(x))
        row_data.append(rdt)
    dataframe = pd.DataFrame(row_data, columns=columns)
    return np.array(row_data), dataframe

In [8]:
def data_DF_dir(list_PATH):
    """
    Returns all the data from a given set of path files and its associated pandas dataframe object.
    """
    raw_data = []
    dataframes = []
    for file in list_PATH:
        rd, dfob = remove_meta_data(file)
        raw_data.append(rd)
        dataframes.append(dfob)
    return np.array(raw_data), dataframes

In [9]:
def root_subfolder_file_data(root_list):
    """
    Extracts dataframe and np.array() of each file within each subfolder of the root folder.
    Returns:
    1. n(n will increase)x5(m varies) dataframe
    2. n(n will increase)x5(m varies) np.array()
    """
    root_df = []
    root_np = []
    for x in root_list:
        rnd, rdf = data_DF_dir(x)
        root_df.append(rdf)
        root_np.append(rnd)
    return np.array(root_np), root_df

In [10]:
_, raw_df = root_subfolder_file_data(files)

In [11]:
raw_df[0][0].head() ## Anger 0th file

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4
0,4216.410156,4259.487305,4187.179688,4088.205078,4203.589844
1,4217.94873,4253.846191,4182.05127,4098.974121,4209.230957
2,4211.794922,4252.307617,4165.128418,4102.05127,4214.871582
3,4216.410156,4252.820313,4158.974121,4105.128418,4223.077148
4,4231.794922,4253.333496,4165.641113,4108.717773,4227.692383


In [12]:
def merge_all_data(dataframe, friendly_name, merge_name):
    """
    Merges all dataframes into one dataframe with associated emotion tag.
    """
    list_df_append_emotion = []
    for x,fn in zip(dataframe, friendly_name):
        for fil in x:
            holder = np.full((1, fil.shape[0]), float(fn.split("_")[0])).T
            temp_df = fil
            temp_df[merge_name] = holder
            list_df_append_emotion.append(temp_df)
    pd_df = list_df_append_emotion[0]
    for i in range(1, len(list_df_append_emotion)):
        pd_df = pd_df.append(list_df_append_emotion[i])
    pd_df = pd_df.reset_index().drop(['index'],axis=1)
    return pd_df

In [13]:
df = merge_all_data(raw_df, friendly_name, "emotion")

In [14]:
df.describe()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4,emotion
count,119540.0,119540.0,119540.0,119540.0,119540.0,119540.0
mean,4215.462872,4297.601779,4143.542524,4134.975258,4196.622698,3.58255
std,223.809288,143.45571,137.496719,141.010738,166.320705,2.900798
min,1529.74353,2809.743652,2167.179443,2592.307617,1709.230713,0.0
25%,4175.897461,4241.025879,4090.769287,4106.153809,4160.512695,1.0
50%,4221.538574,4300.0,4145.128418,4138.461426,4197.94873,3.0
75%,4264.102539,4359.487305,4197.94873,4169.743652,4236.410156,6.0
max,8006.666504,5584.615234,6380.0,6132.820313,6887.692383,8.0


In [15]:
### Looks good. Data all merged. @ 11:47 am

In [16]:
X = np.array(df.iloc[:,:-1])
y = np.array(df.iloc[:,-1])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True)

In [18]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape ### Data split.

((71724, 5), (71724,), (47816, 5), (47816,))

In [19]:
classifiers = []
model1 = xgboost.XGBClassifier()
classifiers.append(model1)
# model2 = svm.SVC()
# classifiers.append(model2)
model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)
model4 = RandomForestClassifier()
classifiers.append(model4)
model5 = AdaBoostClassifier()
classifiers.append(model5)
model6 = GradientBoostingClassifier()
classifiers.append(model6)

In [20]:
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of %s is %s\n"%(type(clf), acc))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix of %s is \n%s\n"%(type(clf), cm))

Accuracy of <class 'xgboost.sklearn.XGBClassifier'> is 0.5690145557972227

Confusion Matrix of <class 'xgboost.sklearn.XGBClassifier'> is 
[[2824 2016  215  909  127  645 1254]
 [ 298 5922    9  197   60  707  755]
 [ 461  945  978 1109   40  196  752]
 [ 561 1329  238 4991   29  375  994]
 [ 380  180   33  166  356   83  179]
 [  80  178    0   81    0 7367  125]
 [ 603 2618  109  633   36  903 4770]]

Accuracy of <class 'sklearn.tree.tree.DecisionTreeClassifier'> is 0.5861427137359879

Confusion Matrix of <class 'sklearn.tree.tree.DecisionTreeClassifier'> is 
[[4299  903  493  754  287  299  955]
 [ 955 3886  420  697  128  431 1431]
 [ 485  406 2283  584   92  100  531]
 [ 718  721  641 5296  137  168  836]
 [ 267  115  104  120  586   41  144]
 [ 285  441   79  161   30 6336  499]
 [ 951 1407  554  793  165  461 5341]]





Accuracy of <class 'sklearn.ensemble.forest.RandomForestClassifier'> is 0.6724527354860298

Confusion Matrix of <class 'sklearn.ensemble.forest.RandomForestClassifier'> is 
[[5139  987  258  488  135  280  703]
 [ 767 5275  134  314   43  464  951]
 [ 467  558 2461  455   43   97  400]
 [ 711  781  385 5830   48  146  616]
 [ 300  135   68  102  617   42  113]
 [ 204  263   31   75   11 7087  160]
 [ 849 1622  285  606   44  521 5745]]

Accuracy of <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> is 0.4781453906642128

Confusion Matrix of <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> is 
[[1404 2241  197 1334  607  938 1269]
 [ 176 5809   18  369   40  740  796]
 [ 358 1132  310 1582  186  351  562]
 [ 610 1643  284 4063  287  669  961]
 [ 177  305   24  168  368  100  235]
 [  19  305    2   78    5 7238  184]
 [ 424 3068  113 1099  160 1137 3671]]

Accuracy of <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> is 0.5891751714907144

C

In [50]:
### From the results, the highest accuracy score was that of Random Forest Classifier ~67.25%
results = model4.predict(X[20000:30000])

In [58]:
results = list(results)
print("Class 0 result : " + str(results.count(0)), "\nClass 1 result : " + str(results.count(1)))

Class 0 result : 524 
Class 1 result : 8518


In [59]:
### Now, time to validate with real world test data.

In [149]:
test_folder = find_files(folder_root + "Test")

In [150]:
test_data, test_df = data_DF_dir(test_folder)

In [151]:
## data structure : 0,1 -> Anger ; 2,3 -> Happy

In [152]:
test_results = list(model4.predict(test_data[0]))

In [163]:
print("Fear Count : " + str(test_results.count(1)))

Fear Count : 3033


In [None]:
#Fear works...
# Reflection... need same video.