In [48]:
### Implementation of hyper-parameter tuned random forest CV algorithm
### for one.vs.one classifier.
#--> Deals with splitting of training data from source...

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
import os
import csv
from sklearn.model_selection import train_test_split
#---------------------RF HP-f(x) & CV---------------
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score

In [22]:
folder_root = "../../data/"

In [23]:
def find_files(PATH):
    """
    Finds all the files in a particular directory. Return only .csv files.
    """
    files = []
    for r, d, f in os.walk(PATH):
        for file in f:
            if '.csv' in file:
                files.append(os.path.join(r, file).replace("\\","/"))
    return files

In [24]:
def list_dir_files(root):
    """
    Finds all the files in a nested directory of folders and files (.csv)
    """
    files = []
    friendly_name = []
    for x in os.listdir(root):
        subfolder = root + x
        if os.path.isdir(subfolder):
            onfo = find_files(subfolder)
            if(len(onfo) > 0 and len(x.split("_")) > 1):##Removes test folder and empty folders
                files.append(onfo)
                fn = x.split("/")[-1]
                friendly_name.append(fn)
    return files, friendly_name

In [26]:
files, friendly_name = list_dir_files(folder_root)

In [27]:
friendly_name

['0_Anger',
 '1_Fear',
 '2_Disgust',
 '3_Happiness',
 '4_Sadness',
 '6_Neutral',
 '8_Love']

In [28]:
def remove_meta_data(PATH):
    """
    Return:
    1. Changes in Electric potential based on Unix timestamp from
        the 5 channels of the Emotiv headset. 2 channels from the Frontal Lobe, 
        1 channel from the parietal lobe, and 2 from temporal lobe.
    2. Pandas Dataframe of the data reflected from (1).
    """
    reader = csv.reader(open(PATH, "rt"), delimiter='\t')
    i = 0
    one_file_data = []
    for line in reader:
        if(i > 0):
            one_file_data.append(line)
        i += 1
    one_file_data = np.array(one_file_data)
    columns = one_file_data[0][0].split(",")[3:8]
    row_data = []
    for rows in one_file_data[1:]:
        rd = rows[0].split(",")[3:8]
        rdt = []
        for x in rd:
            rdt.append(float(x))
        row_data.append(rdt)
    dataframe = pd.DataFrame(row_data, columns=columns)
    return np.array(row_data), dataframe

In [29]:
def data_DF_dir(list_PATH):
    """
    Returns all the data from a given set of path files and its associated pandas dataframe object.
    """
    raw_data = []
    dataframes = []
    for file in list_PATH:
        rd, dfob = remove_meta_data(file)
        raw_data.append(rd)
        dataframes.append(dfob)
    return np.array(raw_data), dataframes

In [30]:
def root_subfolder_file_data(root_list):
    """
    Extracts dataframe and np.array() of each file within each subfolder of the root folder.
    Returns:
    1. n(n will increase)x5(m varies) dataframe
    2. n(n will increase)x5(m varies) np.array()
    """
    root_df = []
    root_np = []
    for x in root_list:
        rnd, rdf = data_DF_dir(x)
        root_df.append(rdf)
        root_np.append(rnd)
    return np.array(root_np), root_df

In [55]:
XXX, XXXraw_df = root_subfolder_file_data(files)

In [63]:
XXXraw_df[1][-1].head()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4
0,4336.922852,4342.563965,4133.333496,4056.410156,4162.05127
1,4341.025879,4361.538574,4127.692383,4044.102539,4167.179688
2,4338.974121,4383.589844,4151.794922,4072.307617,4173.846191
3,4331.282227,4368.205078,4153.333496,4068.205078,4171.282227
4,4317.94873,4351.794922,4124.102539,4025.641113,4155.384766


In [65]:
XXX[1][-1].shape

(5480, 5)

In [32]:
def merge_split(dataframes, friendly_name, merge_name):
    """
    Splits dataframe into training and testing sets based on 80% rule from the directory.
    Returns:
    1. train_df : set of dataframes used for training. Can be used into training and testing split.
    2. test_df  : reserved for testing only.
    """
    train = []
    test = []
    i_train = 0
    flag = 0
    
    for x,fn in zip(dataframes, friendly_name):
        i_train = len(x)
        for dt in x:
            holder = np.full((1, dt.shape[0]), float(fn.split("_")[0])).T
            temp_df = dt
            temp_df[merge_name] = holder
            if(flag < i_train -1):
                train.append(temp_df)
            else:
                test.append(temp_df)
            flag += 1
        flag = 0
            
    train_df = train[0]
    test_df = test[0]
    
    for i in range(1, len(train)):
        train_df = train_df.append(train[i])
    train_df = train_df.reset_index().drop(['index'],axis=1) 
    
    for i in range(1, len(test)):
        test_df = test_df.append(test[i])
    test_df = test_df.reset_index().drop(['index'],axis=1) 
    
    return train_df, test_df

In [33]:
df_train, df_test = merge_split(raw_df, friendly_name, "emotion")

In [34]:
df_test.describe()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4,emotion
count,28776.0,28776.0,28776.0,28776.0,28776.0,28776.0
mean,4226.558723,4304.404408,4149.662932,4138.171535,4195.610143,3.197769
std,131.91078,164.566229,135.555505,138.462298,184.735875,2.624773
min,3445.128174,2828.718018,2342.564209,3364.102539,1709.230713,0.0
25%,4186.153809,4252.820313,4099.487305,4100.0,4165.641113,1.0
50%,4229.743652,4333.333496,4148.717773,4139.487305,4200.0,3.0
75%,4270.769043,4371.282227,4201.025879,4175.897461,4237.94873,6.0
max,5114.871582,5250.256348,6214.358887,5431.794922,5228.717773,8.0


In [35]:
df_train.describe()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4,emotion
count,96244.0,96244.0,96244.0,96244.0,96244.0,96244.0
mean,4213.057458,4299.585159,4141.168992,4133.638864,4197.001456,3.550549
std,239.971859,133.250635,134.667882,138.692085,155.666877,2.953992
min,1529.74353,2809.743652,2167.179443,2592.307617,1900.512817,0.0
25%,4174.871582,4242.05127,4088.718018,4106.666504,4160.0,1.0
50%,4220.0,4298.461426,4143.077148,4137.94873,4197.436035,3.0
75%,4263.077148,4360.0,4193.846191,4167.179688,4234.358887,6.0
max,8006.666504,5584.615234,6380.0,6132.820313,6887.692383,8.0


In [15]:
X_train_no_split = np.array(df_train.iloc[:,:-1])
y_train_no_split = np.array(df_train.iloc[:,-1])
#-----------------------------------------------
X_test_original = np.array(df_test.iloc[:,:-1])
y_test_original = np.array(df_test.iloc[:,-1])

In [36]:
### Hyper-parameter tuning again...

In [40]:
params={
 "n_estimators"          : [10,50,75,100,110,120,130,150,200,250,500,750,1000] ,
 "criterion"             : ['gini','entropy'],
 "max_leaf_nodes"        : [ None, 2, 3, 5, 7, 10],
 "min_impurity_decrease" : [0.0,0.05,0.1,0.2,0.3],
 "min_samples_split"     : [0.1,0.5,2,3,4,5,6],   
}

In [58]:
classifier=RandomForestClassifier()
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=30,n_jobs=-1,cv=30,verbose=3)
random_search.fit(X_train_no_split,y_train_no_split)

Fitting 30 folds for each of 30 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 46.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 73.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 87.6min finished


RandomizedSearchCV(cv=30, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=30, n_jobs=-1,
          param_distributions={'n_estimators': [10, 50, 75, 100, 110, 120, 130, 150, 200, 250, 500, 750, 1000], 'criterion': ['gini', 'entropy'], 'max_leaf_nodes': [None, 2, 3, 5, 7, 10], 'min_impurity_decrease': [0.0, 0.05, 0.1, 0.2, 0.3], 'min_samples_split': [0.1, 0.5, 2, 3, 4, 5, 6]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [60]:
print("Best Estimator : ", random_search.best_estimator_)
print("Best parameters : ", random_search.best_params_)
print("Best Score : ", random_search.best_score_)

Best Estimator :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best parameters :  {'n_estimators': 250, 'min_samples_split': 5, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'criterion': 'gini'}
Best Score :  0.5161794591214632


In [7]:
### Hyper parameterized conditions after 50 CV.

In [41]:
RF_HP = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
RF_HP.fit(X_train_no_split, y_train_no_split)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [63]:
RF_HP.feature_importances_

array([0.17217213, 0.28958433, 0.16645261, 0.20774313, 0.16404781])

In [43]:
y_pred = RF_HP.predict(X_test_original)

In [44]:
accuracy_score(y_test_original, y_pred)

0.38963757206330557

In [45]:
###2.5 times better than random chance.

In [66]:
y_test_horro = RF_HP.predict(XXX[1][-1])

In [70]:
def run_test(results, friendly_name):
    for x in friendly_name:
        fn = x.split("_")
        print(fn[1] + " Count : " + str(results.count(int(fn[0]))))

In [72]:
run_test(y_test_horro, friendly_name)

NameError: name 'test_results' is not defined