In [1]:
### Implementation of hyper-parameter tuned random forest CV algorithm
### for one.vs.one classifier.
#--> Deals with splitting of training data from source...

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost
import os
import csv
from sklearn.model_selection import train_test_split
#---------------------RF HP-f(x) & CV---------------
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score

In [3]:
folder_root = "../../data/"

In [4]:
def find_files(PATH):
    """
    Finds all the files in a particular directory. Return only .csv files.
    """
    files = []
    for r, d, f in os.walk(PATH):
        for file in f:
            if '.csv' in file:
                files.append(os.path.join(r, file).replace("\\","/"))
    return files

In [5]:
def list_dir_files(root):
    """
    Finds all the files in a nested directory of folders and files (.csv)
    """
    files = []
    friendly_name = []
    for x in os.listdir(root):
        subfolder = root + x
        if os.path.isdir(subfolder):
            onfo = find_files(subfolder)
            if(len(onfo) > 0 and len(x.split("_")) > 1):##Removes test folder and empty folders
                files.append(onfo)
                fn = x.split("/")[-1]
                friendly_name.append(fn)
    return files, friendly_name

In [6]:
files, friendly_name = list_dir_files(folder_root)

In [7]:
def remove_meta_data(PATH):
    """
    Return:
    1. Changes in Electric potential based on Unix timestamp from
        the 5 channels of the Emotiv headset. 2 channels from the Frontal Lobe, 
        1 channel from the parietal lobe, and 2 from temporal lobe.
    2. Pandas Dataframe of the data reflected from (1).
    """
    reader = csv.reader(open(PATH, "rt"), delimiter='\t')
    i = 0
    one_file_data = []
    for line in reader:
        if(i > 0):
            one_file_data.append(line)
        i += 1
    one_file_data = np.array(one_file_data)
    columns = one_file_data[0][0].split(",")[3:8]
    row_data = []
    for rows in one_file_data[1:]:
        rd = rows[0].split(",")[3:8]
        rdt = []
        for x in rd:
            rdt.append(float(x))
        row_data.append(rdt)
    dataframe = pd.DataFrame(row_data, columns=columns)
    return np.array(row_data), dataframe

In [8]:
def data_DF_dir(list_PATH):
    """
    Returns all the data from a given set of path files and its associated pandas dataframe object.
    """
    raw_data = []
    dataframes = []
    for file in list_PATH:
        rd, dfob = remove_meta_data(file)
        raw_data.append(rd)
        dataframes.append(dfob)
    return np.array(raw_data), dataframes

In [9]:
def root_subfolder_file_data(root_list):
    """
    Extracts dataframe and np.array() of each file within each subfolder of the root folder.
    Returns:
    1. n(n will increase)x5(m varies) dataframe
    2. n(n will increase)x5(m varies) np.array()
    """
    root_df = []
    root_np = []
    for x in root_list:
        rnd, rdf = data_DF_dir(x)
        root_df.append(rdf)
        root_np.append(rnd)
    return np.array(root_np), root_df

In [10]:
_, raw_df = root_subfolder_file_data(files)

In [11]:
def merge_split(dataframes, friendly_name, merge_name):
    """
    Splits dataframe into training and testing sets based on 80% rule from the directory.
    Returns:
    1. train_df : set of dataframes used for training. Can be used into training and testing split.
    2. test_df  : reserved for testing only.
    """
    train = []
    test = []
    i_train = 0
    flag = 0
    
    for x,fn in zip(dataframes, friendly_name):
        i_train = len(x)
        for dt in x:
            holder = np.full((1, dt.shape[0]), float(fn.split("_")[0])).T
            temp_df = dt
            temp_df[merge_name] = holder
            if(flag < i_train -1):
                train.append(temp_df)
            else:
                test.append(temp_df)
            flag += 1
        flag = 0
            
    train_df = train[0]
    test_df = test[0]
    
    for i in range(1, len(train)):
        train_df = train_df.append(train[i])
    train_df = train_df.reset_index().drop(['index'],axis=1) 
    
    for i in range(1, len(test)):
        test_df = test_df.append(test[i])
    test_df = test_df.reset_index().drop(['index'],axis=1) 
    
    return train_df, test_df

In [12]:
df_train, df_test = merge_split(raw_df, friendly_name, "emotion")

In [13]:
df_test.describe()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4,emotion
count,31899.0,31899.0,31899.0,31899.0,31899.0,31899.0
mean,4225.962946,4339.589441,4150.735775,4136.736518,4195.568329,3.421424
std,115.11601,148.580112,127.110545,126.97615,172.321026,2.454666
min,3445.128174,2828.718018,2342.564209,3364.102539,1709.230713,0.0
25%,4190.256348,4266.153809,4116.410156,4108.205078,4170.256348,1.0
50%,4225.641113,4340.0,4150.427246,4138.974121,4199.487305,3.0
75%,4261.538574,4446.666504,4185.128418,4167.692383,4230.769043,4.0
max,5114.871582,5250.256348,6214.358887,5431.794922,5228.717773,8.0


In [14]:
df_train.describe()

Unnamed: 0,EEG.AF3,EEG.T7,EEG.Pz,EEG.T8,EEG.AF4,emotion
count,95955.0,95955.0,95955.0,95955.0,95955.0,95955.0
mean,4213.073086,4297.634484,4142.020672,4133.42491,4197.101429,3.672284
std,241.239572,141.426093,134.925156,140.211141,157.150127,2.911752
min,1529.74353,2809.743652,2167.179443,2592.307617,1900.512817,0.0
25%,4173.333496,4239.800537,4089.743652,4105.641113,4158.974121,1.0
50%,4220.0,4300.0,4144.102539,4137.94873,4197.436035,3.0
75%,4264.102539,4361.538574,4195.384766,4168.717773,4235.897461,6.0
max,8006.666504,5584.615234,6380.0,6132.820313,6887.692383,8.0


In [55]:
X_train_no_split = np.array(df_train.iloc[:,:-1])
y_train_no_split = np.array(df_train.iloc[:,-1])
#-----------------------------------------------
X_test_original = np.array(df_test.iloc[:,:-1])
y_test_original = np.array(df_test.iloc[:,-1])

In [56]:
### Hyper-parameter tuning again...

In [57]:
params={
 "n_estimators"          : [10,50,75,100,110,120,130,150,200,250,500,750,1000] ,
 "criterion"             : ['gini','entropy'],
 "max_leaf_nodes"        : [ None, 2, 3, 5, 7, 10],
 "min_impurity_decrease" : [0.0,0.05,0.1,0.2,0.3],
 "min_samples_split"     : [0.1,0.5,2,3,4,5,6],   
}

In [58]:
classifier=RandomForestClassifier()
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=30,n_jobs=-1,cv=30,verbose=3)
random_search.fit(X_train_no_split,y_train_no_split)

Fitting 30 folds for each of 30 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 46.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 73.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 87.6min finished


RandomizedSearchCV(cv=30, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=30, n_jobs=-1,
          param_distributions={'n_estimators': [10, 50, 75, 100, 110, 120, 130, 150, 200, 250, 500, 750, 1000], 'criterion': ['gini', 'entropy'], 'max_leaf_nodes': [None, 2, 3, 5, 7, 10], 'min_impurity_decrease': [0.0, 0.05, 0.1, 0.2, 0.3], 'min_samples_split': [0.1, 0.5, 2, 3, 4, 5, 6]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [60]:
print("Best Estimator : ", random_search.best_estimator_)
print("Best parameters : ", random_search.best_params_)
print("Best Score : ", random_search.best_score_)

Best Estimator :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best parameters :  {'n_estimators': 250, 'min_samples_split': 5, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'criterion': 'gini'}
Best Score :  0.5161794591214632


In [None]:
### Hyper parameterized conditions after 50 CV.

In [61]:
RF_HP = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [62]:
RF_HP.fit(X_train_no_split, y_train_no_split)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [63]:
RF_HP.feature_importances_

array([0.17217213, 0.28958433, 0.16645261, 0.20774313, 0.16404781])

In [64]:
y_pred = RF_HP.predict(X_test_original)

In [65]:
accuracy_score(y_test_original, y_pred)

0.369353271262422

In [None]:
###2.5 times better than random chance.