# Hyper Parameter Tuning for classifier for EDA Peak stat features!

In [1]:
import numpy as np
import pandas as pd
import os

# Classifier

In [2]:
import pandas as pd
import numpy as np
import os
# from os import path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report
from numpy import vstack, hstack, stack
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
import lightgbm
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [3]:
from Training_Code.config import SELECTCOLS, ECG_SELECTCOLS, EDA_SELECTCOLS
from sklearn.utils import shuffle

### Label

In [4]:
def make_kfolds(dr_feat_path):
    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()
    for subTrain in subjects:
        train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))
        train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(train).values.sum():
            cinf = np.isinf(train).values.sum()
            print("Train Dataframe contains {} values".format(cinf))
        train.replace([np.inf], 9999, inplace=True)        
        train.replace([-np.inf], -9999, inplace=True)        

        train['scrNumPeaks'] = train['scrNumPeaks'].values.astype(int)
        train['scrNumPeaks'] = train['scrNumPeaks'].values.clip(min=0) # converting negatives to zero

        train.dropna(inplace=True)
        xtrainDriv = xtrainDriv.append(train)
        xtrainDriv.reset_index(drop=True, inplace=True)

    return xtrainDriv.copy()

In [5]:
def read_dataset(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined

    XtrainDriv = make_kfolds(dr_feat_path)
    XtrainBase = make_kfolds(bs_feat_path)

    XtrainDriv = XtrainDriv[SELECTCOLS].copy()
    ytrainDriv = list(XtrainDriv['scaled label'].copy())

    XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()
    ytrainBase = XtrainBase.shape[0] * [0]

    XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
    XtrainDriv = XtrainDriv.append(XtrainBase)

    ytrain = ytrainDriv + ytrainBase

    X = XtrainDriv.values
    
    X, ytrain = shuffle(X, ytrain, random_state=42)

    for idx, val in enumerate(ytrain):
        if val <= 4:
            ytrain[idx] = 0
        else: ytrain[idx] = 1

    return X, ytrain

## Hyperparameter tuning

In [6]:
data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

In [15]:
X_train.shape

(3399, 89)

In [16]:
random_grid = {'bootstrap': [True, False],
 'max_depth': [30, 50, 70, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 800, 1000, 2000, 3000]}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [30, 50, 70, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 800, 1000, 2000,
                                                         3000]},
                   random_state=42, verbose=2)

In [17]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': False}

In [18]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       459
           1       0.79      0.71      0.75       391

    accuracy                           0.78       850
   macro avg       0.79      0.78      0.78       850
weighted avg       0.78      0.78      0.78       850



In [20]:
data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

In [21]:
X_train.shape

(4168, 89)

In [22]:
random_grid = {'bootstrap': [True, False],
 'max_depth': [30, 50, 70, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 800, 1000, 2000, 3000]}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [30, 50, 70, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 800, 1000, 2000,
                                                         3000]},
                   random_state=42, verbose=2)

In [23]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': False}

In [24]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       636
           1       0.85      0.83      0.84       407

    accuracy                           0.88      1043
   macro avg       0.87      0.87      0.87      1043
weighted avg       0.88      0.88      0.88      1043



In [25]:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(model)
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=False, max_depth=30,
                                                 n_estimators=2000))

# SVM

In [9]:
parameters = {'C': [1, 10, 50, 100], 'class_weight': ['balanced']}
from sklearn.preprocessing import StandardScaler


data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

svc_ = SVC()
rf_random = RandomizedSearchCV(estimator = svc_, param_distributions = parameters, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [1, 10, 50, 100],
                                        'class_weight': ['balanced']},
                   random_state=42, verbose=2)

In [10]:
rf_random.best_params_

{'class_weight': 'balanced', 'C': 10}

In [11]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.76      0.72      0.74       459
           1       0.69      0.73      0.71       391

    accuracy                           0.72       850
   macro avg       0.72      0.72      0.72       850
weighted avg       0.73      0.72      0.73       850



In [12]:
parameters = {'C': [1, 10, 50, 100], 'class_weight': ['balanced']}
from sklearn.preprocessing import StandardScaler


data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

svc_ = SVC()
rf_random = RandomizedSearchCV(estimator = svc_, param_distributions = parameters, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [1, 10, 50, 100],
                                        'class_weight': ['balanced']},
                   random_state=42, verbose=2)

In [13]:
rf_random.best_params_

{'class_weight': 'balanced', 'C': 10}

In [14]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87       636
           1       0.80      0.78      0.79       407

    accuracy                           0.84      1043
   macro avg       0.83      0.83      0.83      1043
weighted avg       0.84      0.84      0.84      1043



# Logistic Regression

In [45]:
from sklearn.preprocessing import StandardScaler

data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)


parameters_lr = {'C': [0.5, 1, 5, 10], 'max_iter': [400, 500, 600, 1000]}

lr_ = LogisticRegression()
rf_random = RandomizedSearchCV(estimator = lr_, param_distributions = parameters_lr, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 16 candidates, totalling 80 fits


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.5, 1, 5, 10],
                                        'max_iter': [400, 500, 600, 1000]},
                   random_state=42, verbose=2)

In [46]:
rf_random.best_params_

{'max_iter': 400, 'C': 1}

In [47]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.67      0.70      0.69       459
           1       0.63      0.60      0.62       391

    accuracy                           0.66       850
   macro avg       0.65      0.65      0.65       850
weighted avg       0.65      0.66      0.65       850



In [48]:
from sklearn.preprocessing import StandardScaler

data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)


parameters_lr = {'C': [0.5, 1, 5, 10], 'max_iter': [400, 500, 600, 1000]}

lr_ = LogisticRegression()
rf_random = RandomizedSearchCV(estimator = lr_, param_distributions = parameters_lr, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 16 candidates, totalling 80 fits


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.5, 1, 5, 10],
                                        'max_iter': [400, 500, 600, 1000]},
                   random_state=42, verbose=2)

In [49]:
rf_random.best_params_

{'max_iter': 400, 'C': 0.5}

In [50]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.77      0.91      0.83       636
           1       0.80      0.57      0.67       407

    accuracy                           0.78      1043
   macro avg       0.78      0.74      0.75      1043
weighted avg       0.78      0.78      0.77      1043



# LGBM

In [24]:
data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

import lightgbm as lgb
lgbmcls = lgb.LGBMClassifier()
default_params = lgbmcls.get_params()

params = {'boosting_type': ['gbdt', 'dart'],
 'importance_type': ['split', 'gains'],
 'learning_rate': [0.001],
 'n_estimators': [1000, 2000],
 'num_leaves': [10, 50, 100]} # , 'reg_lambda': [0.01, 0.001]

lgb_ = lgb.LGBMClassifier()
rf_random = RandomizedSearchCV(estimator = lgb_, param_distributions = params, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart'],
                                        'importance_type': ['split', 'gains'],
                                        'learning_rate': [0.001],
                                        'n_estimators': [1000, 2000],
                                        'num_leaves': [10, 50, 100]},
                   random_state=42, verbose=2)

In [18]:
default_params

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [25]:
rf_random.best_params_

{'num_leaves': 100,
 'n_estimators': 2000,
 'learning_rate': 0.001,
 'importance_type': 'gains',
 'boosting_type': 'gbdt'}

In [26]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       459
           1       0.76      0.71      0.74       391

    accuracy                           0.77       850
   macro avg       0.77      0.76      0.76       850
weighted avg       0.77      0.77      0.77       850



In [27]:
data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

import lightgbm as lgb
lgbmcls = lgb.LGBMClassifier()
default_params = lgbmcls.get_params()

params = {'boosting_type': ['gbdt', 'dart'],
 'importance_type': ['split', 'gains'],
 'learning_rate': [0.001],
 'n_estimators': [1000, 2000],
 'num_leaves': [10, 50, 100]} # , 'reg_lambda': [0.01, 0.001]

lgb_ = lgb.LGBMClassifier()
rf_random = RandomizedSearchCV(estimator = lgb_, param_distributions = params, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart'],
                                        'importance_type': ['split', 'gains'],
                                        'learning_rate': [0.001],
                                        'n_estimators': [1000, 2000],
                                        'num_leaves': [10, 50, 100]},
                   random_state=42, verbose=2)

In [28]:
rf_random.best_params_

{'num_leaves': 100,
 'n_estimators': 2000,
 'learning_rate': 0.001,
 'importance_type': 'gains',
 'boosting_type': 'gbdt'}

In [29]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       636
           1       0.86      0.82      0.84       407

    accuracy                           0.88      1043
   macro avg       0.87      0.87      0.87      1043
weighted avg       0.88      0.88      0.88      1043



In [51]:
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {
'boosting_type': 'gbdt', 'min_data_in_leaf':10, 'feature_fraction':0.9, 'bagging_fraction':1,
'bagging_freq':20, 'max_depth':32,
'num_leaves':100,
'learning_rate':0.005,
'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class':3
}
gbm = lgb.train(
params,
lgb_train,
num_boost_round=3000,
valid_sets=[lgb_train, lgb_test],
early_stopping_rounds=10
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17747
[LightGBM] [Info] Number of data points in the train set: 5224, number of used features: 70
[LightGBM] [Info] Start training from score -1.062703
[LightGBM] [Info] Start training from score -1.098804
[LightGBM] [Info] Start training from score -1.135661
[1]	training's multi_logloss: 1.09439	valid_1's multi_logloss: 1.09528
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 1.09041	valid_1's multi_logloss: 1.09264
[3]	training's multi_logloss: 1.08675	valid_1's multi_logloss: 1.09011
[4]	training's multi_logloss: 1.08293	valid_1's multi_logloss: 1.08743
[5]	training's multi_logloss: 1.0792	valid_1's multi_logloss: 1.08488
[6]	training's multi_logloss: 1.07528	valid_1's multi_logloss: 1.08219
[7]	training's multi_logloss: 1.07161	valid_1's multi_logloss: 1.07956
[8]	training's multi_logloss: 1.06801	valid_1's multi_logloss: 1.07704
[9]	training's multi_logl

In [52]:
predicted_y = gbm.predict(X_test)
pred_y = [np.argmax(y) for y in predicted_y]
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77       469
           1       0.67      0.69      0.68       410
           2       0.77      0.72      0.74       428

    accuracy                           0.73      1307
   macro avg       0.73      0.73      0.73      1307
weighted avg       0.73      0.73      0.73      1307



# Classifier Performance of LOSO

In [None]:
def loso_validation():

    dr_feat_path = r'X:\IDEaS_2\Driving Simulator\Data\Subjectwise\ecg_eda_t2_scld'
    mat_feat_path = r'X:\IDEaS_2\MATBII\Data\Subjectwise\ecg_eda_t2_scld'
    bas_feat_path = r'X:\IDEaS_2\Driving Simulator\Data\Subjectwise\base_ecg_eda_t2'

    xtrainMat, yMat = load_matb(mat_feat_path, 'label')
    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')

    subjects = os.listdir(dr_feat_path)

    for sdriv in subjects:
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        ytestDriv = xtestDriv['label'].copy()

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[selected_cols].copy()
        xtestDriv = xtestDriv[selected_cols].copy()
        ytrainDriv = list(XtrainDriv['label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 2

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 2

        paramsrf = {'n_estimators': 3000,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt',
        'max_depth': 50,
        'bootstrap': False, 'verbose':0, 'n_jobs': -1, 'class_weight': 'balanced'}

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        clf4 = SVC(C=1000, probability=True, class_weight='balanced')

        eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 
        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))
    return

loso_validation()

In [25]:
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)        
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.35      1.00      0.52        19
           1       1.00      0.19      0.31        43

    accuracy                           0.44        62
   macro avg       0.68      0.59      0.42        62
weighted avg       0.80      0.44      0.38        62

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.35      0.71      0.47        24
           1       0.84      0.53      0.65        68

    accuracy                           0.58        92
   macro avg       0.59      0.62      0.56        92
weighted avg       0.71      0.58      0.60        92

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.20      1.00      0.33         2
           1       1.00      0.50      0.67        16

    accuracy                           0.56        18
   macro avg       0.60      0.75      0.50        18
weigh

In [15]:
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        if sdriv in ['1868.csv', '1744.csv']:
            continue
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)        
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.001,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced', kernel='rbf')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)] #  
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.38      0.95      0.55        19
           1       0.93      0.33      0.48        43

    accuracy                           0.52        62
   macro avg       0.66      0.64      0.51        62
weighted avg       0.76      0.52      0.50        62

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.34      0.88      0.49        24
           1       0.90      0.40      0.55        68

    accuracy                           0.52        92
   macro avg       0.62      0.64      0.52        92
weighted avg       0.75      0.52      0.53        92

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.20      1.00      0.33         2
           1       1.00      0.50      0.67        16

    accuracy                           0.56        18
   macro avg       0.60      0.75      0.50        18
weigh

In [27]:
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted_0\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)        
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.34      1.00      0.51        19
           1       1.00      0.14      0.24        43

    accuracy                           0.40        62
   macro avg       0.67      0.57      0.38        62
weighted avg       0.80      0.40      0.33        62

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.33      0.71      0.45        24
           1       0.82      0.49      0.61        68

    accuracy                           0.54        92
   macro avg       0.58      0.60      0.53        92
weighted avg       0.70      0.54      0.57        92

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.11      0.50      0.18         2
           1       0.89      0.50      0.64        16

    accuracy                           0.50        18
   macro avg       0.50      0.50      0.41        18
weigh

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.34      0.97      0.50        38
           1       0.97      0.31      0.47       104

    accuracy                           0.49       142
   macro avg       0.65      0.64      0.49       142
weighted avg       0.80      0.49      0.48       142

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.25      0.94      0.39        31
           1       0.95      0.32      0.48       128

    accuracy                           0.44       159
   macro avg       0.60      0.63      0.44       159
weighted avg       0.82      0.44      0.46       159

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.21      0.90      0.34        21
           1       0.89      0.18      0.30        87

    accuracy                           0.32       108
   macro avg       0.55      0.54      0.32       108
weigh

KeyboardInterrupt: 

In [None]:
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        if sdriv not in ['1629.csv']:
            continue
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)  
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

# Corrected MatbII features!

In [26]:
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)        
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv.reset_index(inplace=True, drop=True)
        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv.reset_index(inplace=True, drop=True)
        XtrainDriv = XtrainDriv.append(xtrainBas)
        XtrainDriv.reset_index(inplace=True, drop=True)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.37      1.00      0.54        19
           1       1.00      0.23      0.38        43

    accuracy                           0.47        62
   macro avg       0.68      0.62      0.46        62
weighted avg       0.81      0.47      0.43        62

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.34      0.88      0.49        24
           1       0.90      0.41      0.57        68

    accuracy                           0.53        92
   macro avg       0.62      0.64      0.53        92
weighted avg       0.76      0.53      0.55        92

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.12      1.00      0.22         2
           1       1.00      0.12      0.22        16

    accuracy                           0.22        18
   macro avg       0.56      0.56      0.22        18
weigh

In [8]:
# Fixed missing sampling rate while calculating phasic and tonic response.
def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        # if sdriv not in ['1629.csv']:
        #     continue
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        # xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)  
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                # train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                # train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        XtrainDriv.dropna(inplace=True)
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.33      0.79      0.47        38
           1       0.89      0.51      0.65       122

    accuracy                           0.57       160
   macro avg       0.61      0.65      0.56       160
weighted avg       0.75      0.57      0.60       160

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.28      0.97      0.43        31
           1       0.98      0.38      0.55       128

    accuracy                           0.50       159
   macro avg       0.63      0.68      0.49       159
weighted avg       0.84      0.50      0.53       159

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.20      0.88      0.33        24
           1       0.95      0.40      0.56       136

    accuracy                           0.47       160
   macro avg       0.58      0.64      0.45       160
weigh

In [8]:
# standardized baseline before normalizing features

def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for sdriv in subjects:
        # if sdriv not in ['1629.csv']:
        #     continue
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        # xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)  
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                # train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                # train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        XtrainDriv.dropna(inplace=True)
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))            

losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.31      0.74      0.44        38
           1       0.86      0.49      0.62       122

    accuracy                           0.55       160
   macro avg       0.58      0.61      0.53       160
weighted avg       0.73      0.55      0.58       160

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.25      0.97      0.40        31
           1       0.98      0.31      0.47       128

    accuracy                           0.44       159
   macro avg       0.61      0.64      0.44       159
weighted avg       0.83      0.44      0.46       159

Test Subject: 1106.csv
              precision    recall  f1-score   support

           0       0.20      0.79      0.32        24
           1       0.92      0.45      0.60       136

    accuracy                           0.50       160
   macro avg       0.56      0.62      0.46       160
weigh

In [8]:
# CHANGED PEAK NaN VALUES TO 10000
# DROPPED BAD SUBJECTS!

def losoValidation(folder):
    dr_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\{}'.format(folder) # Norm_ECG_EDA_Features_Combined_scld
    mat_feat_path = r'X:\RealTimeSegment\MatbII\Extracted\{}'.format(folder)
    bas_feat_path = r'X:\RealTimeSegment\Driving Simulator\Extracted\Norm_ECG_EDA_Features_Baseline_Combined'

    xtrainBas, yBas = load_viragebase(bas_feat_path, 'label')
    xtrainMat, yMat = load_matb(mat_feat_path, 'scaled label')

    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    test_cls = {}
    train_cls = {}
    for sdriv in subjects:
        # if sdriv not in ['1629.csv']:
        #     continue
        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))

        # xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
        xtestDriv = xtestDriv[SELECTCOLS].copy()
        xtestDriv.replace([np.inf, -np.inf], 0, inplace=True)  
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['label'].copy())

        xtrainDriv = pd.DataFrame()
        for subTrain in subjects:
            if subTrain != sdriv:
                train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

                # train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0, inplace=True)
                train.replace([np.inf, -np.inf], 0, inplace=True)        
                # train.dropna(inplace=True)
                xtrainDriv = xtrainDriv.append(train)
                xtrainDriv.reset_index(drop=True, inplace=True)

        XtrainDriv = xtrainDriv[SELECTCOLS].copy()
        XtrainDriv.dropna(inplace=True)
        ytrainDriv = list(XtrainDriv['scaled label'].copy())

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        xtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        XtrainDriv = XtrainDriv.append(xtrainMat)
        XtrainDriv = XtrainDriv.append(xtrainBas)

        ytrain = ytrainDriv + yMat + yBas
        X = XtrainDriv.values

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        paramsrf = {
            'n_estimators': 3000,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'max_depth': 50,
            'bootstrap': False
            }

        paramlgbm = {
            'n_estimators': 3000,
            'num_leaves': 100,
            'learning_rate': 0.05,
            'class_weight': 'balanced',
            'random_state': 24
            }

        clf2 = RandomForestClassifier(**paramsrf)
        clf3 = lightgbm.LGBMClassifier(**paramlgbm)
        clf4 = SVC(C=700, probability=True, class_weight='balanced')

        estimatorList = [('rf', clf2), ('svm', clf4), ('gbm', clf3)]
        eclf = VotingClassifier(estimators=estimatorList, voting='soft')

        # clf2 = RandomForestClassifier(**paramsrf)
        # clf3 = lightgbm.LGBMClassifier(n_estimators = 3000, num_leaves=100, learning_rate=0.005, class_weight='balanced')
        # clf4 = SVC(C=1000, probability=True, class_weight='balanced')
        # eclf = VotingClassifier(estimators=[('rf', clf2), ('gbm', clf3), ('svm', clf4)], voting='soft') # 

        hist = eclf.fit(X, ytrain)
        yPred = hist.predict(xtestDriv)
        ytrainpred = hist.predict(X)
        
        test_cls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
        train_cls[sdriv] = classification_report(ytrain, ytrainpred, zero_division=1, output_dict=True)
        
        print('---------------------------------------------------\n')
        print('Test Subject: {}'.format(sdriv))
        print(classification_report(ytestDriv, yPred, zero_division=1))

        print('Train Subject: {}'.format(sdriv))
        print(classification_report(ytrain, ytrainpred, zero_division=1))
        print('---------------------------------------------------\n')

    return train_cls, test_cls
train_cls, test_cls = losoValidation('Norm_ECG_EDA_Features_Combined_scld')
    # return XtrainDriv, y_train

---------------------------------------------------

Test Subject: 1030.csv
              precision    recall  f1-score   support

           0       0.32      0.74      0.45        38
           1       0.86      0.52      0.65       122

    accuracy                           0.57       160
   macro avg       0.59      0.63      0.55       160
weighted avg       0.73      0.57      0.60       160

Train Subject: 1030.csv
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3600
           1       1.00      1.00      1.00      3307

    accuracy                           1.00      6907
   macro avg       1.00      1.00      1.00      6907
weighted avg       1.00      1.00      1.00      6907

---------------------------------------------------

---------------------------------------------------

Test Subject: 1105.csv
              precision    recall  f1-score   support

           0       0.26      1.00      0.42        31
        

In [22]:
acc=[]
f_1=[]
for key in test_cls.keys():
    if key in ['1868.csv', '1717.csv', '1547.csv']:
        continue

    accuracy_ = test_cls[key]['accuracy']
    acc.append(accuracy_)
    fscore_ = test_cls[key]['macro avg']['f1-score']
    f_1.append(fscore_)

# Average acc and f1
print("Mean accuracy: {}".format(np.mean(acc)))
print("Mean f1: {}".format(np.mean(f_1)))

Mean accuracy: 0.6254702294699697
Mean f1: 0.5544823378126126


In [24]:
len(acc)

18