# Hyper Parameter Tuning for classifier for four modes

In [1]:
import numpy as np
import pandas as pd
import os

# Classifier

In [2]:
import pandas as pd
import numpy as np
import os
# from os import path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report
from numpy import vstack, hstack, stack
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
import lightgbm
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [3]:
from Training_Code.config import SELECTCOLS, ECG_SELECTCOLS, EDA_SELECTCOLS, SELECTFOUR
from sklearn.utils import shuffle

### Label

In [4]:
def make_kfolds(dr_feat_path):
    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()
    for subTrain in subjects:
        train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))
        train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(train).values.sum():
            cinf = np.isinf(train).values.sum()
            print("Train Dataframe contains {} values".format(cinf))
        train.replace([np.inf], 9999, inplace=True)        
        train.replace([-np.inf], -9999, inplace=True)        

        train['scrNumPeaks'] = train['scrNumPeaks'].values.astype(int)
        train['scrNumPeaks'] = train['scrNumPeaks'].values.clip(min=0) # converting negatives to zero

        train.dropna(inplace=True)
        xtrainDriv = xtrainDriv.append(train)
        xtrainDriv.reset_index(drop=True, inplace=True)

    return xtrainDriv.copy()

In [17]:
def read_dataset(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\All\{}\Combine'.format(dataset) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\All\{}\Combine'.format(dataset) # ECG_EDA_Base2_Features_Combined

    XtrainDriv = make_kfolds(dr_feat_path)
    # XtrainBase = make_kfolds(bs_feat_path)

    XtrainDriv = XtrainDriv[SELECTFOUR].copy()
    ytrainDriv = list(XtrainDriv['scaled label'].copy())

    # XtrainBase = XtrainBase[SELECTFOUR[:-3]].copy()
    # ytrainBase = XtrainBase.shape[0] * [0]

    XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
    # XtrainDriv = XtrainDriv.append(XtrainBase)

    ytrain = ytrainDriv # + ytrainBase

    X = XtrainDriv.values
    
    X, ytrain = shuffle(X, ytrain, random_state=42)

    for idx, val in enumerate(ytrain):
        if val <= 4:
            ytrain[idx] = 0
        else: ytrain[idx] = 1

    return X, ytrain

## Hyperparameter tuning

In [84]:
data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

In [85]:
X_train.shape

(2352, 315)

In [86]:
random_grid = {'bootstrap': [True, False],
 'max_depth': [30, 50, 70, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 800, 1000, 2000, 3000], 'class_weight': ['balanced']}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'class_weight': ['balanced'],
                                        'max_depth': [30, 50, 70, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 800, 1000, 2000,
                                                         3000]},
                   random_state=42, verbose=2)

In [87]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'class_weight': 'balanced',
 'bootstrap': False}

In [None]:
{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': False}

In [88]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71       260
           1       0.76      0.81      0.78       329

    accuracy                           0.75       589
   macro avg       0.75      0.75      0.75       589
weighted avg       0.75      0.75      0.75       589



In [20]:
data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

In [21]:
X_train.shape

(2141, 315)

In [22]:
random_grid = {'bootstrap': [True, False],
 'max_depth': [30, 50, 70, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 800, 1000, 2000, 3000]}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [30, 50, 70, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 800, 1000, 2000,
                                                         3000]},
                   random_state=42, verbose=2)

In [23]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': False}

In [24]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.87      0.69      0.77       205
           1       0.83      0.94      0.88       331

    accuracy                           0.84       536
   macro avg       0.85      0.81      0.82       536
weighted avg       0.84      0.84      0.84       536



In [25]:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(model)
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=False, max_depth=30,
                                                 n_estimators=2000))

# SVM

In [60]:
data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

parameters = {'C': [1, 5, 15, 20], 'kernel':['rbf'], 'class_weight': ['balanced']}

svc_ = SVC()
rf_random = RandomizedSearchCV(estimator = svc_, param_distributions = parameters, n_iter = 50, cv = 10, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 10 folds for each of 4 candidates, totalling 40 fits


RandomizedSearchCV(cv=10, estimator=SVC(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [1, 5, 15, 20],
                                        'class_weight': ['balanced'],
                                        'kernel': ['rbf']},
                   random_state=42, verbose=2)

In [61]:
X_train.shape

(2352, 315)

In [62]:
rf_random.best_params_

{'kernel': 'rbf', 'class_weight': 'balanced', 'C': 5}

In [63]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.69      0.73      0.71       260
           1       0.78      0.74      0.76       329

    accuracy                           0.74       589
   macro avg       0.74      0.74      0.74       589
weighted avg       0.74      0.74      0.74       589



In [57]:
data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

parameters = {'C': [1, 5, 15, 20], 'kernel':['rbf'], 'class_weight': ['balanced']}

svc_ = SVC()
rf_random = RandomizedSearchCV(estimator = svc_, param_distributions = parameters, n_iter = 50, cv = 10, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 10 folds for each of 4 candidates, totalling 40 fits


RandomizedSearchCV(cv=10, estimator=SVC(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [1, 5, 15, 20],
                                        'class_weight': ['balanced'],
                                        'kernel': ['rbf']},
                   random_state=42, verbose=2)

In [58]:
rf_random.best_params_

{'kernel': 'rbf', 'class_weight': 'balanced', 'C': 20}

In [59]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76       205
           1       0.84      0.88      0.86       331

    accuracy                           0.82       536
   macro avg       0.82      0.81      0.81       536
weighted avg       0.82      0.82      0.82       536



# Logistic Regression

In [75]:
from sklearn.preprocessing import StandardScaler

data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)


parameters_lr = {'C': [0.5, 1, 5, 10], 'max_iter': [400, 500, 600, 1000], 'class_weight': ['balanced']}

lr_ = LogisticRegression()
rf_random = RandomizedSearchCV(estimator = lr_, param_distributions = parameters_lr, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 16 candidates, totalling 80 fits


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.5, 1, 5, 10],
                                        'class_weight': ['balanced'],
                                        'max_iter': [400, 500, 600, 1000]},
                   random_state=42, verbose=2)

In [76]:
rf_random.best_params_

{'max_iter': 400, 'class_weight': 'balanced', 'C': 0.5}

In [77]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.61      0.65      0.63       260
           1       0.71      0.67      0.69       329

    accuracy                           0.66       589
   macro avg       0.66      0.66      0.66       589
weighted avg       0.67      0.66      0.66       589



In [72]:
from sklearn.preprocessing import StandardScaler

data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')

scaler = StandardScaler()
data = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)


parameters_lr = {'C': [0.5, 1, 5, 10], 'max_iter': [1000, 1500], 'class_weight': ['balanced']}

lr_ = LogisticRegression()
rf_random = RandomizedSearchCV(estimator = lr_, param_distributions = parameters_lr, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 8 candidates, totalling 40 fits


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.5, 1, 5, 10],
                                        'class_weight': ['balanced'],
                                        'max_iter': [1000, 1500]},
                   random_state=42, verbose=2)

In [73]:
rf_random.best_params_

{'max_iter': 1000, 'class_weight': 'balanced', 'C': 0.5}

In [74]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.65      0.79      0.71       205
           1       0.85      0.74      0.79       331

    accuracy                           0.76       536
   macro avg       0.75      0.76      0.75       536
weighted avg       0.77      0.76      0.76       536



# LGBM

In [78]:
data, label = read_dataset('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

import lightgbm as lgb
lgbmcls = lgb.LGBMClassifier()
default_params = lgbmcls.get_params()

params = {'boosting_type': ['gbdt', 'dart'],
 'class_weight': ['balanced'],
 'importance_type': ['split', 'gains'],
 'learning_rate': [0.1, 0.01, 0.05],
 'n_estimators': [1000, 500, 200, 1500],
 'num_leaves': [100, 500, 1000],
 'reg_lambda': [0.01, 0.001]}

lgb_ = lgb.LGBMClassifier()
rf_random = RandomizedSearchCV(estimator = lgb_, param_distributions = params, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=50, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart'],
                                        'class_weight': ['balanced'],
                                        'importance_type': ['split', 'gains'],
                                        'learning_rate': [0.1, 0.01, 0.05],
                                        'n_estimators': [1000, 500, 200, 1500],
                                        'num_leaves': [100, 500, 1000],
                                        'reg_lambda': [0.01, 0.001]},
                   random_state=42, verbose=2)

In [79]:
rf_random.best_params_

{'reg_lambda': 0.01,
 'num_leaves': 100,
 'n_estimators': 1000,
 'learning_rate': 0.1,
 'importance_type': 'gains',
 'class_weight': 'balanced',
 'boosting_type': 'dart'}

In [80]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       260
           1       0.76      0.83      0.80       329

    accuracy                           0.76       589
   macro avg       0.76      0.75      0.76       589
weighted avg       0.76      0.76      0.76       589



In [81]:
data, label = read_dataset('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.20, random_state=42, shuffle=True)

import lightgbm as lgb
lgbmcls = lgb.LGBMClassifier()
default_params = lgbmcls.get_params()

params = {'boosting_type': ['gbdt', 'dart'],
 'class_weight': ['balanced'],
 'importance_type': ['split', 'gains'],
 'learning_rate': [0.1, 0.01, 0.05],
 'n_estimators': [1000, 500, 200, 1500],
 'num_leaves': [100, 500, 1000],
 'reg_lambda': [0.01, 0.001]}

lgb_ = lgb.LGBMClassifier()
rf_random = RandomizedSearchCV(estimator = lgb_, param_distributions = params, n_iter = 25, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=25, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart'],
                                        'class_weight': ['balanced'],
                                        'importance_type': ['split', 'gains'],
                                        'learning_rate': [0.1, 0.01, 0.05],
                                        'n_estimators': [1000, 500, 200, 1500],
                                        'num_leaves': [100, 500, 1000],
                                        'reg_lambda': [0.01, 0.001]},
                   random_state=42, verbose=2)

In [82]:
rf_random.best_params_

{'reg_lambda': 0.01,
 'num_leaves': 100,
 'n_estimators': 1000,
 'learning_rate': 0.1,
 'importance_type': 'gains',
 'class_weight': 'balanced',
 'boosting_type': 'dart'}

In [83]:
model = rf_random.best_estimator_
hist = model.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       205
           1       0.86      0.91      0.89       331

    accuracy                           0.85       536
   macro avg       0.85      0.84      0.84       536
weighted avg       0.85      0.85      0.85       536

