In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import classification_report
import lightgbm
from sklearn.svm import SVC
from sklearn.utils import shuffle
import pickle
import main_utils_1
from sklearn.metrics import accuracy_score, f1_score
from datetime import datetime

In [2]:
from Training_Code.config import SELECTCOLS, ECG_SELECTCOLS, EDA_SELECTCOLS

In [3]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [4]:
funcs_for_matbii = [
    (RandomForestClassifier , {'n_estimators': 1000,
                                'min_samples_split': 5,
                                'min_samples_leaf': 1,
                                'max_features': 'auto',
                                'max_depth': 30,
                                'bootstrap': False, 'random_state': 24, 'class_weight': 'balanced'}),
    (LinearDiscriminantAnalysis, {'solver': 'lsqr'}), 
    (GradientBoostingClassifier, {'max_depth': 3, 'n_estimators': 300, 'max_features': 'auto'}), 
    (SVC , {'C': 800, 'kernel': 'poly'}), 
    (MLPClassifier, {'hidden_layer_sizes': (100, 10), 'learning_rate': 'adaptive', 'max_iter': 1000}),
    (LogisticRegression, {'C': 1, 'max_iter': 400}),
    (xgb.XGBClassifier, {'n_estimators': 300, 'learning_rate': 0.01, 
                         'use_label_encoder': False, 
                         'booster': 'dart', 'n_jobs': 4,
                         'reg_lambda': 0.0001, 'random_state': 24}),
    (lightgbm.LGBMClassifier, {'reg_lambda': 0.001,
                                'num_leaves': 100,
                                'n_estimators': 1500,
                                'learning_rate': 0.05,
                                'importance_type': 'gains',
                                'class_weight': 'balanced',
                                'boosting_type': 'dart',
                                'class_weight': 'balanced',
                                'boosting_type': 'dart',
                                'class_weight': 'balanced',
                                'random_state': 24})
]

funcs_for_virage = [
    (RandomForestClassifier , {'n_estimators': 2000,
                                'min_samples_split': 2,
                                'min_samples_leaf': 1,
                                'max_features': 'auto',
                                'max_depth': 30,
                                'bootstrap': False, 'random_state': 24, 'class_weight': 'balanced'}),
    (LinearDiscriminantAnalysis, {'solver': 'lsqr'}), 
    (GradientBoostingClassifier, {'max_depth': 3, 'n_estimators': 300, 'max_features': 'auto'}), 
    (SVC , {'C': 700, 'kernel': 'poly'}), 
    (MLPClassifier, {'hidden_layer_sizes': (100, 10), 'learning_rate': 'adaptive', 'max_iter': 1000}),
    (LogisticRegression, {'C': 0.5, 'max_iter': 400}),
    (xgb.XGBClassifier, {'n_estimators': 1000, 'learning_rate': 0.001, 
                         'use_label_encoder': False, 
                         'booster': 'dart', 'n_jobs': 4,
                         'reg_lambda': 0.0001, 'random_state': 24}),
    (lightgbm.LGBMClassifier, {'reg_lambda': 0.0,
                                'num_leaves': 50,
                                'n_estimators': 1000,
                                'learning_rate': 0.01,
                                'importance_type': 'split',
                                'class_weight': 'balanced',
                                'boosting_type': 'gbdt',
                                'class_weight': 'balanced',
                                'random_state': 24})
]

In [5]:
def make_loso(dr_feat_path):
    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()
    for subTrain in subjects:
        train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))
        train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(train).values.sum():
            cinf = np.isinf(train).values.sum()
            print("Train Dataframe contains {} values".format(cinf))
        train.replace([np.inf], 9999, inplace=True)        
        train.replace([-np.inf], -9999, inplace=True)        

        train['scrNumPeaks'] = train['scrNumPeaks'].values.astype(int)
        train['scrNumPeaks'] = train['scrNumPeaks'].values.clip(min=0) # converting negatives to zero

        train.dropna(inplace=True)
        xtrainDriv = xtrainDriv.append(train)
        xtrainDriv.reset_index(drop=True, inplace=True)

    return xtrainDriv.copy()

In [6]:
def read_dataset(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined

    XtrainDriv = make_loso(dr_feat_path)
    XtrainBase = make_loso(bs_feat_path)

    XtrainDriv = XtrainDriv[SELECTCOLS].copy()
    ytrainDriv = list(XtrainDriv['scaled label'].copy())

    XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()
    ytrainBase = XtrainBase.shape[0] * [0]

    XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
    XtrainDriv = XtrainDriv.append(XtrainBase)

    ytrain = ytrainDriv + ytrainBase

    X = XtrainDriv.values
    
    X, ytrain = shuffle(X, ytrain, random_state=42)

    for idx, val in enumerate(ytrain):
        if val <= 4:
            ytrain[idx] = 0
        else: ytrain[idx] = 1

    return X, ytrain

In [7]:
def mk_training_data(dr_feat_path, sdriv):
    subjects = os.listdir(dr_feat_path)
    xtrainDriv = pd.DataFrame()

    for subTrain in subjects:
        if subTrain != sdriv:
            train = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(subTrain)))

            train[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
            if np.isinf(train).values.sum():
                cinf = np.isinf(train).values.sum()
                print("Train Dataframe contains {} values".format(cinf))
            train.replace([np.inf], 9999, inplace=True)        
            train.replace([-np.inf], -9999, inplace=True)        

            train['scrNumPeaks'] = train['scrNumPeaks'].values.astype(int)
            train['scrNumPeaks'] = train['scrNumPeaks'].values.clip(min=0) # converting negatives to zero

            train.dropna(inplace=True)
            xtrainDriv = xtrainDriv.append(train)
            xtrainDriv.reset_index(drop=True, inplace=True)

    return xtrainDriv.copy()

In [8]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[SELECTCOLS].copy()
        XtestDriv = xtestDriv[SELECTCOLS].copy()  ### Look out for small x and X
        XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv + ytrainBase
        XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.775 and f1 score is: 0.7321428571428571

              precision    recall  f1-score   support

           0       0.52      0.79      0.62        38
           1       0.92      0.77      0.84       122

    accuracy                           0.78       160
   macro avg       0.72      0.78      0.73       160
weighted avg       0.83      0.78      0.79       160

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.74375 and f1 score is: 0.7034223970342239

              precision    recall  f1-score   support

           0       0.48      0.79      0.59        38
           1       0.92      0.73      0.81       122

 

In [9]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[SELECTCOLS].copy()
        XtestDriv = xtestDriv[SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.8 and f1 score is: 0.7187431333772798

              precision    recall  f1-score   support

           0       0.58      0.55      0.57        38
           1       0.86      0.88      0.87       122

    accuracy                           0.80       160
   macro avg       0.72      0.71      0.72       160
weighted avg       0.80      0.80      0.80       160

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.79375 and f1 score is: 0.6951325134245625

              precision    recall  f1-score   support

           0       0.58      0.47      0.52        38
           1       0.84      0.89      0.87       122

   

In [11]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[SELECTCOLS].copy()
        XtestDriv = xtestDriv[SELECTCOLS].copy()  ### Look out for small x and X
        XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv + ytrainBase
        XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.6805555555555556 and f1 score is: 0.6646792809413459

              precision    recall  f1-score   support

           0       0.70      0.51      0.59        98
           1       0.67      0.82      0.74       118

    accuracy                           0.68       216
   macro avg       0.69      0.67      0.66       216
weighted avg       0.68      0.68      0.67       216

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5879629629629629 and f1 score is: 0.47838827838827835

              precision    recall  f1-score   support

           0       0.74      0.14      0.24        98
           1       0.57      0.

In [10]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[SELECTCOLS].copy()
        XtestDriv = xtestDriv[SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5740740740740741 and f1 score is: 0.4174484052532833

              precision    recall  f1-score   support

           0       1.00      0.06      0.12        98
           1       0.56      1.00      0.72       118

    accuracy                           0.57       216
   macro avg       0.78      0.53      0.42       216
weighted avg       0.76      0.57      0.45       216

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5740740740740741 and f1 score is: 0.4174484052532833

              precision    recall  f1-score   support

           0       1.00      0.06      0.12        98
           1       0.56      1.0

### ECG Virage

In [12]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[ECG_SELECTCOLS].copy()
        XtestDriv = xtestDriv[ECG_SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.7125 and f1 score is: 0.628732849071832

              precision    recall  f1-score   support

           0       0.41      0.50      0.45        38
           1       0.83      0.78      0.81       122

    accuracy                           0.71       160
   macro avg       0.62      0.64      0.63       160
weighted avg       0.73      0.71      0.72       160

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.83125 and f1 score is: 0.7118270962577546

              precision    recall  f1-score   support

           0       0.79      0.39      0.53        38
           1       0.84      0.97      0.90       122

 

In [13]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[ECG_SELECTCOLS].copy()
        XtestDriv = xtestDriv[ECG_SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5555555555555556 and f1 score is: 0.4403540969448343

              precision    recall  f1-score   support

           0       0.55      0.11      0.19        98
           1       0.56      0.92      0.69       118

    accuracy                           0.56       216
   macro avg       0.55      0.52      0.44       216
weighted avg       0.55      0.56      0.46       216

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5648148148148148 and f1 score is: 0.3967914438502673

              precision    recall  f1-score   support

           0       1.00      0.04      0.08        98
           1       0.56      1.0

### EDA Virage

In [14]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[EDA_SELECTCOLS].copy()
        XtestDriv = xtestDriv[EDA_SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('Virage', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.8 and f1 score is: 0.7012138188608776

              precision    recall  f1-score   support

           0       0.60      0.47      0.53        38
           1       0.85      0.90      0.87       122

    accuracy                           0.80       160
   macro avg       0.72      0.69      0.70       160
weighted avg       0.79      0.80      0.79       160

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1030.csv ---
Test Subject: 1030.csv
----- Classification Report ------
Test accuracy for 1030.csv is: 0.8125 and f1 score is: 0.70703125

              precision    recall  f1-score   support

           0       0.65      0.45      0.53        38
           1       0.84      0.93      0.88       122

    accuracy

In [15]:
def losoValidation(dataset, folder, basefolder):
    dr_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, folder) # ECG_EDA_Features_Combined_scld
    bs_feat_path = r'X:\All Modes\{}\ECG EDA\Combined\{}'.format(dataset, basefolder) # ECG_EDA_Base2_Features_Combined
    date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")
    savePath_0 = f"X:/All Modes/Data Files/{date_time}"
    main_utils_1.mk_dirs(savePath_0)
    savePath1 = os.path.join(savePath_0, f'{dataset}')
    main_utils_1.mk_dirs(savePath1)
    savePath = os.path.join(savePath1, 'ECG EDA')
    main_utils_1.mk_dirs(savePath)

    mycls = {}

    if dataset == 'MatbII':
        parameter_list = funcs_for_matbii
    elif dataset == 'Virage':
        parameter_list = funcs_for_virage

    subjects = os.listdir(dr_feat_path)
    results_df = pd.DataFrame(columns=['dataset', 'method', 'test_subject', 'test_acc', 'test_f1'])
    for sdriv in subjects:
        main_utils_1.mk_dirs(os.path.join(savePath, sdriv))

        xtestDriv = pd.read_csv(os.path.join(dr_feat_path, '{}'.format(sdriv)))
        xtestDriv[['scrAmpDF_min','scrRecoveryTime_min', 'scrRiseTime_min']].fillna(0)
        if np.isinf(xtestDriv).values.sum():
            cinf = np.isinf(xtestDriv).values.sum()
            print("Dataframe contains {} values".format(cinf))

        xtestDriv.replace([np.inf], 9999, inplace=True)
        xtestDriv.replace([-np.inf], -9999, inplace=True)

        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.astype(int)
        xtestDriv['scrNumPeaks'] = xtestDriv['scrNumPeaks'].values.clip(min=0) # converting negatives to zero
        xtestDriv.dropna(inplace=True) # .reset_index(drop=True, inplace=True)

        ytestDriv = list(xtestDriv['scaled label'].copy()) 

        XtrainDriv = mk_training_data(dr_feat_path, sdriv)
        # XtrainBase = mk_training_data(bs_feat_path, sdriv)

        XtrainDriv = XtrainDriv[EDA_SELECTCOLS].copy()
        XtestDriv = xtestDriv[EDA_SELECTCOLS].copy()  ### Look out for small x and X
        # XtrainBase = XtrainBase[SELECTCOLS[:-3]].copy()

        ytrainDriv = list(XtrainDriv['scaled label'].copy())
        # ytrainBase = XtrainBase.shape[0] * [0]

        XtrainDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)
        XtestDriv.drop(columns=['label', 'complexity', 'scaled label'], inplace=True)

        ytrain = ytrainDriv #+ ytrainBase
        # XtrainDriv = XtrainDriv.append(XtrainBase)

        X = XtrainDriv.values
        X, ytrain = shuffle(X, ytrain, random_state=42)

        for idx, val in enumerate(ytrain):
            if val <= 4:
                ytrain[idx] = 0
            else: ytrain[idx] = 1

        for idx, val in enumerate(ytestDriv):
            if val <= 4:
                ytestDriv[idx] = 0
            else: ytestDriv[idx] = 1

        # training different classifier for all subjects and saving them in different dictionnaries
        mycls = {}
        for cls_modl, cls_parameters in parameter_list:
            print("--------------------------------------------")
            print(f"---- Training classifier {cls_modl.__name__} for subject: {sdriv} ---")

            classifier_save_path = os.path.join(savePath, sdriv, cls_modl.__name__)
            main_utils_1.mk_dirs(classifier_save_path)

            classifier_report = os.path.join(classifier_save_path, 'report')
            classifier_sav = os.path.join(classifier_save_path, 'classifier')
            main_utils_1.mk_dirs(classifier_report)
            main_utils_1.mk_dirs(classifier_sav)

            if cls_modl.__name__ in ['LogisticRegression', 'SVC']:
                scaler = StandardScaler()
                X = scaler.fit_transform(X)
            clf = cls_modl(**cls_parameters)
            hist = clf.fit(X, ytrain)

            yPred = hist.predict(XtestDriv)

            test_accuray = accuracy_score(ytestDriv, yPred)
            test_f1 = f1_score(ytestDriv, yPred, average='macro')
            
            results_df = results_df.append({'dataset': folder,
                                            'method':cls_modl.__name__,
                                            'test_subject': sdriv,
                                            'test_acc': test_accuray,
                                            'test_f1':test_f1}, ignore_index=True)
            print('Test Subject: {}'.format(sdriv))

            mycls[sdriv] = classification_report(ytestDriv, yPred, zero_division=1, output_dict=True)
            print("----- Classification Report ------")
            print(f"Test accuracy for {sdriv} is: {test_accuray} and f1 score is: {test_f1}\n")

            print(classification_report(ytestDriv, yPred, zero_division=1))
            with open(os.path.join(classifier_report, 'Test_fold_{}_report.pickle'.format(sdriv)), 'wb') as handle:
                pickle.dump(mycls, handle, protocol= pickle.HIGHEST_PROTOCOL)

            with open(os.path.join(classifier_sav, 'Test_fold_{}_report.sav'.format(sdriv)), 'wb') as handle:
                pickle.dump(hist, handle, protocol= pickle.HIGHEST_PROTOCOL)

        # estimatorPath = os.path.join(savePath, '{}.sav'.format(saveName))
        # pickle.dump(hist, open(estimatorPath, 'wb'))

    results_df.to_csv(os.path.join(savePath, 'results.csv'), index=False)
    return

losoValidation('MatbII', 'ECG_EDA_Features_Combined_scld', 'ECG_EDA_Base2_Features_Combined')
    # return XtrainDriv, y_train

--------------------------------------------
---- Training classifier RandomForestClassifier for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.6018518518518519 and f1 score is: 0.4755505364201016

              precision    recall  f1-score   support

           0       1.00      0.12      0.22        98
           1       0.58      1.00      0.73       118

    accuracy                           0.60       216
   macro avg       0.79      0.56      0.48       216
weighted avg       0.77      0.60      0.50       216

--------------------------------------------
---- Training classifier LinearDiscriminantAnalysis for subject: 1105.csv ---
Test Subject: 1105.csv
----- Classification Report ------
Test accuracy for 1105.csv is: 0.5925925925925926 and f1 score is: 0.45679012345679015

              precision    recall  f1-score   support

           0       1.00      0.10      0.19        98
           1       0.57      1.