In [11]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
import scipy.stats as stats
from sklearn import preprocessing
from tqdm import tqdm
import os
import pathlib
import re
import pandas as pd
import pickle
import csv
import statistics

In [12]:
import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.preprocessing import PolynomialFeatures

import sklearn.linear_model as skl_lm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeRegressor 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import export_graphviz
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import lightgbm
import lightgbm as lgb
from sklearn.preprocessing import LabelBinarizer

import catboost as cb
from catboost import CatBoostClassifier

In [13]:
parent_path = str(pathlib.Path(os.getcwd()).parent.parent)

rlist = []
records = os.path.normpath(parent_path + '/mit-bih-dataframes/subject_list.csv')
with open(records) as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        rlist.append(row[0])

In [32]:
performance_dict = {
    "Avg Accuracy": [],
    "Std Accuracy": [],
    "Sensitivity": [],
    "Specificity": [],
    "Precision": [],
    "F1 score": [],
    "Run time": [],
    "TPS": []
}

In [15]:
feature_dfs = {}
for record in tqdm(rlist):
    feature_dfs[record] = pd.read_csv(os.path.normpath(parent_path + '/mit-bih-time-features/'+record+'.csv'), index_col=0, dtype={'subjectID': str})

combined_features = pd.concat([feature_dfs[key][1:] for key in feature_dfs])

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 43.69it/s]


In [16]:
X = combined_features[['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'rmssd', 'iqr', 'mad', 'cov']]
y = combined_features['mappedLabel'].map({"Non-Afib": 0, "Afib": 1})
groups = combined_features['subjectID'].astype('int64')

logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [33]:
# Logistic regression
logreg = LogisticRegression(solver='liblinear')

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    logreg.fit(X_train, y_train)
    
    start_time = timeit.default_timer()
    pred_values = logreg.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)
    
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:19<00:00,  1.21it/s]


---Run time is 0.02649360003124457 seconds ---
---Run time per subset is 1.412562582641254e-07 seconds ---

Accuracy of each fold: 
 [0.9138755980861244, 0.9256382875448985, 0.9926093514328809, 0.9459573273441887, 0.9861826403718126, 0.9458880778588807, 0.8871329298363596, 0.9667703388443281, 0.9105485232067511, 0.8924560708151671, 0.9702322538436375, 0.9704510108864697, 0.8133986928104575, 0.9198590254367147, 0.4799002493765586, 0.9609475032010243, 0.9864260883959609, 0.9139486467730743, 0.6809977692151693, 0.9409355179704016, 0.9869267694821775, 0.9782543038356992, 0.9949525540076721]
Avg accuracy: 0.9114908491554958
Std of accuracy : 
0.1143626744796147
confusion matrix: 
[[94204  8681]
 [ 9458 75214]]
classification report: 
              precision    recall  f1-score   support

           0       0.91      0.92      0.91    102885
           1       0.90      0.89      0.89     84672

    accuracy                           0.90    187557
   macro avg       0.90      0.90      0.90

In [34]:
# Linear discriminant analysis
start_time = timeit.default_timer()

lda = LinearDiscriminantAnalysis()

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    lda.fit(X_train, y_train)
    
    start_time = timeit.default_timer()
    pred_values = lda.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)
    
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:05<00:00,  4.03it/s]


---Run time is 0.027353700017556548 seconds ---
---Run time per subset is 1.4584206410614666e-07 seconds ---

Accuracy of each fold: 
 [0.9142857142857143, 0.921366857586642, 0.9920060331825038, 0.936692869174621, 0.9860570280115564, 0.9472506082725061, 0.8944182918628111, 0.9661155671959404, 0.9035563592525618, 0.8952305456467169, 0.971540726202159, 0.9709694142042509, 0.8347494553376906, 0.9505056696291756, 0.44119700748129675, 0.947609901835254, 0.9892401920211885, 0.9231089521165857, 0.7191239099574123, 0.9310253699788583, 0.9852926156674497, 0.9620960434913923, 0.9917221885725823]
Avg accuracy: 0.9119635356942117
Std of accuracy : 
0.11660166415425957
confusion matrix: 
[[94870  8015]
 [10071 74601]]
classification report: 
              precision    recall  f1-score   support

           0       0.90      0.92      0.91    102885
           1       0.90      0.88      0.89     84672

    accuracy                           0.90    187557
   macro avg       0.90      0.90      0.90

In [35]:
# Quadratic discriminant analysis
start_time = timeit.default_timer()

qda = QuadraticDiscriminantAnalysis()

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    qda.fit(X_train, y_train)
    
    start_time = timeit.default_timer()
    pred_values = qda.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)
    
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:01<00:00, 14.82it/s]


---Run time is 0.06390159997681621 seconds ---
---Run time per subset is 3.407049589021802e-07 seconds ---

Accuracy of each fold: 
 [0.8829801777170199, 0.93214251043588, 0.9897435897435898, 0.877877596855699, 0.9888204999371938, 0.9279805352798054, 0.9058507061197041, 0.9467998035685055, 0.8230259192284509, 0.7884793235566125, 0.830334750845055, 0.9417660273025747, 0.7855119825708061, 0.3746552252528348, 0.519002493765586, 0.7633376013657703, 0.9566296970700215, 0.8438584316446912, 0.7710403569255729, 0.84223044397463, 0.9836584618527219, 0.9479009362730293, 0.9903089036947305]
Avg accuracy: 0.8527798249991515
Std of accuracy : 
0.14613036940370777
confusion matrix: 
[[92884 10001]
 [17978 66694]]
classification report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87    102885
           1       0.87      0.79      0.83     84672

    accuracy                           0.85    187557
   macro avg       0.85      0.85      0.85    18

In [38]:
# KNN-CV
start_time = timeit.default_timer()

params = {
    "n_neighbors": list(range(1, 16))
}

randomsearch = RandomizedSearchCV(neighbors.KNeighborsClassifier(), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

KeyboardInterrupt: 

In [21]:
# Decision tree
params = {
    "max_depth": list(range(10, 16))
}

randomsearch = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

100%|█████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 392.84it/s]


---Run time is 223.71726049999415 seconds ---

Accuracy of each fold: 
 [0.9063568010936432, 0.9353460829045723, 0.9921568627450981, 0.9271476698483998, 0.9952267303102625, 0.9604866180048662, 0.9275947097063438, 0.9857587166475692, 0.9563592525617842, 0.917162108600872, 0.9711045687493185, 0.9891135303265941, 0.8790849673202614, 0.985902543671468, 0.9427431421446384, 0.9833546734955185, 0.9812944876676047, 0.9898681471200556, 0.8053133238693977, 0.9796511627906976, 0.9939740578081911, 0.9842947749924494, 0.9958610942862911]
Avg accuracy: 0.955876348985474
Std of accuracy : 
0.045556118989562695
confusion matrix: 
[[96265  6620]
 [ 2192 82480]]
classification report: 
              precision    recall  f1-score   support

           0       0.98      0.94      0.96    102885
           1       0.93      0.97      0.95     84672

    accuracy                           0.95    187557
   macro avg       0.95      0.95      0.95    187557
weighted avg       0.95      0.95      0.95    1875

In [43]:
# random forest
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(RandomForestClassifier(max_features='sqrt', random_state=2), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

KeyboardInterrupt: 

In [39]:
# bagging
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(RandomForestClassifier(max_features=None, random_state=2), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2552442146.py, line 3)

In [None]:
# AdaBoost
params = {
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(AdaBoostClassifier(learning_rate = 0.1, algorithm="SAMME.R", random_state=2), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Gradient boost (sklearn)
params = {
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = -1, random_state = 2), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# XGBoost
params = {
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(XGBClassifier(learning_rate = 0.1, use_label_encoder=False, verbose=None, max_depth = 5, eval_metric='logloss'), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [None]:
# LightGBM
params = {
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(lightgbm.LGBMClassifier(learning_rate=0.09,max_depth=-1,verbose=0,random_state=2), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

#print(model.feature_importances_)
Importance = pd.DataFrame({'Importance':(randomsearch.best_estimator_.feature_importances_*100)[0:10]}, 
                      index = (X_train.columns)[0:10])
Importance.sort_values(by = 'Importance', 
                   axis = 0, 
                   ascending = True).plot(kind = 'barh', 
                                          color = 'r')
plt.xlabel('Variable Importance')
plt.gca().legend_ = None
#plt.savefig('plot1.png')

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# CatBoost
params = {
    "iterations": np.linspace(100, 500, 9).astype(int)
}

randomsearch = RandomizedSearchCV(CatBoostClassifier(learning_rate=0.1, loss_function='Logloss'), param_distributions=params, cv=splits)
randomsearch.fit(X, y)

acc_score = []
Truth = []
Output = []

total_time = 0
total_tests = 0

for train, test in tqdm(splits):
    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    start_time = timeit.default_timer()
    pred_values = randomsearch.best_estimator_.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

    Truth.extend(y_test.values.reshape(y_test.shape[0])) 
    Output.extend(pred_values)
    
    total_time+=elapsed
    total_tests+=len(X_test)

tps = total_time/total_tests

#print(model.feature_importances_)
Importance = pd.DataFrame({'Importance':(randomsearch.best_estimator_.feature_importances_*100)[0:10]}, 
                      index = (X_train.columns)[0:10])
Importance.sort_values(by = 'Importance', 
                   axis = 0, 
                   ascending = True).plot(kind = 'barh', 
                                          color = 'r')
plt.xlabel('Variable Importance')
plt.gca().legend_ = None
#plt.savefig('plot1.png')

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

cm = confusion_matrix(Truth, Output)
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
specificity = cm[1][1]/(cm[1][0]+cm[1][1])
precision = (cm[0][0])/(cm[0][0]+cm[1][0])
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)