In [None]:
import copy as cp
import csv
import os
import pathlib
import pickle
import re
import statistics

import numpy as np
import pandas as pd
import scipy.signal as signal
import scipy.stats as stats
import wfdb
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.base import clone
from tqdm import tqdm

In [None]:
import timeit

# Ask the inline matplotlib backend to emit figures in the "retina" format.
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.preprocessing import PolynomialFeatures

import sklearn.linear_model as skl_lm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeRegressor 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import export_graphviz
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import lightgbm
import lightgbm as lgb
from sklearn.preprocessing import LabelBinarizer

import catboost as cb
from catboost import CatBoostClassifier

In [None]:
# Project root: two directory levels above the notebook's working directory.
parent_path = str(pathlib.Path(os.getcwd()).parent.parent)

# Record identifiers are the first space-delimited field of each row in the
# subject list.
records = os.path.normpath(parent_path + '/mit-bih-dataframes/subject_list.csv')
# newline='' hands line-ending handling to the csv module, as its documentation
# recommends for files passed to csv.reader.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # csv.reader yields [] for a blank line; skip those instead of failing with
    # IndexError on row[0].
    rlist = [row[0] for row in recordreader if row]

In [None]:
# One column per reported metric; every model cell appends exactly one value to
# each list, so row i across all lists describes the i-th evaluated model.
performance_dict = {
    column: []
    for column in (
        "Model name",
        "Avg Accuracy",
        "Std Accuracy",
        "Sensitivity",
        "Specificity",
        "Precision",
        "F1 score",
        "Run time",
        "TPS",
    )
}

In [None]:
# Load the per-record feature table for every record id, keeping subjectID as
# a string while parsing.
feature_dfs = {
    record: pd.read_csv(
        os.path.normpath(parent_path + '/mit-bih-time-features/' + record + '.csv'),
        index_col=0,
        dtype={'subjectID': str},
    )
    for record in tqdm(rlist)
}

# Stack all records into a single frame, leaving out the first row of each table.
combined_features = pd.concat([frame[1:] for frame in feature_dfs.values()])

In [None]:
# Feature matrix: the nine transition columns followed by four summary statistics.
X = combined_features[[
    'StoS', 'StoR', 'StoL',
    'RtoS', 'RtoR', 'RtoL',
    'LtoS', 'LtoR', 'LtoL',
    'rmssd', 'iqr', 'mad', 'cov',
]]
# Binary target: Afib is encoded as 1, Non-Afib as 0.
y = combined_features['mappedLabel'].map({"Non-Afib": 0, "Afib": 1})
# Subject ids act as the grouping key, so each fold holds out one subject.
groups = combined_features['subjectID'].astype('int64')

# Materialise the folds once so every model cell evaluates on identical splits.
logo = LeaveOneGroupOut()
splits = [fold for fold in logo.split(X, y, groups=groups)]

In [None]:
# Logistic regression
# Leave-one-subject-out evaluation: fit on the training subjects of each fold,
# predict the held-out subject, and pool the predictions for summary metrics.
logreg = LogisticRegression(solver='liblinear')

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    logreg.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = logreg.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Logistic regression")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Linear discriminant analysis
# Leave-one-subject-out evaluation: fit on the training subjects of each fold,
# predict the held-out subject, and pool the predictions for summary metrics.
lda = LinearDiscriminantAnalysis()

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    lda.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = lda.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("LDA")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Quadratic discriminant analysis
# Leave-one-subject-out evaluation: fit on the training subjects of each fold,
# predict the held-out subject, and pool the predictions for summary metrics.
qda = QuadraticDiscriminantAnalysis()

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    qda.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = qda.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("QDA")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# KNN-CV
# Tune n_neighbors with a randomized search scored on the leave-one-subject-out
# folds, then evaluate the selected configuration on those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "n_neighbors": list(range(1, 16))
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(neighbors.KNeighborsClassifier(), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("KNN-CV")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Decision tree
# Tune max_depth with a randomized search scored on the leave-one-subject-out
# folds, then evaluate the selected configuration on those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(10, 16))
}

print("random search started")
# Seeding both the tree and the search makes a re-run reproduce the table.
randomsearch = RandomizedSearchCV(DecisionTreeClassifier(random_state=2), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Decision tree")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# random forest
# Tune depth and forest size with a randomized search scored on the
# leave-one-subject-out folds, then evaluate the selected configuration on
# those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(RandomForestClassifier(max_features='sqrt', random_state=2), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Random forest")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# bagging
# Bagged trees are a random forest in which every split may consider all
# features, i.e. max_features=None. With max_features='sqrt' this cell was an
# exact duplicate of the random forest cell and did not evaluate bagging.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(RandomForestClassifier(max_features=None, random_state=2), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Bagging")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# AdaBoost
# max_depth is a parameter of the boosted trees, not of AdaBoostClassifier, so
# a bare "max_depth" key makes the search fail with an invalid-parameter error.
# Give the ensemble an explicit tree weak learner and tune its depth through
# the nested parameter name.
ada = AdaBoostClassifier(DecisionTreeClassifier(), learning_rate = 0.1, algorithm="SAMME.R", random_state=2)
# The weak learner is exposed as `estimator` in newer scikit-learn releases and
# as `base_estimator` in older ones; use whichever this installation provides.
weak_learner = "estimator" if "estimator" in ada.get_params() else "base_estimator"
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    weak_learner + "__max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(ada, param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("AdaBoost")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Gradient boost (sklearn)
# Tune depth and number of boosting stages with a randomized search scored on
# the leave-one-subject-out folds, then evaluate the selected configuration on
# those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(GradientBoostingClassifier(learning_rate = 0.1, random_state = 2), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Gradient Boost")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# XGBoost
# Tune depth and number of boosting rounds with a randomized search scored on
# the leave-one-subject-out folds, then evaluate the selected configuration on
# those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(XGBClassifier(learning_rate = 0.1, use_label_encoder=False, verbose=None, eval_metric='logloss'), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)
    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("XGBoost")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# LightGBM
# Tune depth and number of boosting rounds with a randomized search scored on
# the leave-one-subject-out folds, then evaluate the selected configuration on
# those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "n_estimators": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(lightgbm.LGBMClassifier(learning_rate=0.09,verbose=0,random_state=2), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)

    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

# Feature importances of the model refit on all subjects. Index by X.columns so
# the cell does not depend on an X_train left over from an earlier cell, and
# plot every feature: the former [0:10] slice dropped the last three columns
# by position, not by importance.
Importance = pd.DataFrame({'Importance': randomsearch.best_estimator_.feature_importances_*100},
                          index = X.columns)
Importance.sort_values(by = 'Importance',
                       axis = 0,
                       ascending = True).plot(kind = 'barh',
                                              color = 'r',
                                              legend = False)
plt.xlabel('Variable Importance')
#plt.savefig('plot1.png')

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("LightGBM")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# CatBoost
# Tune depth and number of iterations with a randomized search scored on the
# leave-one-subject-out folds, then evaluate the selected configuration on
# those same folds.
# NOTE(review): selection and evaluation share the folds (no nested CV), so the
# reported scores remain mildly optimistic.
params = {
    "max_depth": list(range(1, 16)),
    "iterations": np.linspace(100, 500, 9).astype(int)
}

print("random search started")
# random_state pins which candidates are sampled so a re-run reproduces the table.
randomsearch = RandomizedSearchCV(CatBoostClassifier(learning_rate=0.1, loss_function='Logloss', verbose=None), param_distributions=params, cv=splits, random_state=2)
randomsearch.fit(X, y)
print("random search completed")

acc_score = []   # accuracy on each held-out subject
Truth = []       # pooled true labels across folds
Output = []      # pooled predicted labels across folds

total_time = 0   # seconds spent inside predict() only
total_tests = 0  # number of samples predicted

for train, test in tqdm(splits):
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # best_estimator_ was refit on every subject, so predicting with it here
    # would score samples it was trained on. Refit an unfitted copy carrying
    # the selected hyper-parameters on this fold's training subjects only.
    fold_model = clone(randomsearch.best_estimator_)
    fold_model.fit(X_train, y_train)

    # Only inference is timed; fitting is excluded from the run-time figures.
    start_time = timeit.default_timer()
    pred_values = fold_model.predict(X_test)
    elapsed = timeit.default_timer() - start_time

    acc_score.append(accuracy_score(y_test, pred_values))

    Truth.extend(y_test.to_numpy())
    Output.extend(pred_values)

    total_time += elapsed
    total_tests += len(X_test)

# Average prediction time per sample.
tps = total_time/total_tests

# Feature importances of the model refit on all subjects. Index by X.columns so
# the cell does not depend on an X_train left over from an earlier cell, and
# plot every feature: the former [0:10] slice dropped the last three columns
# by position, not by importance.
Importance = pd.DataFrame({'Importance': randomsearch.best_estimator_.feature_importances_*100},
                          index = X.columns)
Importance.sort_values(by = 'Importance',
                       axis = 0,
                       ascending = True).plot(kind = 'barh',
                                              color = 'r',
                                              legend = False)
plt.xlabel('Variable Importance')
#plt.savefig('plot1.png')

print("---Run time is %s seconds ---" % total_time)
print("---Run time per subset is %s seconds ---" % tps)
print()
print('Accuracy of each fold: \n {}'.format(acc_score))
print("Avg accuracy: {}".format(np.mean(acc_score)))
print('Std of accuracy : \n{}'.format(np.std(acc_score)))
print("confusion matrix: ")

# Fixing the label order guarantees the 2x2 layout [[TN, FP], [FN, TP]].
cm = confusion_matrix(Truth, Output, labels=[0, 1])
print(cm)
print("classification report: ")
print(classification_report(Truth, Output))

# Afib is encoded as 1 and is the positive class. The previous indexing used
# row 0 (Non-Afib) as the positive class, which swapped sensitivity and
# specificity and reported the negative predictive value as precision.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
precision = tp/(tp+fp)
f1_score = (2*precision*sensitivity)/(precision+sensitivity)

print(randomsearch.best_params_)
print(sensitivity)
print(specificity)
print(precision)
print(f1_score)

performance_dict['Model name'].append("Catboost")
performance_dict['Avg Accuracy'].append(np.mean(acc_score))
performance_dict['Std Accuracy'].append(np.std(acc_score))
performance_dict['Sensitivity'].append(sensitivity)
performance_dict['Specificity'].append(specificity)
performance_dict['Precision'].append(precision)
performance_dict['F1 score'].append(f1_score)
performance_dict['Run time'].append(total_time)
performance_dict['TPS'].append(tps)

In [None]:
# Assemble the collected per-model metrics into one table indexed by model name.
performance_table = pd.DataFrame(performance_dict)
performance_df = performance_table.set_index("Model name")
performance_df