In [None]:
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
from pathlib import Path, PurePath
import csv
import pandas as pd
import numpy as np
import pickle
from joblib import Memory
from shutil import rmtree

import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import RFE

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.inspection import permutation_importance

import catboost as cb
from catboost import CatBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm

In [None]:
# Build `rlist`, the list of record identifiers, from the first field of every
# row of subject_list.csv. The file is looked up two directory levels above the
# current working directory, inside 'mit-bih-dataframes/'.
rlist = []
records = PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-dataframes/subject_list.csv'))
# newline='' is what the csv module asks for when handing it a file object.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than failing with IndexError on row[0].
        if row:
            rlist.append(row[0])

In [None]:
# Column names of the model-comparison summary. Every column starts out as its
# own empty list (presumably one entry per evaluated model is appended later;
# that code is not part of this section).
_PERFORMANCE_COLUMNS = (
    "Model name",
    "Avg Accuracy",
    "Std Accuracy",
    "Sensitivity",
    "Specificity",
    "Precision",
    "F1 score",
    "Run time",
    "TPS",
)
performance_dict = {column: [] for column in _PERFORMANCE_COLUMNS}

# Empty mapping created alongside the summary table; nothing in the visible
# cells writes to it.
moving_accuracy = {}

In [None]:
# Load one feature table per record. Each parquet file is named after its
# record and lives in 'mit-bih-time-features/' two levels above the working
# directory.
feature_dfs = {}
for record in tqdm(rlist):
    parquet_path = PurePath(
        Path(os.getcwd()).parents[1],
        Path('mit-bih-time-features/' + record + '.parquet'),
    )
    feature_dfs[record] = pd.read_parquet(parquet_path)

# Stack every record into a single frame, leaving out the first row of each
# table (presumably because its features are not yet defined -- TODO confirm).
per_record_frames = [frame[1:] for frame in feature_dfs.values()]
combined_features = pd.concat(per_record_frames)

In [None]:
# Feature columns fed to the models, in the order they appear in X.
FEATURE_COLUMNS = [
    'StoS', 'StoR', 'StoL',
    'RtoS', 'RtoR', 'RtoL',
    'LtoS', 'LtoR', 'LtoL',
    'std', 'cov', 'range',
    'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr',
]
# Text label -> integer class. Labels missing from this map become NaN in y.
LABEL_TO_CLASS = {"Non-Afib": 0, "Afib": 1}

X = combined_features[FEATURE_COLUMNS]
y = combined_features['mappedLabel'].map(LABEL_TO_CLASS)
groups = combined_features['subjectID'].astype('int64')

# Leave-one-group-out cross-validation with subjectID as the group: each fold
# holds out all rows of one subject. The folds are materialised once so every
# grid search below is evaluated on the same (train, test) index pairs.
logo = LeaveOneGroupOut()
splits = [fold for fold in logo.split(X, y, groups=groups)]

In [None]:
def _collect_fold_scores(results, metric):
    """Return the per-fold values of ``metric`` stored in ``results``, ordered by fold number.

    Scans the index of ``results`` (a pandas Series) for labels of the form
    ``split<k>_test_<metric>``. Labels that are not present -- for example
    because they were removed by ``dropna`` -- are simply not included.
    """
    prefix = 'split'
    suffix = '_test_' + metric
    numbered = []
    for label in results.index:
        if not (isinstance(label, str) and label.startswith(prefix) and label.endswith(suffix)):
            continue
        fold = label[len(prefix):len(label) - len(suffix)]
        if fold.isdecimal():
            numbered.append((int(fold), label))
    # Numeric sort so that split10 comes after split9, not after split1.
    numbered.sort()
    return [results[label] for _, label in numbered]


def score_reporter(initial_results):
    """Print per-fold and aggregate cross-validation metrics for one grid-search candidate.

    Parameters
    ----------
    initial_results : pandas.Series
        One row of a ``cv_results_`` table, indexed by labels such as
        ``split0_test_accuracy``. The metrics read are ``accuracy``,
        ``specificity``, ``sensitivity``, ``precision``, ``f1_score``,
        ``elapsed`` and ``eps``.

    Notes
    -----
    Entries that are NaN are dropped before reporting, and their labels are
    printed first (sorted, so the output is deterministic). A fold can
    therefore be missing from one metric's list while still contributing to
    another's. The folds to report are discovered from the labels themselves,
    so the function does not depend on any module-level record list.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    results = initial_results.dropna()
    dropped_cols = sorted(set(initial_results.index).difference(results.index), key=str)

    print(dropped_cols)

    acc_scores = _collect_fold_scores(results, 'accuracy')
    spec_scores = _collect_fold_scores(results, 'specificity')
    sens_scores = _collect_fold_scores(results, 'sensitivity')
    prec_scores = _collect_fold_scores(results, 'precision')
    f1_scores = _collect_fold_scores(results, 'f1_score')
    elapsed_times = _collect_fold_scores(results, 'elapsed')
    eps_times = _collect_fold_scores(results, 'eps')

    print('---Run time of each fold: \n {}'.format(elapsed_times))
    print("Avg run time: {}".format(np.mean(elapsed_times)))
    print('---Run time per subset of each fold is: \n {}'.format(eps_times))
    print("Avg run time per subset: {}".format(np.mean(eps_times)))
    print()
    print('Accuracy of each fold: \n {}'.format(acc_scores))
    print("Avg accuracy: {}".format(np.mean(acc_scores)))
    print('Std of accuracy : \n{}'.format(np.std(acc_scores)))
    print()
    print('Specificity of each fold: \n {}'.format(spec_scores))
    print("Avg specificity: {}".format(np.mean(spec_scores)))
    print('Std of specificity: \n{}'.format(np.std(spec_scores)))
    print()
    print('Sensitivity of each fold: \n {}'.format(sens_scores))
    print("Avg sensitivity: {}".format(np.mean(sens_scores)))
    print('Std of sensitivity: \n{}'.format(np.std(sens_scores)))
    print()
    print('Precision of each fold: \n {}'.format(prec_scores))
    print("Avg precision: {}".format(np.mean(prec_scores)))
    print('Std of precision : \n{}'.format(np.std(prec_scores)))
    print()
    print('F1-scores of each fold: \n {}'.format(f1_scores))
    print("Avg F1-scores: {}".format(np.mean(f1_scores)))
    print('Std of F1-scores : \n{}'.format(np.std(f1_scores)))

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def scorer(clf, X, y):
    """Multi-metric scoring callable for ``GridSearchCV``; scores one test fold.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict``.
    X : array-like
        Test-fold features; ``len(X)`` is used as the fold size.
    y : array-like
        True labels of the fold, encoded 0 = Non-Afib, 1 = Afib.

    Returns
    -------
    dict
        ``sensitivity``, ``specificity``, ``precision``, ``f1_score``,
        ``accuracy``, ``elapsed`` (seconds spent in ``predict``) and ``eps``
        (``elapsed`` divided by the fold size).

    Notes
    -----
    * Afib (label 1) is the positive class, matching the label mapping used to
      build ``y``: sensitivity = TP/(TP+FN), specificity = TN/(TN+FP),
      precision = TP/(TP+FP).
    * The confusion matrix is always built for labels [0, 1], so a fold that
      contains a single class still yields a 2x2 matrix.
    * A ratio whose denominator is zero is reported as NaN.
    * Side effect: the running (cumulative) accuracy over the fold, in row
      order, is appended to the module-level list ``moving_acc``, which must
      exist before the search is fitted.
    """
    global moving_acc

    start_time = timeit.default_timer()
    y_pred = clf.predict(X)
    elapsed = timeit.default_timer() - start_time

    # Flatten both sides so a column-vector prediction compares element-wise.
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()

    # Cumulative fraction of correct predictions after each sample.
    hits = y_hat == y_true
    subject_acc = (np.cumsum(hits) / np.arange(1, hits.size + 1)).tolist()
    moving_acc.append(subject_acc)

    fold_size = len(X)

    # Rows are true labels, columns are predictions: [[TN, FP], [FN, TP]].
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()

    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    precision = _safe_ratio(tp, tp + fp)
    f1_score = _safe_ratio(2 * precision * sensitivity, precision + sensitivity)

    return {'sensitivity': sensitivity, 'specificity': specificity,
            'precision': precision, 'f1_score': f1_score,
            'accuracy': accuracy_score(y_true, y_hat),
            'elapsed': elapsed, 'eps': elapsed/fold_size}

In [None]:
# Suppress NumPy floating-point warnings (e.g. division by zero) for the rest
# of the session; affected results become inf/NaN silently.
np.seterr(all='ignore')

# One entry per model family: each grid-search cell appends its list of
# per-fold running accuracies here.
moving_accs = []

# Output directory for the pickled grid-search objects. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race.
os.makedirs('saved_gridsearch', exist_ok=True)

In [None]:
# CatBoost
# Grid search over a two-step pipeline: recursive feature elimination (RFE)
# ranked by an XGBoost model, followed by a CatBoost classifier.

# Estimator used only inside RFE to rank the features.
# NOTE(review): `verbose` is passed to XGBClassifier -- confirm the installed
# xgboost version accepts it (the documented option appears to be `verbosity`).
feature_selection_clf = XGBClassifier(n_estimators=100, max_depth=4, verbose=None, eval_metric='logloss', learning_rate=0.1, tree_method="gpu_hist")

# On-disk joblib cache handed to the Pipeline (memory=...) so fitted
# transformers can be reused instead of refitted.
location = "cache"
memory = Memory(location=location, verbose=10)
cb_pipe = Pipeline([
    # Keep 14 features as selected by RFE.
    ('rfe', RFE(estimator=feature_selection_clf,
                n_features_to_select=14)),
    # Final classifier; CatBoost is asked to train on GPU.
    ('clf', CatBoostClassifier(
                        learning_rate=0.1,
                        loss_function='Logloss',
                        task_type="GPU"))
], memory=memory)

# Hyper-parameter grid for the 'clf' step: iterations 450..525 in steps of 25,
# depth 3..6 (16 candidates).
params = {
    "clf__iterations": np.arange(450, 550, 25),
    "clf__depth": np.arange(3, 7)
}

# Reset the module-level list that `scorer` appends to on every scored fold.
moving_acc = []

# Timestamp taken just before the search.
# NOTE(review): `start_time` is not read again anywhere in this cell.
start_time = timeit.default_timer()

# `scorer` returns a dict of metrics; the candidate with the best mean
# 'accuracy' is refitted. `splits` holds the precomputed CV folds.
cb_search = GridSearchCV(cb_pipe, params, scoring=scorer, refit="accuracy", cv=splits, verbose=3)
cb_search.fit(X, y)

# Keep this model's running accuracies and persist the fitted search object.
moving_accs.append(moving_acc)
with open('saved_gridsearch/cb_search.pickle', 'wb') as handle:
    pickle.dump(cb_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Report the per-fold metrics of the best candidate only.
cb_scores = cb_search.cv_results_
best_results = pd.DataFrame(cb_scores).iloc[cb_search.best_index_]
score_reporter(best_results)

# Delete the temporary cache: clear joblib's store, then remove the directory.
memory.clear(warn=False)
rmtree(location)

In [None]:
# XGBoost
# Grid search over a two-step pipeline: recursive feature elimination (RFE)
# ranked by an XGBoost model, followed by an XGBoost classifier.

# Estimator used only inside RFE to rank the features.
# NOTE(review): `verbose` is passed to XGBClassifier -- confirm the installed
# xgboost version accepts it (the documented option appears to be `verbosity`).
feature_selection_clf = XGBClassifier(n_estimators=100, max_depth=4, verbose=None, eval_metric='logloss', learning_rate=0.1, tree_method="gpu_hist")

# On-disk joblib cache handed to the Pipeline (memory=...) so fitted
# transformers can be reused instead of refitted.
location = "cache"
memory = Memory(location=location, verbose=10)
xg_pipe = Pipeline([
    # Keep 14 features as selected by RFE.
    ('rfe', RFE(estimator=feature_selection_clf,
                n_features_to_select=14)),
    # Final classifier, configured with the GPU histogram tree method.
    ('clf', XGBClassifier(verbose=None, 
                          eval_metric='logloss', 
                          learning_rate=0.1, 
                          tree_method="gpu_hist"))
], memory=memory)

# Hyper-parameter grid for the 'clf' step: n_estimators 300..500 in steps of
# 50, max_depth 3..6 (20 candidates).
params = {
    "clf__n_estimators": np.arange(300, 550, 50),
    "clf__max_depth": np.arange(3, 7)
}

# Reset the module-level list that `scorer` appends to on every scored fold.
moving_acc = []

# Timestamp taken just before the search.
# NOTE(review): `start_time` is not read again anywhere in this cell.
start_time = timeit.default_timer()

# `scorer` returns a dict of metrics; the candidate with the best mean
# 'accuracy' is refitted. `splits` holds the precomputed CV folds.
xg_search = GridSearchCV(xg_pipe, params, scoring=scorer, refit="accuracy", cv=splits, verbose=3)
xg_search.fit(X, y)

# Keep this model's running accuracies and persist the fitted search object.
moving_accs.append(moving_acc)
with open('saved_gridsearch/xg_search.pickle', 'wb') as handle:
    pickle.dump(xg_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Report the per-fold metrics of the best candidate only.
xg_scores = xg_search.cv_results_
best_results = pd.DataFrame(xg_scores).iloc[xg_search.best_index_]
score_reporter(best_results)

# Delete the temporary cache: clear joblib's store, then remove the directory.
memory.clear(warn=False)
rmtree(location)

In [None]:
# LightGBM
# Grid search over a two-step pipeline: recursive feature elimination (RFE)
# ranked by an XGBoost model, followed by a LightGBM classifier.

# Estimator used only inside RFE to rank the features.
# NOTE(review): `verbose` is passed to XGBClassifier -- confirm the installed
# xgboost version accepts it (the documented option appears to be `verbosity`).
feature_selection_clf = XGBClassifier(n_estimators=100, max_depth=4, verbose=None, eval_metric='logloss', learning_rate=0.1, tree_method="gpu_hist")

# On-disk joblib cache handed to the Pipeline (memory=...) so fitted
# transformers can be reused instead of refitted.
location = "cache"
memory = Memory(location=location, verbose=10)
lg_pipe = Pipeline([
    # Keep 14 features as selected by RFE.
    ('rfe', RFE(estimator=feature_selection_clf,
                n_features_to_select=14)),
    # Final classifier; unlike the other two model cells, a random_state is
    # fixed here.
    ('clf', lightgbm.LGBMClassifier(learning_rate=0.1, 
                                    random_state=42))
], memory=memory)

# Hyper-parameter grid for the 'clf' step: n_estimators 200..500 in steps of
# 50, max_depth 3..6 (28 candidates).
params = {
    "clf__n_estimators": np.arange(200, 550, 50),
    "clf__max_depth": np.arange(3, 7)
}

# Reset the module-level list that `scorer` appends to on every scored fold.
moving_acc = []

# Timestamp taken just before the search.
# NOTE(review): `start_time` is not read again anywhere in this cell.
start_time = timeit.default_timer()

# `scorer` returns a dict of metrics; the candidate with the best mean
# 'accuracy' is refitted. `splits` holds the precomputed CV folds.
lg_search = GridSearchCV(lg_pipe, params, scoring=scorer, refit="accuracy", cv=splits, verbose=3)
lg_search.fit(X, y)

# Keep this model's running accuracies and persist the fitted search object.
moving_accs.append(moving_acc)
with open('saved_gridsearch/lg_search.pickle', 'wb') as handle:
    pickle.dump(lg_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Report the per-fold metrics of the best candidate only.
lg_scores = lg_search.cv_results_
best_results = pd.DataFrame(lg_scores).iloc[lg_search.best_index_]
score_reporter(best_results)

# Delete the temporary cache: clear joblib's store, then remove the directory.
memory.clear(warn=False)
rmtree(location)