In [1]:
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
from pathlib import Path, PurePath
import csv
import pandas as pd
import numpy as np
import pickle
from joblib import Memory
from shutil import rmtree
from collections import defaultdict

import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import RFECV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.inspection import permutation_importance

import catboost as cb
from catboost import CatBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm

In [2]:
rlist = []
records = PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-dataframes/subject_list.csv'))
with open(records) as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        rlist.append(row[0])

In [3]:
feature_dfs = {}
for record in tqdm(rlist):
    feature_dfs[record] = pd.read_parquet(PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-time-features/' + record+ '.parquet')))

combined_features = pd.concat([feature_dfs[key][1:] for key in feature_dfs])

100%|██████████| 23/23 [00:00<00:00, 84.57it/s]


In [4]:
X = combined_features[['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'std', 'cov', 'range', 'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr']]
y = combined_features['mappedLabel'].map({"Non-Afib": 0, "Afib": 1})
groups = combined_features['subjectID'].astype('int64')

logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [5]:
def score_reporter(initial_results):
    initial_columns = initial_results.axes[0].tolist()
    results = initial_results.dropna()
    changed_columns = results.axes[0].tolist()
    
    dropped_cols = list(set(initial_columns).difference(changed_columns))
    
    print(dropped_cols)

    acc_scores = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_accuracy'
        if col_name not in dropped_cols:
            acc_scores.append(results[col_name])

    spec_scores = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_specificity'
        if col_name not in dropped_cols:
            spec_scores.append(results[col_name])

    sens_scores = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_sensitivity'
        if col_name not in dropped_cols:
            sens_scores.append(results[col_name])


    prec_scores = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_precision'
        if col_name not in dropped_cols:
            prec_scores.append(results[col_name])

    f1_scores = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_f1_score'
        if col_name not in dropped_cols:
            f1_scores.append(results[col_name])
        
    elapsed_times = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_elapsed'
        if col_name not in dropped_cols:
            elapsed_times.append(results[col_name])

    eps_times = []
    for x in range(len(rlist)):
        col_name = 'split'+str(x)+'_test_eps'
        if col_name not in dropped_cols:
            eps_times.append(results[col_name])

    print('---Run time of each fold: \n {}'.format(elapsed_times))
    print("Avg run time: {}".format(np.mean(elapsed_times)))
    print('---Run time per subset of each fold is: \n {}'.format(eps_times))
    print("Avg run time per subset: {}".format(np.mean(eps_times)))
    print()
    print('Accuracy of each fold: \n {}'.format(acc_scores))
    print("Avg accuracy: {}".format(np.mean(acc_scores)))
    print('Std of accuracy : \n{}'.format(np.std(acc_scores)))
    print()
    print('Specificity of each fold: \n {}'.format(spec_scores))
    print("Avg specificity: {}".format(np.mean(spec_scores)))
    print('Std of specificity: \n{}'.format(np.std(spec_scores)))
    print()
    print('Sensitivity of each fold: \n {}'.format(sens_scores))
    print("Avg sensitivity: {}".format(np.mean(sens_scores)))
    print('Std of sensitivity: \n{}'.format(np.std(sens_scores)))
    print()
    print('Precision of each fold: \n {}'.format(prec_scores))
    print("Avg precision: {}".format(np.mean(prec_scores)))
    print('Std of precision : \n{}'.format(np.std(prec_scores)))
    print()
    print('F1-scores of each fold: \n {}'.format(f1_scores))
    print("Avg F1-scores: {}".format(np.mean(f1_scores)))
    print('Std of F1-scores : \n{}'.format(np.std(f1_scores)))

In [6]:
def scorer(clf, X, y):
    global moving_acc

    start_time = timeit.default_timer()
    y_pred = clf.predict(X)
    elapsed = timeit.default_timer() - start_time

    total_seen = 0
    total_correct = 0
    subject_acc = []
    for idx, pred in enumerate(y_pred):
        total_seen+=1
        if pred==y.iloc[idx]:
            total_correct+=1
        subject_acc.append(total_correct/total_seen)
    moving_acc.append(subject_acc)

    fold_size = len(X)

    cm = confusion_matrix(y, y_pred)

    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])
    f1_score = (2*precision*sensitivity)/(precision+sensitivity)

    return {'sensitivity': sensitivity, 'specificity': specificity,
            'precision': precision, 'f1_score': f1_score,
            'accuracy': accuracy_score(y, y_pred), 
            'elapsed': elapsed, 'eps': elapsed/fold_size}

In [7]:
np.seterr(all='ignore')
moving_accs = []

if os.path.exists('saved_gridsearch')==False:
    os.mkdir('saved_gridsearch')

In [8]:
def fit_transform_cacheable(transformer, X, y, **fit_params):
    if hasattr(transformer, "fit_transform"):
        res = transformer.fit_transform(X, y, **fit_params)
    else:
        res = transformer.fit(X, y, **fit_params).transform(X)

    return res, transformer

In [10]:
# CatBoost
feature_selection_clf = XGBClassifier(n_estimators=75, 
                                      max_depth=3, 
                                      eval_metric='logloss', 
                                      learning_rate=0.1, 
                                      tree_method="gpu_hist",
                                      verbosity=2)

location = "cache"
memory = Memory(location=location, verbose=10)
fit_transform_cached = memory.cache(fit_transform_cacheable)

params = {
    "iterations": np.arange(450, 550, 25),
    "depth": np.arange(3, 7)
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')

results = {}
for fit_params in param_grid:
    current_results = defaultdict(list)

    print(f'Fitting parameter combination: {fit_params}')
    Truth = []
    Output = []

    for i, (train, test) in enumerate(splits):
        X_train = X.iloc[train]
        y_train = y.iloc[train]

        X_test = X.iloc[test]
        y_test = y.iloc[test]

        rfe_start_time = timeit.default_timer()
        train_groups = groups.iloc[train]
        rfecv_splits = LeaveOneGroupOut().split(X_train, y_train, groups=train_groups)
        rfecv = RFECV(estimator=feature_selection_clf, 
                      cv=rfecv_splits)
        fit_transform_cached(rfecv, X_train, y_train)
        print(f'RFECV fitting took {timeit.default_timer()-rfe_start_time} seconds')

        clf_start_time = timeit.default_timer()
        clf = CatBoostClassifier(
                        learning_rate=0.1,
                        loss_function='Logloss',
                        task_type="GPU",
                        **fit_params)
        clf.fit(X_train, y_train)
        print(f'classifier fitting took {timeit.default_timer()-clf_start_time} seconds')

        pred_values = clf.predict(X_test)

        cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
        sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
        specificity = cm[1][1]/(cm[1][0]+cm[1][1])
        precision = (cm[0][0])/(cm[0][0]+cm[1][0])
        f1_score = (2*precision*sensitivity)/(precision+sensitivity)
        acc = accuracy_score(y_test, pred_values)

        current_results[f"split{i}_accuracy"].append(acc)
        current_results[f"split{i}_sensitivity"].append(sensitivity)
        current_results[f"split{i}_specificity"].append(specificity)
        current_results[f"split{i}_precision"].append(precision)
        current_results[f"split{i}_f1_score"].append(f1_score)

        print(f"split {i} complete - accuracy={acc}, sensitivity={sensitivity}, specificity={specificity}, precision={precision}, f1_score={f1_score}")

        current_results[f"split{i}_rfecv"].append(rfecv)
        current_results[f"split{i}_clf"].append(clf)
    results[tuple(sorted(fit_params.items()))] = current_results

with open('saved_gridsearch/cb_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(results)

# Delete the temporary cache before exiting
memory.clear(warn=False)
rmtree(location)

Fitting with 16 different parameter combinations
Fitting parameter combination: {'depth': 3, 'iterations': 450}
________________________________________________________________________________
[Memory] Calling __main__--tmp-ipykernel-2486386570.fit_transform_cacheable...
fit_transform_cacheable(RFECV(cv=<generator object BaseCrossValidator.split at 0x7f83f74a0d60>,
      estimator=XGBClassifier(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric='logloss',
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=0.1,
                              max_bin=None, max_cat_threshold=None,
                          

KeyboardInterrupt: 

In [None]:
# XGBoost
feature_selection_clf = XGBClassifier(n_estimators=100, max_depth=4, verbose=None, eval_metric='logloss', learning_rate=0.1, tree_method="gpu_hist")

location = "cache"
memory = Memory(location=location, verbose=10)
xg_pipe = Pipeline([
    ('rfe', RFE(estimator=feature_selection_clf,
                n_features_to_select=14)),
    ('clf', XGBClassifier(verbose=None, 
                          eval_metric='logloss', 
                          learning_rate=0.1, 
                          tree_method="gpu_hist"))
], memory=memory)

params = {
    "clf__n_estimators": np.arange(300, 550, 50),
    "clf__max_depth": np.arange(3, 7)
}

moving_acc = []

start_time = timeit.default_timer() #defines start time so computational time can be calculated

xg_search = GridSearchCV(xg_pipe, params, scoring=scorer, refit="accuracy", cv=splits, verbose=3)
xg_search.fit(X, y)

moving_accs.append(moving_acc)
with open('saved_gridsearch/xg_search.pickle', 'wb') as handle:
    pickle.dump(xg_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

xg_scores = xg_search.cv_results_
best_results = pd.DataFrame(xg_scores).iloc[xg_search.best_index_]
score_reporter(best_results)

# Delete the temporary cache before exiting
memory.clear(warn=False)
rmtree(location)

In [None]:
# LightGBM
feature_selection_clf = XGBClassifier(n_estimators=100, max_depth=4, verbose=None, eval_metric='logloss', learning_rate=0.1, tree_method="gpu_hist")

location = "cache"
memory = Memory(location=location, verbose=10)
lg_pipe = Pipeline([
    ('rfe', RFE(estimator=feature_selection_clf,
                n_features_to_select=14)),
    ('clf', lightgbm.LGBMClassifier(learning_rate=0.1, 
                                    random_state=42))
], memory=memory)

params = {
    "clf__n_estimators": np.arange(200, 550, 50),
    "clf__max_depth": np.arange(3, 7)
}

moving_acc = []

start_time = timeit.default_timer() #defines start time so computational time can be calculated

lg_search = GridSearchCV(lg_pipe, params, scoring=scorer, refit="accuracy", cv=splits, verbose=3)
lg_search.fit(X, y)

moving_accs.append(moving_acc)
with open('saved_gridsearch/lg_search.pickle', 'wb') as handle:
    pickle.dump(lg_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

lg_scores = lg_search.cv_results_
best_results = pd.DataFrame(lg_scores).iloc[lg_search.best_index_]
score_reporter(best_results)

# Delete the temporary cache before exiting
memory.clear(warn=False)
rmtree(location)