In [1]:
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from pathlib import Path, PurePath
import csv
import pandas as pd
import numpy as np
import pickle
from joblib import Memory, Parallel, delayed
from shutil import rmtree
from collections import defaultdict

import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import RFECV

from sklearn.base import clone

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import catboost as cb
from catboost import CatBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm

In [2]:
# Subject list: one record identifier per line, stored two directory levels
# above the working directory.
records = PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-dataframes/subject_list.csv'))
# newline='' is what the csv module asks for so that it can do its own
# line-ending handling.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # csv.reader yields [] for blank lines; skip them instead of failing on row[0].
    rlist = [row[0] for row in recordreader if row]

In [3]:
# Per-record feature tables live two directory levels above the working directory.
project_root = str(Path(os.getcwd()).parents[1])
feature_dfs = {
    record: pd.read_parquet(f"{project_root}/mit-bih-time-features/{record}.parquet")
    for record in tqdm(rlist)
}

# Stack every record into a single frame, skipping the first row of each table.
combined_features = pd.concat([frame[1:] for frame in feature_dfs.values()])

  0%|          | 0/23 [00:00<?, ?it/s]

In [4]:
# Columns used as model inputs.
FEATURE_COLUMNS = [
    'StoS', 'StoR', 'StoL',
    'RtoS', 'RtoR', 'RtoL',
    'LtoS', 'LtoR', 'LtoL',
    'std', 'cov', 'range',
    'rrInt_var', 'rmean_var', 'rmssd',
    'mad', 'iqr',
    'drrmean', 'drrvar',
]
# Integer encoding of the target: Afib is class 1.
LABEL_CODES = {"Non-Afib": 0, "Afib": 1}

X = combined_features[FEATURE_COLUMNS]
y = combined_features['mappedLabel'].map(LABEL_CODES)
groups = combined_features['subjectID'].astype('int64')

# Outer cross-validation: each subject is held out once.
logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [5]:
# Silence numpy floating-point warnings (e.g. 0/0 when a metric is undefined).
np.seterr(all='ignore')
moving_accs = []

# Output directory for pickled grid-search results; exist_ok makes the cell
# safe to re-run and avoids the check-then-create race.
os.makedirs('saved_gridsearch', exist_ok=True)

In [6]:
def _feature_selection_summary(scores, num_splits):
    """Collect the per-fold RFECV details stored for one parameter combination.

    Returns ``(n_features_selected, feature_rankings, feature_names)``, or
    ``None`` when the results carry neither the flattened
    ``split{i}_ranking`` / ``split{i}_n_features`` / ``split{i}_feature_names_in``
    fields nor fitted selector objects under ``split{i}_rfecv``.
    """
    if not num_splits:
        return None

    folds = range(num_splits)
    # .get() is used on purpose: indexing a defaultdict with a missing key
    # would insert an empty list and hide the fact that nothing was recorded.
    if all(scores.get(f"split{i}_ranking") for i in folds):
        n_features_selected = [scores[f"split{i}_n_features"] for i in folds]
        feature_rankings = [scores[f"split{i}_ranking"] for i in folds]
        feature_names = scores["split0_feature_names_in"][0]
        return n_features_selected, feature_rankings, feature_names

    if all(scores.get(f"split{i}_rfecv") for i in folds):
        # Results that stored the fitted selector itself: read the same
        # attributes from it, keeping the one-element-list-per-fold layout.
        selectors = [scores[f"split{i}_rfecv"][0] for i in folds]
        n_features_selected = [[sel.n_features_] for sel in selectors]
        feature_rankings = [[sel.ranking_] for sel in selectors]
        feature_names = selectors[0].feature_names_in_
        return n_features_selected, feature_rankings, feature_names

    return None


def score_reporter(results):
    """Print the mean accuracy of every parameter combination and a detailed
    per-fold report for the combination with the highest mean accuracy.

    Parameters
    ----------
    results : dict
        Maps a parameter-combination key to a dict holding ``'folds'`` (number
        of folds) and, for each fold ``i``, ``split{i}_accuracy``,
        ``split{i}_sensitivity``, ``split{i}_specificity``,
        ``split{i}_precision`` and ``split{i}_f1_score``. Feature-selection
        details are optional (see ``_feature_selection_summary``).

    Raises
    ------
    ValueError
        If ``results`` is empty or no combination has a comparable (non-NaN)
        mean accuracy.
    """
    bestParams = None
    # Start below any attainable accuracy so a combination is still selected
    # when every mean accuracy is 0.
    maxScore = -np.inf
    for params, scores in results.items():
        num_splits = scores['folds']
        mean_accuracy = np.mean([scores[f"split{i}_accuracy"] for i in range(num_splits)])
        print(params, mean_accuracy)

        # NaN never compares greater, so combinations with undefined accuracy are skipped.
        if mean_accuracy > maxScore:
            bestParams = params
            maxScore = mean_accuracy

    if bestParams is None:
        raise ValueError("score_reporter: no parameter combination with a valid mean accuracy")

    bestScores = results[bestParams]
    num_splits = bestScores['folds']
    accuracy = [bestScores[f"split{i}_accuracy"] for i in range(num_splits)]
    sensitivity = [bestScores[f"split{i}_sensitivity"] for i in range(num_splits)]
    specificity = [bestScores[f"split{i}_specificity"] for i in range(num_splits)]
    precision = [bestScores[f"split{i}_precision"] for i in range(num_splits)]
    f1_score = [bestScores[f"split{i}_f1_score"] for i in range(num_splits)]

    print(f"The best parameters were {bestParams}")
    print(f"Accuracy for each fold: {accuracy}")
    print(f"Mean accuracy: {np.mean(accuracy)}")
    print(f"Std accuracy: {np.std(accuracy)}")
    print(f"Sensitivity for each fold: {sensitivity}")
    print(f"Mean sensitivity: {np.mean(sensitivity)}")
    print(f"Std sensitivity: {np.std(sensitivity)}")
    print(f"Specificity for each fold: {specificity}")
    print(f"Mean specificity: {np.mean(specificity)}")
    print(f"Std specificity: {np.std(specificity)}")
    print(f"Precision for each fold: {precision}")
    print(f"Mean precision: {np.mean(precision)}")
    print(f"Std precision: {np.std(precision)}")
    print(f"F1-score for each fold: {f1_score}")
    print(f"Mean F1-score: {np.mean(f1_score)}")
    print(f"Std F1-score: {np.std(f1_score)}")

    selection = _feature_selection_summary(bestScores, num_splits)
    if selection is None:
        print("Feature selection details were not recorded for these results.")
        return

    n_features_selected, feature_rankings, feature_names = selection
    # Average each feature's RFECV rank over the folds.
    avg_rankings = np.mean(np.array(feature_rankings), axis=0)
    mapped_rankings = {name: rank for name, rank in zip(feature_names, avg_rankings.flatten())}

    print(f"Number of features selected in each fold: {n_features_selected}")
    print(f"Mean number of features selected: {np.mean(n_features_selected)}")
    print("Average feature ranking: ")
    print(mapped_rankings)

In [7]:
def fit_transform_cacheable(transformer, X, y, **fit_params):
    """Fit ``transformer`` on ``(X, y)`` and return ``(transformed_X, transformer)``.

    Uses the transformer's own ``fit_transform`` when it has one, otherwise
    calls ``fit`` followed by ``transform``. The transformer is returned
    alongside the data so that a cached call hands back the fitted object too.
    """
    if not hasattr(transformer, "fit_transform"):
        fitted = transformer.fit(X, y, **fit_params)
        return fitted.transform(X), transformer

    return transformer.fit_transform(X, y, **fit_params), transformer

In [8]:
# Directory handed to joblib.Memory by the grid-search cells below; cached RFECV
# fits are stored here.
location = "cache"
# Uncomment to delete the cache directory and force every cached fit to be recomputed.
#rmtree(location)

In [9]:
# XGBoost
# Disk-backed memoisation of the RFECV fit: the selector does not depend on the
# grid-search parameters, so each outer fold only has to be fitted once.
memory = Memory(location=location, verbose=0)
fit_transform_cached = memory.cache(fit_transform_cacheable)

# Fixed-configuration booster used only as the RFECV base estimator.
feature_selection_clf = XGBClassifier(
    tree_method="gpu_hist",
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    eval_metric='logloss',
    verbosity=0,
)

def fit_xgboost_parallel(X, y, train, test, feature_selection_clf, **fit_params):
    """Evaluate one outer cross-validation fold: RFECV feature selection, then XGBoost.

    Reads the notebook-level ``groups`` series and the ``fit_transform_cached``
    helper in addition to its arguments.

    Parameters
    ----------
    X, y : pandas objects, indexed positionally with ``.iloc``.
    train, test : positional indices of the fold's training and held-out rows.
    feature_selection_clf : estimator that is cloned and used as the RFECV base model.
    **fit_params : extra keyword arguments for the final ``XGBClassifier``.

    Returns
    -------
    dict
        Keys ``accuracy``, ``sensitivity``, ``specificity``, ``precision``,
        ``f1_score``, ``n_features``, ``ranking``, ``feature_names_in`` and
        ``feature_importances``. The rate metrics treat label 1 as the positive
        class; a rate whose denominator is zero is NaN rather than an error.
    """
    # Re-applied here because this function is executed by joblib workers.
    np.seterr(all='ignore')

    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]

    # Inner leave-one-group-out CV, restricted to the groups in the training part.
    train_groups = groups.iloc[train]
    rfecv_splits = list(LeaveOneGroupOut().split(X_train, y_train, groups=train_groups))
    rfecv = RFECV(estimator=clone(feature_selection_clf),
                  cv=rfecv_splits,
                  step=2,
                  scoring="accuracy",
                  n_jobs=1)

    # Cached: identical (selector, X_train, y_train) inputs reuse an earlier fit.
    X_train, rfecv = fit_transform_cached(rfecv, X_train, y_train)

    # NOTE(review): `verbose` is not a documented XGBClassifier argument;
    # `verbosity` is probably what was intended -- confirm before changing.
    clf = XGBClassifier(learning_rate = 0.1,
                        verbose=None,
                        eval_metric='logloss',
                        tree_method='gpu_hist',
                        **fit_params)
    clf.fit(X_train, y_train)

    pred_values = clf.predict(rfecv.transform(X_test))

    # labels=[0, 1] guarantees a 2x2 matrix even when the held-out fold (or the
    # predictions) contain a single class. scikit-learn lays the matrix out as
    # [[TN, FP], [FN, TP]] for labels [0, 1].
    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)

    results_dict = {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2*precision*sensitivity)/(precision+sensitivity),
        "n_features": rfecv.n_features_,
        "ranking": rfecv.ranking_,
        "feature_names_in": rfecv.feature_names_in_,
        "feature_importances": clf.feature_importances_
    }

    return results_dict

# Hyper-parameter grid for the final XGBoost classifier.
params = dict(
    n_estimators=np.arange(100, 1000, 50),
    max_depth=np.arange(2, 11),
)
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')

logo = LeaveOneGroupOut()
results = {}

# Entries of the per-fold dict returned by fit_xgboost_parallel, in storage order.
FOLD_FIELDS = (
    "accuracy",
    "sensitivity",
    "specificity",
    "precision",
    "f1_score",
    "n_features",
    "ranking",
    "feature_names_in",
    "feature_importances",
)

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        # One job per outer fold.
        fold_results = Parallel(n_jobs=8, max_nbytes=1e6)(
            delayed(fit_xgboost_parallel)(X, y, train, test, feature_selection_clf, **fit_params)
            for (train, test) in splits
        )

        # Flatten the fold dicts into "split{i}_{field}" entries.
        current_results = defaultdict(list)
        for i, result in enumerate(fold_results):
            for field in FOLD_FIELDS:
                current_results[f"split{i}_{field}"].append(result[field])
        current_results['folds'] = len(splits)

        # A sorted tuple of items is a hashable, order-independent key for the combination.
        results[tuple(sorted(fit_params.items()))] = current_results

# Persist the complete grid-search results.
with open('saved_gridsearch/xg_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Delete the temporary cache before exiting
# memory.clear(warn=False)
# rmtree(location)

Fitting with 162 different parameter combinations


  0%|          | 0/162 [00:00<?, ?it/s]

In [10]:
# Print the mean accuracy of every XGBoost parameter combination, followed by the
# per-fold report for the best one.
score_reporter(results)

(('max_depth', 2), ('n_estimators', 100)) 0.9280052424651943
(('max_depth', 2), ('n_estimators', 150)) 0.929648653804492
(('max_depth', 2), ('n_estimators', 200)) 0.9288538957476153
(('max_depth', 2), ('n_estimators', 250)) 0.9312140488056105
(('max_depth', 2), ('n_estimators', 300)) 0.9323368059767886
(('max_depth', 2), ('n_estimators', 350)) 0.9339229946391171
(('max_depth', 2), ('n_estimators', 400)) 0.9346330812404043
(('max_depth', 2), ('n_estimators', 450)) 0.9350308155131625
(('max_depth', 2), ('n_estimators', 500)) 0.9347088120475971
(('max_depth', 2), ('n_estimators', 550)) 0.9361975191585422
(('max_depth', 2), ('n_estimators', 600)) 0.9348457657852421
(('max_depth', 2), ('n_estimators', 650)) 0.935576942238135
(('max_depth', 2), ('n_estimators', 700)) 0.9355685567017306
(('max_depth', 2), ('n_estimators', 750)) 0.9355800313093569
(('max_depth', 2), ('n_estimators', 800)) 0.9355916172904685
(('max_depth', 2), ('n_estimators', 850)) 0.9356369215148763
(('max_depth', 2), ('n_est

In [None]:
# CatBoost
# Disk cache for the RFECV fits (verbose joblib logging enabled for this run).
memory = Memory(location=location, verbose=10)
fit_transform_cached = memory.cache(fit_transform_cacheable)

# Feature selection is still driven by an XGBoost model; only the final
# classifier in this section is CatBoost.
feature_selection_clf = XGBClassifier(
    tree_method="gpu_hist",
    learning_rate=0.1,
    n_estimators=75,
    max_depth=3,
    eval_metric='logloss',
    verbosity=2,
)

# Hyper-parameter grid for the final CatBoost classifier.
params = dict(
    iterations=np.arange(350, 550, 25),
    depth=np.arange(3, 7),
)
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')

logo = LeaveOneGroupOut()
results = {}
# Grid search: for every parameter combination, run the outer leave-one-group-out
# folds sequentially (RFECV feature selection, then a CatBoost classifier).
for fit_params in param_grid:
    current_results = defaultdict(list)
    
    print(f'Fitting parameter combination: {fit_params}')
    Truth = []
    Output = []

    for i, (train, test) in enumerate(splits):
        X_train = X.iloc[train]
        y_train = y.iloc[train]

        X_test = X.iloc[test]
        y_test = y.iloc[test]

        rfe_start_time = timeit.default_timer()
        # Inner leave-one-group-out CV, restricted to the groups in the training part.
        train_groups = groups.iloc[train]
        rfecv_splits = list(logo.split(X_train, y_train, groups=train_groups))
        rfecv = RFECV(estimator=feature_selection_clf, 
                      cv=rfecv_splits,
                      n_jobs=-1)
        
        # Cached: identical (selector, X_train, y_train) inputs reuse an earlier fit.
        X_train, rfecv = fit_transform_cached(rfecv, X_train, y_train)
        print(f'RFECV fitting took {timeit.default_timer()-rfe_start_time} seconds')
        print(f'{X_train.shape} shape of selected X_train')

        clf_start_time = timeit.default_timer()
        clf = CatBoostClassifier(
                        learning_rate=0.1,
                        loss_function='Logloss',
                        task_type="GPU",
                        silent=True,
                        **fit_params)
        print('classifier created, fitting now')
        clf.fit(X_train, y_train)
        print(f'classifier fitting took {timeit.default_timer()-clf_start_time} seconds')

        pred_values = clf.predict(rfecv.transform(X_test))

        # labels=[0, 1] guarantees a 2x2 matrix even when the held-out fold (or
        # the predictions) contain a single class. scikit-learn lays the matrix
        # out as [[TN, FP], [FN, TP]], so the rates below use label 1 as the
        # positive class.
        cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        sensitivity = tp/(tp+fn)
        specificity = tn/(tn+fp)
        precision = tp/(tp+fp)
        f1_score = (2*precision*sensitivity)/(precision+sensitivity)
        acc = accuracy_score(y_test, pred_values)

        current_results[f"split{i}_accuracy"].append(acc)
        current_results[f"split{i}_sensitivity"].append(sensitivity)
        current_results[f"split{i}_specificity"].append(specificity)
        current_results[f"split{i}_precision"].append(precision)
        current_results[f"split{i}_f1_score"].append(f1_score)

        # Feature-selection details read by score_reporter for its ranking summary.
        current_results[f"split{i}_n_features"].append(rfecv.n_features_)
        current_results[f"split{i}_ranking"].append(rfecv.ranking_)
        current_results[f"split{i}_feature_names_in"].append(rfecv.feature_names_in_)

        print(f"split {i} complete - accuracy={acc}, sensitivity={sensitivity}, specificity={specificity}, precision={precision}, f1_score={f1_score}")

        # The fitted objects themselves are kept as well.
        current_results[f"split{i}_rfecv"].append(rfecv)
        current_results[f"split{i}_clf"].append(clf)
    results[tuple(sorted(fit_params.items()))] = current_results
    results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

# Persist the complete CatBoost grid-search results.
with open('saved_gridsearch/cb_results.pickle', 'wb') as out_file:
    pickle.dump(results, out_file, pickle.HIGHEST_PROTOCOL)

print(results)

# Delete the temporary cache before exiting: empty the joblib store first,
# then remove the directory itself.
memory.clear(warn=False)
rmtree(location)

In [None]:
# Report on the CatBoost grid search (`results` was rebuilt by the previous cell).
score_reporter(results)