In [1]:
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
import pandas as pd
import cudf as cd
import numpy as np

import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneGroupOut

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.inspection import permutation_importance

import catboost as cb
from catboost import CatBoostClassifier

from cuml.ensemble import RandomForestClassifier

In [2]:
rlist = []
records = PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-dataframes/subject_list.csv'))
with open(records) as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        rlist.append(row[0])

In [3]:
performance_dict = {
    "Model name": [],
    "Avg Accuracy": [],
    "Std Accuracy": [],
    "Sensitivity": [],
    "Specificity": [],
    "Precision": [],
    "F1 score": [],
    "Run time": [],
    "TPS": []
}

moving_accuracy = {}

In [4]:
def score_reporter(scores):
    print('---Run time of each fold: \n {}'.format(scores['test_elapsed'].tolist()))
    print("Avg run time: {}".format(scores['test_elapsed'].mean()))
    print('---Run time per subset of each fold is: \n {}'.format(scores['test_eps'].tolist()))
    print("Avg run time per subset: {}".format(np.mean(scores['test_eps'].mean())))
    print()
    print('Accuracy of each fold: \n {}'.format(scores['test_accuracy'].tolist()))
    print("Avg accuracy: {}".format(scores['test_accuracy'].mean()))
    print('Std of accuracy : \n{}'.format(scores['test_accuracy'].std()))
    print()
    print('Specificity of each fold: \n {}'.format(scores['test_specificity'].tolist()))
    print("Avg specificity: {}".format(scores['test_specificity'].mean()))
    print('Std of specificity: \n{}'.format(scores['test_specificity'].std()))
    print()
    print('Sensitivity of each fold: \n {}'.format(scores['test_sensitivity'].tolist()))
    print("Avg sensitivity: {}".format(scores['test_sensitivity'].mean()))
    print('Std of sensitivity: \n{}'.format(scores['test_sensitivity'].std()))
    print()
    print('Precision of each fold: \n {}'.format(scores['test_precision'].tolist()))
    print("Avg precision: {}".format(scores['test_precision'].mean()))
    print('Std of precision : \n{}'.format(scores['test_precision'].std()))
    print()
    print('F1-scores of each fold: \n {}'.format(scores['test_f1_score'].tolist()))
    print("Avg F1-scores: {}".format(scores['test_f1_score'].mean()))
    print('Std of F1-scores : \n{}'.format(scores['test_f1_score'].std()))

In [5]:
def cv_scorer(clf, X, y):
    global moving_acc
    global importances

    start_time = timeit.default_timer()
    y_pred = clf.predict(X)
    elapsed = timeit.default_timer() - start_time

    total_seen = 0
    total_correct = 0
    subject_acc = []
    for idx, pred in enumerate(y_pred):
        total_seen+=1
        if pred==y.iloc[idx]:
            total_correct+=1
        subject_acc.append(total_correct/total_seen)
    moving_acc.append(subject_acc)

    fold_size = len(X)

    cm = confusion_matrix(y, y_pred)

    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])
    f1_score = (2*precision*sensitivity)/(precision+sensitivity)

    importances.append(permutation_importance(clf, X, y, scoring='accuracy', random_state=12))

    return {'accuracy': accuracy_score(y, y_pred), 
            'sensitivity': sensitivity, 'specificity': specificity,
            'precision': precision, 'f1_score': f1_score,
            #'auc_score': roc_auc_score(y, clf.predict_proba(X)[:, 1]),
            'elapsed': elapsed, 'eps': elapsed/fold_size}

In [6]:
rr_int_dfs = {}
for record in tqdm(rlist): 
    rr_int_dfs[record] = cd.read_parquet(str(Path(os.getcwd()).parents[1]) + '/mit-bih-rr-intervals/'+record+'.parquet')

  0%|          | 0/23 [00:00<?, ?it/s]

In [7]:
def find_proportions(groups, window_size):
    series_list = []

    for i in range(window_size):
        row = groups[['rr_int', 'rmean']].nth(i).to_pandas()
        conditions = [
            row['rr_int'] <= 0.85*row['rmean'],
            (row['rr_int'] > 0.85*row['rmean']) & (row['rr_int'] < 1.15*row['rmean']),
            row['rr_int'] >= 1.15*row['rmean']
        ]
        choices = [
            'short',
            'regular',
            'long'
        ]
        row['int_type'] = np.select(conditions, choices)
        series_list.append(row)
    
    transition_matrix = pd.DataFrame(data={
                                            'StoS': np.zeros(len(series_list[0])), 
                                            'StoR': np.zeros(len(series_list[0])), 
                                            'StoL': np.zeros(len(series_list[0])), 
                                            'RtoS': np.zeros(len(series_list[0])), 
                                            'RtoR': np.zeros(len(series_list[0])), 
                                            'RtoL': np.zeros(len(series_list[0])), 
                                            'LtoS': np.zeros(len(series_list[0])), 
                                            'LtoR': np.zeros(len(series_list[0])), 
                                            'LtoL': np.zeros(len(series_list[0]))})

    for i in range(len(series_list)-1):
        temp_matrix = pd.DataFrame()
        conditions = [
            (series_list[i]['int_type']=='short') & (series_list[i+1]['int_type']=='short'),
            (series_list[i]['int_type']=='short') & (series_list[i+1]['int_type']=='regular'),
            (series_list[i]['int_type']=='short') & (series_list[i+1]['int_type']=='long'),
            (series_list[i]['int_type']=='regular') & (series_list[i+1]['int_type']=='short'),
            (series_list[i]['int_type']=='regular') & (series_list[i+1]['int_type']=='regular'),
            (series_list[i]['int_type']=='regular') & (series_list[i+1]['int_type']=='long'),
            (series_list[i]['int_type']=='long') & (series_list[i+1]['int_type']=='short'),
            (series_list[i]['int_type']=='long') & (series_list[i+1]['int_type']=='regular'),
            (series_list[i]['int_type']=='long') & (series_list[i+1]['int_type']=='long')
        ]
        choices = [
            'StoS',
            'StoR',
            'StoL',
            'RtoS',
            'RtoR',
            'RtoL',
            'LtoS',
            'LtoR',
            'LtoL'
        ]
        transition_types = pd.Series(data=np.select(conditions, choices, default=cd.NA))
        temp_matrix['StoS'] = np.where(transition_types=='StoS', 1, 0)
        temp_matrix['StoR'] = np.where(transition_types=='StoR', 1, 0)
        temp_matrix['StoL'] = np.where(transition_types=='StoL', 1, 0)
        temp_matrix['RtoS'] = np.where(transition_types=='RtoS', 1, 0)
        temp_matrix['RtoR'] = np.where(transition_types=='RtoR', 1, 0)
        temp_matrix['RtoL'] = np.where(transition_types=='RtoL', 1, 0)
        temp_matrix['LtoS'] = np.where(transition_types=='LtoS', 1, 0)
        temp_matrix['LtoR'] = np.where(transition_types=='LtoR', 1, 0)
        temp_matrix['LtoL'] = np.where(transition_types=='LtoL', 1, 0)
        transition_matrix = transition_matrix.add(temp_matrix)

    count = len(series_list) - 1
    
    return cd.from_pandas(transition_matrix / count)

In [8]:
def feature_calc(record, rr_int_df, window_size=4):
    rr_int_df.drop(rr_int_df.tail(len(rr_int_df) % window_size).index, inplace=True)
    subsets = rr_int_df.groupby(rr_int_df.index // window_size, sort=True)

    feature_df = cd.DataFrame(data={'rhythm': subsets['rhythm'].nth(0)})
    feature_df['mappedLabel'] = feature_df['rhythm'].map({'N': 'Non-Afib', 'A': 'Afib', 'O': 'Non-Afib'})
    feature_df['subjectID'] = str(record)

    feature_df['std'] = subsets['rr_int'].agg('std')
    feature_df['cov'] = feature_df['std'] / subsets['rr_int'].agg('mean')
    feature_df['range'] = subsets['rr_int'].agg('max') - subsets['rr_int'].agg('min')
    feature_df['rrInt_var'] = subsets['rr_int'].agg('var')
    feature_df['rmean_var'] = subsets['rmean'].agg('var')
    feature_df['rmssd'] = np.sqrt(subsets['sqr_diff'].agg('sum') / subsets['sqr_diff'].agg('count'))
    feature_df['mad'] = subsets['diff'].agg('median')
    feature_df['iqr'] = subsets['rr_int'].quantile(0.75) - subsets['rr_int'].quantile(0.25)

    feature_df = cd.concat([feature_df, find_proportions(subsets, window_size)], axis='columns')

    return feature_df

In [16]:
feature_dfs = {}
for record in tqdm(rlist):
    feature_dfs[record] = feature_calc(record, rr_int_dfs[record], window_size=10)

combined_features = cd.concat([feature_dfs[key][1:] for key in feature_dfs])

  0%|          | 0/23 [00:00<?, ?it/s]

In [19]:
X = combined_features[['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'rmssd', 'iqr', 'mad', 'cov', 'range', 'std', 'rrInt_var', 'rmean_var']].to_pandas()
y = combined_features['mappedLabel'].map({"Non-Afib": 0, "Afib": 1}).to_pandas()
groups = combined_features['subjectID'].astype('int64').to_pandas()

logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [20]:
# CatBoost
moving_accuracy['catboost'] = []

model = CatBoostClassifier(learning_rate=0.1, 
                            loss_function='Logloss', 
                            logging_level='Silent', 
                            max_depth=8, 
                            iterations=400, 
                            task_type="GPU")

moving_acc = []
importances = []
scores = cross_validate(model, X, y, scoring=cv_scorer, cv=splits, return_estimator=True)
moving_accuracy['catboost'] = moving_acc

total_importances_mean = 0
total_importances_std = 0
for importance in importances:
    total_importances_mean+=importance.importances_mean
    total_importances_std+=importance.importances_std

avg_importances_mean = total_importances_mean/len(importances)
avg_importances_std = total_importances_std/len(importances)

for i in avg_importances_mean.argsort()[::-1]:
    print(f"{X.columns[i]:<8}"
            f"{avg_importances_mean[i]:.3f}"
            f" +/- {avg_importances_std[i]:.3f}")

  sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
  sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])


cov     0.152 +/- 0.004
mad     0.040 +/- 0.002
iqr     0.037 +/- 0.002
std     0.025 +/- 0.002
range   0.019 +/- 0.002
rrInt_var0.016 +/- 0.001
rmean_var0.011 +/- 0.002
rmssd   0.009 +/- 0.001
StoL    0.002 +/- 0.001
RtoR    0.001 +/- 0.001
RtoS    0.001 +/- 0.001
StoR    0.001 +/- 0.001
StoS    0.001 +/- 0.000
LtoS    0.000 +/- 0.001
RtoL    0.000 +/- 0.001
LtoR    0.000 +/- 0.001


In [41]:
score_reporter(scores)

---Run time of each fold: 
 [0.0035251160006737337, 0.002734863999648951, 0.002243705999717349, 0.00219242500315886, 0.002704553000512533, 0.0027656940001179464, 0.0026425020005262922, 0.0019376609998289496, 0.0023025759983283933, 0.002214964999438962, 0.002729252999415621, 0.0020819430028495844, 0.002730053001869237, 0.0020235320007486735, 0.0026417219996801578, 0.00256374099990353, 0.0019542609989002813, 0.0021184429970162455, 0.002741963999142172, 0.0022499449987662956, 0.002641451999807032, 0.0021322729990060907, 0.0028213650002726354]
Avg run time: 0.0024649569130143273
---Run time per subset of each fold is: 
 [8.013448512556794e-07, 4.418197091516884e-07, 5.623323307562279e-07, 5.117705422873156e-07, 5.650967406001949e-07, 4.479582118752748e-07, 4.927283237975558e-07, 5.269679085746396e-07, 4.6180826280152295e-07, 4.866985276728108e-07, 4.950576817369166e-07, 5.979158537764458e-07, 4.948437560031243e-07, 5.152869877129293e-07, 4.38460082934466e-07, 4.540005312384505e-07, 5.35854

In [44]:
# bagging
moving_accuracy['bagging'] = []

model = RandomForestClassifier(max_features=len(X.columns), random_state=2, max_depth=11, n_estimators=200)

moving_acc = []
importances = []
scores = cross_validate(model, X, y, scoring=cv_scorer, cv=splits, return_estimator=True)
moving_accuracy['bagging'] = moving_acc

print(importances)

  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
  sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
  return func(**kwargs)
  ret_val = func

[{'importances_mean': array([ 6.36508297e-04,  1.10479654e-02,  1.09115708e-03,  2.00045465e-03,
       -4.54648784e-04,  8.00181860e-03,  4.54648784e-05,  4.09183905e-04,
       -1.27301659e-03,  5.50579677e-02, -4.61923164e-02, -4.59649920e-02,
        7.18345078e-02]), 'importances_std': array([4.63652604e-04, 1.06236158e-03, 9.09297568e-05, 9.09297568e-05,
       2.49021395e-04, 1.30508753e-03, 9.09297568e-05, 1.70113998e-04,
       1.11365753e-04, 2.60144933e-03, 2.31558656e-03, 2.00458356e-03,
       1.81859514e-03]), 'importances': array([[-0.00022732,  0.00068197,  0.00068197,  0.00113662,  0.0009093 ],
       [ 0.00977495,  0.01045692,  0.0111389 ,  0.01091157,  0.01295749],
       [ 0.00113662,  0.00113662,  0.00113662,  0.0009093 ,  0.00113662],
       [ 0.00204592,  0.00204592,  0.00204592,  0.0018186 ,  0.00204592],
       [-0.00068197,  0.        , -0.00045465, -0.00045465, -0.00068197],
       [ 0.00863833,  0.00704706,  0.00659241,  0.0075017 ,  0.0102296 ],
       [ 0.

In [45]:
score_reporter(scores)

---Run time of each fold: 
 [0.0457986469991738, 0.06670406099874526, 0.05277312799807987, 0.051537940998969134, 0.07501697199768387, 0.08279213399873697, 0.06163913800264709, 0.06280033399889362, 0.07886535700163222, 0.05319247300212737, 0.05031529200277873, 0.05031699199753348, 0.04871433800144587, 0.04956437999862828, 0.0321634179999819, 0.034640803998627234, 0.06629629300005035, 0.05011633799949777, 0.026041927998448955, 0.03793152100115549, 0.08941016799872159, 0.05342815499898279, 0.05422295700191171]
Avg run time: 0.055403598651932756
---Run time per subset of each fold is: 
 [1.041114957926206e-05, 1.0776100322899073e-05, 1.322634786919295e-05, 1.2030331699105774e-05, 1.567425240235768e-05, 1.340980466451846e-05, 1.1493406302936245e-05, 1.707923143837194e-05, 1.5817360008349823e-05, 1.1688084597259366e-05, 9.126662797529246e-06, 1.445060080342719e-05, 8.829860069140088e-06, 1.2621436210498671e-05, 5.338326639001146e-06, 6.1343729411417095e-06, 1.817830902112705e-05, 1.156619847