In [1]:
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from pathlib import Path, PurePath
import csv
import pandas as pd
import numpy as np
import scipy.stats as stats
import pickle
from joblib import Parallel, delayed, dump, load
from shutil import rmtree
from collections import defaultdict

import timeit

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.feature_selection import RFECV

from sklearn.base import clone

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from cuml.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from cuml.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import catboost as cb
from catboost import CatBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier

import lightgbm

In [2]:
rlist = []
records = PurePath(Path(os.getcwd()).parents[1], Path('mit-bih-dataframes/subject_list.csv'))
with open(records) as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        rlist.append(row[0])

In [3]:
def classify_rr_ints(df):
    #list of types of rr_ints for each subject
    subject_types = []
    for row in df.itertuples():
        if row.rrInt < 0.85*row.rmean: 
            #if rr_ints is less than 85% of runningmean
            #label subject type as short
            subject_types.append('short')
        elif row.rrInt > 1.15*row.rmean: 
            #if rr_ints is greater than 115% of runningmean
            #label subject type as long
            subject_types.append('long')
        else:
            #label subject type as regular
            subject_types.append('regular')
    
    return subject_types

In [4]:
def find_proportions(int_types):
    StoS = 0
    StoR = 0
    StoL = 0
    RtoS = 0
    RtoR = 0
    RtoL = 0
    LtoS = 0
    LtoR = 0
    LtoL = 0
    for idx in range(len(int_types)):
        if idx<len(int_types)-1:
            if int_types[idx]=='short' and int_types[idx+1]=='short':
                StoS+=1
            elif int_types[idx]=='short' and int_types[idx+1]=='regular':
                StoR+=1
            elif int_types[idx]=='short' and int_types[idx+1]=='long':
                StoL+=1
            elif int_types[idx]=='regular' and int_types[idx+1]=='short':
                RtoS+=1
            elif int_types[idx]=='regular' and int_types[idx+1]=='regular':
                RtoR+=1
            elif int_types[idx]=='regular' and int_types[idx+1]=='long':
                RtoL+=1
            elif int_types[idx]=='long' and int_types[idx+1]=='short':
                LtoS+=1
            elif int_types[idx]=='long' and int_types[idx+1]=='regular':
                LtoR+=1
            elif int_types[idx]=='long' and int_types[idx+1]=='long':
                LtoL+=1
    
    count = len(int_types)-1
    subject_transitions = [StoS/count, StoR/count, StoL/count, RtoS/count, RtoR/count, RtoL/count, LtoS/count, LtoR/count, LtoL/count]
    
    return subject_transitions

In [5]:
def extract_rmssd(subset):
    rrInts = subset['rrInt'].tolist()
    sum_of_squares = 0
    for idx, rrInt in enumerate(rrInts):
        if idx<len(rrInts)-1:
            square_difference = (rrInt-rrInts[idx-1])**2
            sum_of_squares+=square_difference
    mean_sum = sum_of_squares/(len(rrInts)-1)
    return np.sqrt(mean_sum)

In [6]:
def subset_features(subset_list, current_weight = 0.25, prev_weight = 0.75):
    subset_dfs = {}
    for x, subset in enumerate(subset_list.itertuples()):
        subset_dfs[x] = pd.read_parquet(os.path.normpath(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-subsets/'+str(subset.subjectID)+'/'+str(subset.subjectID)+"-"+str(x)+".parquet"))

    calib_df = subset_dfs[0]

    feature_dict = {}

    props = find_proportions(classify_rr_ints(calib_df))
    feature_dict['StoS'] = [props[0]]
    feature_dict['StoR'] = [props[1]]
    feature_dict['StoL'] = [props[2]]
    feature_dict['RtoS'] = [props[3]]
    feature_dict['RtoR'] = [props[4]]
    feature_dict['RtoL'] = [props[5]]
    feature_dict['LtoS'] = [props[6]]
    feature_dict['LtoR'] = [props[7]]
    feature_dict['LtoL'] = [props[8]]

    feature_dict['std'] = [np.std(calib_df['rrInt'])]
    feature_dict['cov'] = [feature_dict['std'][0]/np.mean(calib_df['rrInt'])]
    feature_dict['range'] = [np.max(calib_df['rrInt'])-np.min(calib_df['rrInt'])]
    #feature_dict['rmean'] = df['rmean'].tolist()
    #feature_dict['rrv'] = df['rr_variance'].tolist()
    feature_dict['rrInt_var'] = [calib_df['rrInt'].var()]
    feature_dict['rmean_var'] = [calib_df['rmean'].var()]
    feature_dict['rmssd'] = [extract_rmssd(calib_df)]
    feature_dict['mad'] = [stats.median_abs_deviation(calib_df['rrInt'])]
    feature_dict['iqr'] = [stats.iqr(calib_df['rrInt'])]

    drr = np.diff(calib_df['rrInt'])
    feature_dict['drrmean'] = [np.mean(drr)]
    feature_dict['drrvar'] = [np.var(drr)]
    
    for key in subset_dfs:
        if key>0:
            current_subset = subset_dfs[key]
            props = find_proportions(classify_rr_ints(current_subset))
            feature_dict['StoS'].append(props[0]*current_weight + feature_dict['StoS'][key-1]*prev_weight)
            feature_dict['StoR'].append(props[1]*current_weight + feature_dict['StoR'][key-1]*prev_weight)
            feature_dict['StoL'].append(props[2]*current_weight + feature_dict['StoL'][key-1]*prev_weight)
            feature_dict['RtoS'].append(props[3]*current_weight + feature_dict['RtoS'][key-1]*prev_weight)
            feature_dict['RtoR'].append(props[4]*current_weight + feature_dict['RtoR'][key-1]*prev_weight)
            feature_dict['RtoL'].append(props[5]*current_weight + feature_dict['RtoL'][key-1]*prev_weight)
            feature_dict['LtoS'].append(props[6]*current_weight + feature_dict['LtoS'][key-1]*prev_weight)
            feature_dict['LtoR'].append(props[7]*current_weight + feature_dict['LtoR'][key-1]*prev_weight)
            feature_dict['LtoL'].append(props[8]*current_weight + feature_dict['LtoL'][key-1]*prev_weight)

            feature_dict['std'].append(np.std(current_subset['rrInt'])*current_weight + feature_dict['std'][key-1]*prev_weight)
            feature_dict['cov'].append((feature_dict['std'][key]/np.mean(current_subset['rrInt']))*current_weight + feature_dict['cov'][key-1]*prev_weight)
            feature_dict['range'].append(np.max(current_subset['rrInt'])-np.min(current_subset['rrInt'])*current_weight + feature_dict['range'][key-1]*prev_weight)
            #feature_dict['rmean'] = df['rmean'].tolist()
            #feature_dict['rrv'] = df['rr_variance'].tolist()
            feature_dict['rrInt_var'].append(current_subset['rrInt'].var()*current_weight + feature_dict['rrInt_var'][key-1]*prev_weight)
            feature_dict['rmean_var'].append(current_subset['rmean'].var()*current_weight + feature_dict['rmean_var'][key-1]*prev_weight)
            feature_dict['rmssd'].append(extract_rmssd(current_subset)*current_weight + feature_dict['rmssd'][key-1]*prev_weight)
            feature_dict['mad'].append(stats.median_abs_deviation(current_subset['rrInt'])*current_weight + feature_dict['mad'][key-1]*prev_weight)
            feature_dict['iqr'].append(stats.iqr(current_subset['rrInt'])*current_weight + feature_dict['iqr'][key-1]*prev_weight)

            drr = np.diff(calib_df['rrInt'])
            feature_dict['drrmean'].append(np.mean(drr)*current_weight + feature_dict['drrmean'][key-1]*prev_weight)
            feature_dict['drrvar'].append(np.var(drr)*current_weight + feature_dict['drrvar'][key-1]*prev_weight)

    feature_df = pd.DataFrame(data=feature_dict)
    return pd.concat([subset_list, feature_df], axis=1)

In [7]:
current_weight = 1

In [8]:
if not os.path.exists(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-features/'):
    os.mkdir(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-features/')

subset_lists = []
for record in rlist:
    subset_lists.append(pd.read_parquet(os.path.normpath(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-subsets/'+record+'_subset_list.parquet')))
    
features_list = Parallel(n_jobs=8, verbose=12)(
    delayed(subset_features)(subset_list, current_weight, 1-current_weight)
    for subset_list in subset_lists
)

for idx, record in enumerate(rlist):
    features_list[idx].to_parquet(os.path.normpath(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-features/'+record+".parquet"))
    
print("Feature extraction complete")

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   18.5s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   19.6s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:   21.6s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:   22.0s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:   24.0s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:   28.1s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:   31.4s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:   33.1s
[Parallel(n_jobs=8)]: Done  10 out of  23 | elapsed:   40.9s remaining:   53.2s
[Parallel(n_jobs=8)]: Done  12 out of  23 | elapsed:   46.5s remaining:   42.7s
[Parallel(n_jobs=8)]: Done  14 out of  23 | elapsed:   50.2s remaining:   32.3s
[Parallel(n_jobs=8)]: Done  16 out of  23 | elapsed:   59.8s remaining:   26.2s
[Parallel(n_jobs=8)]: Done  18 out of  23 | elapsed:  1.0min remaining:   17.4s
[Parallel(n_jobs=8)]: Done  20 out of

Feature extraction complete


[Parallel(n_jobs=8)]: Done  23 out of  23 | elapsed:  1.3min finished


In [9]:
feature_dfs = {}
for record in tqdm(rlist):
    feature_dfs[record] = pd.read_parquet(str(Path(os.getcwd()).parents[1]) + '/mit-bih-time-features/'+record+'.parquet')

combined_features = pd.concat([feature_dfs[key][1:] for key in feature_dfs])

  0%|          | 0/23 [00:00<?, ?it/s]

In [10]:
X = combined_features[['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'std', 'cov', 'range', 'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr']]#, 'drrmean', 'drrvar']]
y = combined_features['mappedLabel'].map({"Non-Afib": 0, "Afib": 1})
groups = combined_features['subjectID'].astype('int64')

logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [11]:
np.seterr(all='ignore')
moving_accs = []

if os.path.exists(f'saved_results_{current_weight}')==False:
    os.mkdir(f'saved_results_{current_weight}')

In [12]:
def score_reporter(results, importances=True):
    bestParams = None
    maxScore = 0
    for params, scores in results.items():
        num_splits = scores['folds']
        accuracy = [scores[f"split{i}_accuracy"] for i in range(num_splits)]
        print(params, np.mean(accuracy))
        
        if (np.mean(accuracy) > maxScore):
            bestParams = params
            maxScore = np.mean(accuracy)
            
    bestScores = results[bestParams]
    num_splits = bestScores['folds']
    accuracy = [bestScores[f"split{i}_accuracy"] for i in range(num_splits)]
    sensitivity = [bestScores[f"split{i}_sensitivity"] for i in range(num_splits)]
    specificity = [bestScores[f"split{i}_specificity"] for i in range(num_splits)]
    precision = [bestScores[f"split{i}_precision"] for i in range(num_splits)]
    f1_score = [bestScores[f"split{i}_f1_score"] for i in range(num_splits)]
    if importances:
        feature_importances = [list(bestScores[f"split{i}_feature_importances"].values()) for i in range(num_splits)]
    
        avg_importances = np.mean(np.array(feature_importances), axis=0)
        feature_names = list(bestScores["split0_feature_importances"].keys())[0]
        mapped_importances = {name: rank for name, rank in zip(feature_names, avg_importances.flatten())}
    
    print(f"The best parameters were {bestParams}")
    print(f"Accuracy for each fold: {accuracy}")
    print(f"Mean accuracy: {np.nanmean(accuracy)}")
    print(f"Std accuracy: {np.nanstd(accuracy)}")
    print(f"Sensitivity for each fold: {sensitivity}")
    print(f"Mean sensitivity: {np.nanmean(sensitivity)}")
    print(f"Std sensitivity: {np.nanstd(sensitivity)}")
    print(f"Specificity for each fold: {specificity}")
    print(f"Mean specificity: {np.nanmean(specificity)}")
    print(f"Std specificity: {np.nanstd(specificity)}")
    print(f"Precision for each fold: {precision}")
    print(f"Mean precision: {np.nanmean(precision)}")
    print(f"Std precision: {np.nanstd(precision)}")
    print(f"F1-score for each fold: {f1_score}")
    print(f"Mean F1-score: {np.nanmean(f1_score)}")
    print(f"Std F1-score: {np.nanstd(f1_score)}")
    if importances:
        print("Average feature importances: ")
        print(mapped_importances)

In [13]:
def fit_parallel(clf, X, y, train, test, **fit_params):
    np.seterr(all='ignore')
    
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    cloned_clf = clone(clf)
    cloned_clf.fit(X_train, y_train)

    pred_values = cloned_clf.predict(X_test)

    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])

    results_dict = {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2*precision*sensitivity)/(precision+sensitivity)
    }
    
    return results_dict

In [14]:
def fit_xgboost_parallel(clf, X, y, train, test, **fit_params):
    np.seterr(all='ignore')
    
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    cloned_clf = clone(clf)
    cloned_clf.fit(X_train, y_train)

    pred_values = cloned_clf.predict(X_test)

    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])

    results_dict = {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2*precision*sensitivity)/(precision+sensitivity),
        "feature_importances": {A: B for A, B in zip(cloned_clf.feature_names_in_, cloned_clf.feature_importances_)}
    }
    
    return results_dict

In [15]:
def fit_catboost_parallel(clf, X, y, train, test, **fit_params):
    np.seterr(all='ignore')
    
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    cloned_clf = clone(clf)
    cloned_clf.fit(X_train, y_train)

    pred_values = cloned_clf.predict(X_test)

    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])

    results_dict = {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2*precision*sensitivity)/(precision+sensitivity),
        "feature_importances": {A: B for A, B in zip(cloned_clf.feature_names_, cloned_clf.feature_importances_)}
    }
    
    return results_dict

In [16]:
def fit_lightgbm_parallel(clf, X, y, train, test, **fit_params):
    np.seterr(all='ignore')
    
    X_train = X.iloc[train]
    y_train = y.iloc[train]

    X_test = X.iloc[test]
    y_test = y.iloc[test]
    
    cloned_clf = clone(clf)
    cloned_clf.fit(X_train,y_train,eval_set=[(X_test,y_test),(X_train,y_train)], 
                   eval_metric='logloss')

    pred_values = cloned_clf.predict(X_test)

    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
    sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
    specificity = cm[1][1]/(cm[1][0]+cm[1][1])
    precision = (cm[0][0])/(cm[0][0]+cm[1][0])

    results_dict = {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2*precision*sensitivity)/(precision+sensitivity),
        "feature_importances": {A: B for A, B in zip(cloned_clf.feature_name_, cloned_clf.feature_importances_)}
    }
    
    return results_dict

In [17]:
# Logistic regression
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "solver": ["liblinear"]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = LogisticRegression(max_iter=3000,
                                 **fit_params)
        fold_results = Parallel(n_jobs=8)(
            delayed(fit_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/lr_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
score_reporter(results, False)

(('solver', 'liblinear'),) 0.8413063587521947
The best parameters were (('solver', 'liblinear'),)
Accuracy for each fold: [0.8962406015037594, 0.8977769148626347, 0.9746606334841629, 0.737085906793936, 0.9056651174475568, 0.9160097323600973, 0.8031831427930958, 0.9787199214274022, 0.804460518384569, 0.8370986920332937, 0.8832188420019627, 0.9412476239847936, 0.6979302832244009, 0.8590254367146798, 0.44039900249376557, 0.7831839521980367, 0.9476907796722397, 0.8788341429562804, 0.6056580815250456, 0.8966701902748414, 0.8727402716780717, 0.9272123225611598, 0.8653341409246921]
Mean accuracy: 0.8413063587521947
Std accuracy: 0.12216752481793575
Sensitivity for each fold: [0.8982980489829805, 0.9721090168110036, 0.976905311778291, 0.7327153927974472, 0.9751243781094527, 0.9348737238044063, 0.9230769230769231, 0.9792967466316136, 0.872324446113406, 0.8352226720647773, 0.7289156626506024, 0.9443374759320847, 0.6056399132321041, nan, nan, 0.9238728750923872, 0.9578754578754579, 0.966804979253

In [19]:
# LDA
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "solver": ["lsqr"]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = LinearDiscriminantAnalysis(**fit_params)
        fold_results = Parallel(n_jobs=8)(
            delayed(fit_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/lda_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
score_reporter(results, False)

(('solver', 'lsqr'),) 0.8213748233864019
The best parameters were (('solver', 'lsqr'),)
Accuracy for each fold: [0.8961038961038961, 0.8960295116978934, 0.9800904977375565, 0.8141493542953397, 0.8789096847129757, 0.9090024330900244, 0.7604797130688187, 0.959895236536258, 0.7666063893911995, 0.79944510503369, 0.8379675062697634, 0.9405564195610852, 0.6958605664488018, 0.7025743181121668, 0.4374064837905237, 0.7881988903115663, 0.9382552557523589, 0.7464260929909785, 0.6121476373960657, 0.8789640591966174, 0.8748850985599019, 0.9234370280881908, 0.8542297597415708]
Mean accuracy: 0.8213748233864019
Std accuracy: 0.12289545949238502
Sensitivity for each fold: [0.8978829389788294, 0.9649770759042282, 0.9816782140107775, 0.8112748822367422, 0.98045486851457, 0.9281031703385276, 0.9204693611473272, 0.9604009201445941, 0.8790837401426962, 0.7967611336032389, 0.7289156626506024, 0.945387712235253, 0.6392624728850326, nan, nan, 0.9722838137472284, 0.9637769637769638, 0.9353882631890931, 0.56580

In [21]:
# QDA
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "fake": ["param"]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = QuadraticDiscriminantAnalysis()
        fold_results = Parallel(n_jobs=8)(
            delayed(fit_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/qda_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]



In [22]:
score_reporter(results, False)

(('fake', 'param'),) 0.7334793609564302
The best parameters were (('fake', 'param'),)
Accuracy for each fold: [0.5546138072453862, 0.7785651878458402, 0.26349924585218704, 0.3163952835485682, 0.753548549177239, 0.8387347931873479, 0.7312261824702981, 0.129153707644459, 0.8332730560578662, 0.7592812789007795, 0.9694689783011667, 0.9142906514601693, 0.6916122004357298, 0.91955255899479, 0.9840399002493766, 0.9375800256081946, 0.9102797550074491, 0.7236641221374046, 0.7095923747718516, 0.8442124735729387, 0.8800939638443468, 0.6925400181214135, 0.734807187563093]
Mean accuracy: 0.7334793609564302
Std accuracy: 0.2199550319782928
Sensitivity for each fold: [0.5491905354919053, 0.7311512990320937, 0.24926866820631255, 0.26075064579851087, 0.3251599147121535, 0.8260075228371843, 0.13081269013472405, 0.1258626355570161, 0.5636500187758168, 0.7546558704453441, 0.24397590361445784, 0.9182566077367408, 0.4943600867678959, nan, nan, 0.8203991130820399, 0.9224664224664225, 0.04386484884410195, 0.9

In [23]:
# KNN
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_neighbors": [9]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = KNeighborsClassifier(**fit_params)
        fold_results = Parallel(n_jobs=8)(
            delayed(fit_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/knn_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
score_reporter(results, False)

(('n_neighbors', 9),) 0.8460586655107755
The best parameters were (('n_neighbors', 9),)
Accuracy for each fold: [0.8120300751879699, 0.9268032229880594, 0.9647058823529412, 0.6035934868051657, 0.905790729807813, 0.9148418491484185, 0.8206680116565792, 0.8963823866426583, 0.8378541289933695, 0.786761791518034, 0.9102606040780722, 0.8873336789355452, 0.7, 0.8351210542445602, 0.7719700748129675, 0.8218096457533077, 0.9309716934282404, 0.9326856349757113, 0.5390387345366051, 0.9171511627906976, 0.9165560208354612, 0.9090909090909091, 0.9179285281647487]
Mean accuracy: 0.8460586655107755
Std accuracy: 0.10572673216004856
Sensitivity for each fold: [0.8114016881140169, 0.9668874172185431, 0.9658198614318707, 0.5792432760978574, 0.8638948116560057, 0.9220849005910801, 0.8735332464146024, 0.8966480446927374, 0.7510326699211416, 0.7831309041835358, 0.6174698795180723, 0.8867495186416944, 0.49349240780911063, nan, nan, 0.8702882483370288, 0.9287749287749287, 0.9561351511558981, 0.418980245595301

In [25]:
# Decision tree
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "max_depth": [None]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = DecisionTreeClassifier(**fit_params)
        
        fold_results = []
        for (train, test) in tqdm(splits, leave=False):
            X_train = X.iloc[train]
            y_train = y.iloc[train]

            X_test = X.iloc[test]
            y_test = y.iloc[test]

            cloned_clf = clone(clf)
            cloned_clf.fit(X_train, y_train)

            pred_values = cloned_clf.predict(X_test)

            cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
            sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
            specificity = cm[1][1]/(cm[1][0]+cm[1][1])
            precision = (cm[0][0])/(cm[0][0]+cm[1][0])

            results_dict = {
                "accuracy": accuracy_score(y_test, pred_values),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "precision": precision,
                "f1_score": (2*precision*sensitivity)/(precision+sensitivity)
            }

            fold_results.append(results_dict)
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/dt_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [26]:
score_reporter(results, False)

(('max_depth', None),) 0.850184437013619
The best parameters were (('max_depth', None),)
Accuracy for each fold: [0.8318523581681476, 0.8826327541015435, 0.9717948717948718, 0.7230488489612578, 0.9089310388142193, 0.882043795620438, 0.7468056489576328, 0.9711900474709445, 0.812537673297167, 0.8606156691769058, 0.8879075346199978, 0.9466044582685329, 0.7075163398692811, 0.7627949739503525, 0.773067331670823, 0.897247119078105, 0.9402416818407549, 0.8108258154059681, 0.5696613263029812, 0.8969344608879493, 0.9020529057297518, 0.953035336756267, 0.9149000605693519]
Mean accuracy: 0.850184437013619
Std accuracy: 0.09747489400151495
Sensitivity for each fold: [0.8310502283105022, 0.8839786041772797, 0.9739799846035412, 0.7166084181735298, 0.9658848614072495, 0.8780225685115529, 0.6079965232507606, 0.9715741045021361, 0.8167480285392414, 0.860863697705803, 0.6295180722891566, 0.9485384211447576, 0.5149674620390455, nan, nan, 0.9076127124907613, 0.9595034595034595, 0.967397747480735, 0.453817

In [27]:
# Random Forest
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_estimators": [300],
    "max_depth": [30]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = RandomForestClassifier(**fit_params)
        
        fold_results = []
        for (train, test) in tqdm(splits, leave=False):
            X_train = X.iloc[train]
            y_train = y.iloc[train]

            X_test = X.iloc[test]
            y_test = y.iloc[test]

            cloned_clf = clone(clf)
            cloned_clf.fit(X_train.astype(np.float32), y_train.astype(np.float32))

            pred_values = cloned_clf.predict(X_test)

            cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
            sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
            specificity = cm[1][1]/(cm[1][0]+cm[1][1])
            precision = (cm[0][0])/(cm[0][0]+cm[1][0])

            results_dict = {
                "accuracy": accuracy_score(y_test, pred_values),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "precision": precision,
                "f1_score": (2*precision*sensitivity)/(precision+sensitivity)
            }

            fold_results.append(results_dict)
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/rf_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [28]:
score_reporter(results, False)

(('max_depth', 30), ('n_estimators', 300)) 0.8901043854219829
The best parameters were (('max_depth', 30), ('n_estimators', 300))
Accuracy for each fold: [0.8891319207108681, 0.954373361809533, 0.9825037707390648, 0.7247332959011791, 0.964200477326969, 0.9191240875912409, 0.7978031831427931, 0.9698805041741693, 0.8773960216998191, 0.8685427401241907, 0.9548577036310107, 0.9581821323656471, 0.7278867102396515, 0.8570334048421698, 0.8467830423940149, 0.9307511737089202, 0.9662307564972686, 0.8873004857737682, 0.5348813628067329, 0.9470137420718816, 0.9670105198651823, 0.9700996677740864, 0.9766807995154452]
Mean accuracy: 0.8901043854219829
Std accuracy: 0.10563645198724894
Sensitivity for each fold: [0.887920298879203, 0.9639582272032603, 0.9826020015396458, 0.704452210910196, 0.9811656005685856, 0.9131649650725416, 0.7044763146458062, 0.9702596122247782, 0.8317686819376643, 0.8663967611336032, 0.677710843373494, 0.9599159810957465, 0.5075921908893709, nan, nan, 0.9072431633407243, 0.97

In [29]:
# AdaBoost
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_estimators": [300]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = AdaBoostClassifier(algorithm="SAMME.R",
                                **fit_params)
        fold_results = Parallel(n_jobs=4)(
            delayed(fit_xgboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/ada_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
score_reporter(results)

(('n_estimators', 300),) 0.8805209704558556
The best parameters were (('n_estimators', 300),)
Accuracy for each fold: [0.8702665755297334, 0.9486457625473255, 0.9742081447963801, 0.6734980348119034, 0.9679688481346564, 0.9045255474452555, 0.8174176193678547, 0.9759371419217547, 0.875708257986739, 0.8034086405073325, 0.9605277505179369, 0.9324347675825125, 0.7181917211328976, 0.8331290223720502, 0.8932668329177057, 0.9334186939820742, 0.9466975666280417, 0.8977099236641222, 0.4980734131007909, 0.944899577167019, 0.9652742314370341, 0.9420114768951978, 0.9747627700383605]
Mean accuracy: 0.8805209704558556
Std accuracy: 0.11406440145173613
Sensitivity for each fold: [0.8689636086896361, 0.9610290371879776, 0.9741339491916859, 0.6479258471356937, 0.9676616915422885, 0.8973670069854917, 0.7031725336810083, 0.9763391390075583, 0.7870822380773563, 0.7997300944669365, 0.6807228915662651, 0.9338351129004026, 0.4822125813449024, nan, nan, 0.8839615668883961, 0.9456654456654456, 0.945465323058684

In [31]:
'''
# Gradient Boost
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_estimators": [350]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = GradientBoostingClassifier(loss="log_loss",
                                         max_depth=8,
                                            **fit_params)
        fold_results = Parallel(n_jobs=4)(
            delayed(fit_xgboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/gb_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
'''

'\n# Gradient Boost\nfolder = \'./joblib_memmap\'\ntry:\n    os.mkdir(folder)\nexcept FileExistsError:\n    pass\n\ndump(X, os.path.join(folder, \'X\'))\nX_memmap = load(os.path.join(folder, \'X\'), mmap_mode=\'r\')\ndump(y, os.path.join(folder, \'y\'))\ny_memmap = load(os.path.join(folder, \'y\'), mmap_mode=\'r\')\n\n#num_combinations = 30\nparams = {\n    "n_estimators": [350]\n}\nparam_grid = ParameterGrid(params)\nprint(f\'Fitting with {len(param_grid)} different parameter combinations\')\nresults = {}\n\nwith tqdm(param_grid) as pbar:\n    for fit_params in pbar:\n        pbar.set_description(f\'Fitting parameter combination: {fit_params}\')\n        \n        clf = GradientBoostingClassifier(loss="log_loss",\n                                         max_depth=8,\n                                            **fit_params)\n        fold_results = Parallel(n_jobs=4)(\n            delayed(fit_xgboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)\n            for (train

In [32]:
# score_reporter(results)

In [33]:
# SVC
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    #"kernel": ["linear", "rbf"]
    "verbose": [False]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = LinearSVC(**fit_params)
        
        fold_results = []
        for (train, test) in tqdm(splits, leave=False):
            X_train = X.iloc[train]
            y_train = y.iloc[train]

            X_test = X.iloc[test]
            y_test = y.iloc[test]

            cloned_clf = clone(clf)
            cloned_clf.fit(X_train, y_train)

            pred_values = cloned_clf.predict(X_test)

            cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
            sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
            specificity = cm[1][1]/(cm[1][0]+cm[1][1])
            precision = (cm[0][0])/(cm[0][0]+cm[1][0])

            results_dict = {
                "accuracy": accuracy_score(y_test, pred_values),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "precision": precision,
                "f1_score": (2*precision*sensitivity)/(precision+sensitivity)
            }

            fold_results.append(results_dict)
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/svc_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [34]:
score_reporter(results, False)

(('verbose', False),) 0.8118963745224512
The best parameters were (('verbose', False),)
Accuracy for each fold: [0.8854408749145591, 0.8810795068439957, 0.9668174962292609, 0.722206625491297, 0.8758949880668258, 0.9045255474452555, 0.7974669356646492, 0.9435259453265673, 0.7654008438818566, 0.7798916633637204, 0.8365499945480318, 0.9063418005875238, 0.6749455337690632, 0.8974869751762182, 0.3403491271820449, 0.7440247545881349, 0.9119351100811124, 0.8473282442748091, 0.5960251470290002, 0.8673361522198731, 0.8450617914411194, 0.8881002718212021, 0.7958812840702605]
Mean accuracy: 0.8118963745224512
Std accuracy: 0.13291212352509207
Sensitivity for each fold: [0.887920298879203, 0.9659959246051961, 0.9712086220169361, 0.7199513751709467, 0.9573560767590619, 0.9288554540569586, 0.9439374185136897, 0.9441340782122905, 0.8527975966954563, 0.777867746288799, 0.6897590361445783, 0.9100297566952564, 0.6188720173535792, nan, nan, 0.9368070953436807, 0.9194139194139194, 0.9347954949614701, 0.56

In [35]:
# XGBoost
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_estimators": [1050],
    "max_depth": [4]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = XGBClassifier(learning_rate = 0.1,
                        verbose=None, 
                        eval_metric='logloss',
                        tree_method='gpu_hist',
                        **fit_params)
        fold_results = Parallel(n_jobs=8)(
            delayed(fit_xgboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/xg_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
score_reporter(results)

(('max_depth', 4), ('n_estimators', 1050)) 0.8896594660834751
The best parameters were (('max_depth', 4), ('n_estimators', 1050))
Accuracy for each fold: [0.8888585099111415, 0.9495194641296961, 0.9838612368024132, 0.73385738349242, 0.9640748649667127, 0.904720194647202, 0.7834566240753195, 0.9806842363725651, 0.8795660036166365, 0.8711850971066191, 0.9552938610838513, 0.9585277345775013, 0.7327886710239652, 0.8437021146184492, 0.8724189526184538, 0.9489970123772941, 0.9615957622910114, 0.8741151977793199, 0.5344757655647941, 0.9256078224101479, 0.9679297313859667, 0.9702506795530051, 0.9766807995154452]
Mean accuracy: 0.8896594660834751
Std accuracy: 0.10532596210771614
Sensitivity for each fold: [0.8876435588764355, 0.956698930208864, 0.9841416474210931, 0.7196474699893634, 0.9832977967306326, 0.8969371305749597, 0.5775749674054759, 0.9809398619783108, 0.8422831393165603, 0.8695006747638326, 0.6746987951807228, 0.9602660598634692, 0.5117136659436009, nan, nan, 0.9142645971914265, 0.9

In [37]:
# CatBoost
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 15
params = {
    "n_estimators": [650],
    "max_depth": [6]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = CatBoostClassifier(
                        learning_rate=0.1,
                        loss_function='Logloss',
                        task_type="GPU",
                        silent=True,
                        **fit_params)
        fold_results = Parallel(n_jobs=1)(
            delayed(fit_catboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/cb_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 1 different parameter combinations


  0%|          | 0/1 [00:00<?, ?it/s]



In [38]:
score_reporter(results)

(('max_depth', 6), ('n_estimators', 650)) 0.8925980023811104
The best parameters were (('max_depth', 6), ('n_estimators', 650))
Accuracy for each fold: [0.8900888585099112, 0.9522376468304048, 0.9852187028657617, 0.7356822010106682, 0.9662102750910689, 0.9104622871046228, 0.7911903160726295, 0.9813390080209526, 0.8802893309222423, 0.8779231074118113, 0.9550757823574311, 0.9606013478486263, 0.7305010893246188, 0.8496782102359791, 0.870423940149626, 0.9484635083226632, 0.9672239695414666, 0.8799444829979182, 0.5253498276211722, 0.9532241014799154, 0.9702788274946379, 0.9693446088794926, 0.979002624671916]
Mean accuracy: 0.8925980023811104
Std accuracy: 0.1073295775282797
Sensitivity for each fold: [0.8888888888888888, 0.9620478858889455, 0.9852193995381062, 0.71934356480778, 0.9840085287846482, 0.9033852767329393, 0.6510212950890917, 0.9817614196516595, 0.8385279759669546, 0.87638326585695, 0.677710843373494, 0.9621914930859443, 0.5108459869848156, nan, nan, 0.9087213599408721, 0.9727309

In [39]:
# LightGBM
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 15
params = {
    "n_estimators": np.arange(900, 1200, 50),
    "max_depth": [5]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = lightgbm.LGBMClassifier(
                        learning_rate=0.1,
                        device="gpu",
                        verbose=-1,
                        **fit_params)
        fold_results = []
        for (train, test) in tqdm(splits, leave=False):
            X_train = X.iloc[train]
            y_train = y.iloc[train]

            X_test = X.iloc[test]
            y_test = y.iloc[test]

            cloned_clf = clone(clf)
            cloned_clf.fit(X_train,y_train,eval_set=[(X_test,y_test),(X_train,y_train)], 
                   eval_metric='logloss')

            pred_values = cloned_clf.predict(X_test)

            cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values)
            sensitivity = cm[0][0]/(cm[0][0]+cm[0][1])
            specificity = cm[1][1]/(cm[1][0]+cm[1][1])
            precision = (cm[0][0])/(cm[0][0]+cm[1][0])

            results_dict = {
                "accuracy": accuracy_score(y_test, pred_values),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "precision": precision,
                "f1_score": (2*precision*sensitivity)/(precision+sensitivity),
                "feature_importances": {A: B for A, B in zip(cloned_clf.feature_name_, cloned_clf.feature_importances_)}
            }

            fold_results.append(results_dict)
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/lg_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting with 6 different parameter combinations


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [40]:
score_reporter(results)

(('max_depth', 5), ('n_estimators', 900)) 0.8902554480407062
(('max_depth', 5), ('n_estimators', 950)) 0.8902671429408384
(('max_depth', 5), ('n_estimators', 1000)) 0.8900101982305616
(('max_depth', 5), ('n_estimators', 1050)) 0.8899358732250775
(('max_depth', 5), ('n_estimators', 1100)) 0.8895689985710263
(('max_depth', 5), ('n_estimators', 1150)) 0.8899690825535622
The best parameters were (('max_depth', 5), ('n_estimators', 950))
Accuracy for each fold: [0.8898154477101845, 0.9491311523153092, 0.9834087481146304, 0.7344188658057271, 0.9629443537244065, 0.9067639902676399, 0.7833445415826048, 0.9787199214274022, 0.8784810126582279, 0.8791121680539041, 0.9544215461781703, 0.9593917401071367, 0.734640522875817, 0.8455409132699969, 0.8682294264339152, 0.9485702091335895, 0.9617612977983777, 0.8766134628730049, 0.5326505779760697, 0.9334038054968288, 0.9655806352772954, 0.9713077620054364, 0.9778921865536039]
Mean accuracy: 0.8902671429408384
Std accuracy: 0.1053912133840016
Sensitivity 