In [None]:
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

import os
os.environ['NUMBA_CUDA_DRIVER'] = "/usr/lib/wsl/lib/libcuda.so.1"
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from pathlib import Path, PurePath
import csv
import pandas as pd
import numpy as np
import pickle
from joblib import Parallel, delayed, dump, load

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import ParameterGrid

from sklearn.base import clone

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from cuml.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from cuml.linear_model import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from cuml.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm

In [None]:
# Read the subject IDs: the first space-delimited field of every row in subject_list.csv.
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'
# newline='' lets the csv module do its own line-ending handling, as its documentation recommends.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # csv.reader yields [] for blank lines; skip those (and empty IDs) instead of failing on row[0].
    rlist = [row[0] for row in recordreader if row and row[0]]

In [None]:
# Load one parquet feature table per subject, then stack them into a single frame.
recent_size = 400
featurespath = PurePath(Path(os.getcwd()).parents[0], f'mit-bih-time-features-stepping/{recent_size}/')
feature_dfs = {
    record: pd.read_parquet(featurespath / (record + '.parquet'))
    for record in tqdm(rlist)
}

# dicts preserve insertion order, so the frames are concatenated in rlist order
combined_features = pd.concat(list(feature_dfs.values()), ignore_index=True)
combined_features

In [None]:
# Feature matrix, binary target and per-row subject IDs used to build the CV folds.
feature_columns = [
    'StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL',
    'std', 'cov', 'range', 'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr',
    'entropy', 'approx_entropy',
]  # , 'drrmean', 'drrvar'
label_codes = {"Non-Afib": 0, "Afib": 1}

X = combined_features[feature_columns]
y = combined_features['mappedLabel'].map(label_codes)
groups = combined_features['subjectID']

# Leave-one-group-out: each fold holds out every row of one subjectID.
logo = LeaveOneGroupOut()
splits = list(logo.split(X, y, groups=groups))

In [None]:
# Run-wide setup: silence numpy floating-point warnings and make sure the output folder exists.
np.seterr(all='ignore')
moving_accs = []

current_weight = 0.5
saved_results_path = PurePath(f'saved_results_{current_weight}_{recent_size}')
# exist_ok=True replaces the separate exists()/mkdir() pair, so re-running the cell is safe
# even if the folder appears between the check and the creation.
os.makedirs(saved_results_path, exist_ok=True)

In [None]:
def score_reporter(results, importances=True, moving_acc_plot=False, classifier_name="idk"):
    """Print a cross-validation summary for the best-scoring parameter combination.

    Parameters
    ----------
    results : dict
        Maps a parameter-combination key to a flat dict that holds ``'folds'`` (the number
        of folds) plus per-fold entries named ``split{i}_accuracy``, ``split{i}_sensitivity``,
        ``split{i}_specificity``, ``split{i}_precision`` and ``split{i}_f1_score``.
        ``split{i}_feature_importances`` (a name -> value dict) is read when ``importances``
        is true, and ``split{i}_subject_acc`` (a list) when ``moving_acc_plot`` is true.
    importances : bool
        If true, average the per-fold feature importances and print them by feature name.
    moving_acc_plot : bool
        If true, plot the running accuracy averaged across folds.
    classifier_name : str
        Prefix used in the plot title.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    if not results:
        raise ValueError("results is empty; there is nothing to report")

    def per_fold(scores, metric):
        # Collect one metric across all folds of a single parameter combination.
        return [scores[f"split{i}_{metric}"] for i in range(scores['folds'])]

    # Pick the combination with the highest mean accuracy. The first combination is always
    # a candidate, so a mean accuracy of 0 (or nan) no longer leaves bestParams unset.
    bestParams = None
    maxScore = -np.inf
    for params, scores in results.items():
        mean_accuracy = np.mean(per_fold(scores, "accuracy"))
        print(params, mean_accuracy)

        ranking_score = -np.inf if np.isnan(mean_accuracy) else mean_accuracy
        if bestParams is None or ranking_score > maxScore:
            bestParams = params
            maxScore = ranking_score

    bestScores = results[bestParams]

    if importances:
        fold_importances = per_fold(bestScores, "feature_importances")
        # Use every feature name (not just the first one) and align each fold by name.
        feature_names = list(fold_importances[0].keys())
        importance_table = np.array(
            [[fold[name] for name in feature_names] for fold in fold_importances],
            dtype=float,
        )
        avg_importances = importance_table.mean(axis=0)
        mapped_importances = {name: float(value) for name, value in zip(feature_names, avg_importances)}

    if moving_acc_plot:
        subject_accs = per_fold(bestScores, "subject_acc")
        # zip stops at the shortest fold, so only positions present in every fold are averaged
        avg_list = [sum(sub_list) * 100 / len(sub_list) for sub_list in zip(*subject_accs)]
        plt.plot(avg_list)
        plt.title(classifier_name + " Accuracy Over Time")
        plt.xlabel("Stepping windows elapsed")
        plt.ylabel("Average accuracy across CV folds (%)")
        plt.show()

    print(f"The best parameters were {bestParams}")
    # (result key, heading used at line start, label used after "Mean"/"Std")
    metric_labels = [
        ("accuracy", "Accuracy", "accuracy"),
        ("sensitivity", "Sensitivity", "sensitivity"),
        ("specificity", "Specificity", "specificity"),
        ("precision", "Precision", "precision"),
        ("f1_score", "F1-score", "F1-score"),
    ]
    for key, heading, label in metric_labels:
        values = per_fold(bestScores, key)
        print(f"{heading} for each fold: {values}")
        # nan-aware statistics: folds whose metric is undefined are left out
        print(f"Mean {label}: {np.nanmean(values)}")
        print(f"Std {label}: {np.nanstd(values)}")

    if importances:
        print("Average feature importances: ")
        print(mapped_importances)

In [None]:
def fit_parallel(clf, X, y, train, test, **fit_params):
    """Fit a clone of ``clf`` on one CV fold and score it on the held-out rows.

    Parameters
    ----------
    clf : estimator
        Unfitted estimator or pipeline. It is cloned, so the caller's object is never fitted.
    X, y : pandas objects
        Full feature table and target; rows are selected positionally with ``.iloc``.
        The target is expected to be coded 0/1.
    train, test : array-like of int
        Positional row indices of the training and held-out rows.
    **fit_params
        Accepted so every call site can forward its parameter combination; not used here.

    Returns
    -------
    dict
        ``accuracy``, ``sensitivity``, ``specificity``, ``precision``, ``f1_score`` and
        ``subject_acc`` (the running accuracy after each held-out prediction, in row order).
        A ratio whose denominator is zero comes back as nan.
    """
    # A fold with no samples of one class produces 0/0 below; report nan without warnings.
    np.seterr(all='ignore')

    X_train, y_train = X.iloc[train], y.iloc[train]
    X_test, y_test = X.iloc[test], y.iloc[test]

    cloned_clf = clone(clf)
    cloned_clf.fit(X_train, y_train)
    pred_values = cloned_clf.predict(X_test)

    # Running accuracy over the held-out rows, in the order they appear.
    total_correct = 0
    subject_acc = []
    for total_seen, pred in enumerate(pred_values, start=1):
        if pred == y_test.iloc[total_seen - 1]:
            total_correct += 1
        subject_acc.append(total_correct / total_seen)

    # labels=[0, 1] forces a 2x2 matrix. Without it a held-out subject with a single class
    # (and matching predictions) gives a 1x1 matrix and the indexing below raises IndexError.
    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values, labels=[0, 1])
    (c00, c01), (c10, c11) = cm  # rows are true classes, columns are predicted classes

    # NOTE(review): row/column 0 is class 0, so "sensitivity" and "precision" below treat
    # class 0 as the positive class and "specificity" is the recall of class 1. The notebook
    # maps "Non-Afib" to 0 - confirm this is the intended positive class.
    sensitivity = c00 / (c00 + c01)
    specificity = c11 / (c10 + c11)
    precision = c00 / (c00 + c10)

    return {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2 * precision * sensitivity) / (precision + sensitivity),
        "subject_acc": subject_acc,
    }

In [None]:
def fit_tree_parallel(clf, X, y, train, test, **fit_params):
    """Fit a clone of pipeline ``clf`` on one CV fold, score it, and record feature importances.

    Parameters
    ----------
    clf : pipeline
        Unfitted pipeline with a step named ``'clf'`` whose fitted estimator exposes
        ``feature_importances_``. It is cloned, so the caller's object is never fitted.
    X, y : pandas objects
        Full feature table and target; rows are selected positionally with ``.iloc``.
        The target is expected to be coded 0/1.
    train, test : array-like of int
        Positional row indices of the training and held-out rows.
    **fit_params
        Accepted so every call site can forward its parameter combination; not used here.

    Returns
    -------
    dict
        ``accuracy``, ``sensitivity``, ``specificity``, ``precision``, ``f1_score``,
        ``feature_importances`` (feature name -> importance) and ``subject_acc`` (the running
        accuracy after each held-out prediction, in row order). A ratio whose denominator is
        zero comes back as nan.
    """
    # A fold with no samples of one class produces 0/0 below; report nan without warnings.
    np.seterr(all='ignore')

    X_train, y_train = X.iloc[train], y.iloc[train]
    X_test, y_test = X.iloc[test], y.iloc[test]

    cloned_clf = clone(clf)
    cloned_clf.fit(X_train, y_train)
    pred_values = cloned_clf.predict(X_test)

    # Running accuracy over the held-out rows, in the order they appear.
    total_correct = 0
    subject_acc = []
    for total_seen, pred in enumerate(pred_values, start=1):
        if pred == y_test.iloc[total_seen - 1]:
            total_correct += 1
        subject_acc.append(total_correct / total_seen)

    # labels=[0, 1] forces a 2x2 matrix. Without it a held-out subject with a single class
    # (and matching predictions) gives a 1x1 matrix and the indexing below raises IndexError.
    cm = confusion_matrix(y_test.values.reshape(y_test.shape[0]), pred_values, labels=[0, 1])
    (c00, c01), (c10, c11) = cm  # rows are true classes, columns are predicted classes

    # NOTE(review): row/column 0 is class 0, so "sensitivity" and "precision" below treat
    # class 0 as the positive class and "specificity" is the recall of class 1. The notebook
    # maps "Non-Afib" to 0 - confirm this is the intended positive class.
    sensitivity = c00 / (c00 + c01)
    specificity = c11 / (c10 + c11)
    precision = c00 / (c00 + c10)

    return {
        "accuracy": accuracy_score(y_test, pred_values),
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "f1_score": (2 * precision * sensitivity) / (precision + sensitivity),
        # pair the pipeline's input feature names with the final estimator's importances
        "feature_importances": dict(zip(cloned_clf.feature_names_in_, cloned_clf['clf'].feature_importances_)),
        "subject_acc": subject_acc,
    }

In [None]:
# Logistic regression: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {"solver": ["qn"]}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", LogisticRegression(max_iter=3000, **fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=2, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/lr_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="Logistic Regression")

In [None]:
# LDA: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {"solver": ["lsqr"]}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", LinearDiscriminantAnalysis(**fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=4, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/lda_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="LDA")

In [None]:
# QDA: leave-one-group-out CV, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

# Placeholder grid: the entry is not passed to the estimator, it only yields one combination
# and a key for the results dict.
params = {"fake": ["param"]}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", QuadraticDiscriminantAnalysis())
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=4, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/qda_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="QDA")

In [None]:
# KNN: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {"n_neighbors": [9]}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", KNeighborsClassifier(**fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=2, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/knn_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="KNN")

In [None]:
# Decision tree: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {"max_depth": [None]}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", DecisionTreeClassifier(**fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=6, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/dt_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="Decision Tree")

In [None]:
# Random Forest: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
os.makedirs(folder, exist_ok=True)

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {
    "n_estimators": [300],
    "max_depth": [30]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        # Build the pipeline around the random forest itself. The cell previously created the
        # forest as `clf` but fitted `pipe`, a name left over from an earlier cell, so the
        # forest was never trained. The scaler step matches the other model sections.
        pipe = Pipeline([("scaler", StandardScaler()),
                         ("clf", RandomForestClassifier(**fit_params))])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=5, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/rf_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="Random Forest")

In [None]:
"""
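# AdaBoost (disabled cell). NOTE(review): before re-enabling, this calls fit_xgboost_parallel, which is not defined in the visible cells, and saves under saved_results_{current_weight}/ without the recent_size suffix used by the other sections.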
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

dump(X, os.path.join(folder, 'X'))
X_memmap = load(os.path.join(folder, 'X'), mmap_mode='r')
dump(y, os.path.join(folder, 'y'))
y_memmap = load(os.path.join(folder, 'y'), mmap_mode='r')

#num_combinations = 30
params = {
    "n_estimators": [300]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')
        
        clf = AdaBoostClassifier(algorithm="SAMME.R",
                                **fit_params)
        fold_results = Parallel(n_jobs=4)(
            delayed(fit_xgboost_parallel)(clf, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        
        current_results = {}
        for i, result in enumerate(fold_results):
            current_results[f"split{i}_accuracy"] = result["accuracy"]
            current_results[f"split{i}_sensitivity"] = result["sensitivity"]
            current_results[f"split{i}_specificity"] = result["specificity"]
            current_results[f"split{i}_precision"] = result["precision"]
            current_results[f"split{i}_f1_score"] = result["f1_score"]
            current_results[f"split{i}_feature_importances"] = result["feature_importances"]
            current_results[f"split{i}_subject_acc"] = result["subject_acc"]
        results[tuple(sorted(fit_params.items()))] = current_results
        results[tuple(sorted(fit_params.items()))]['folds'] = len(splits)

with open(f'saved_results_{current_weight}/ada_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""

In [None]:
# score_reporter(results, False, True)

In [None]:
# SVC (linear): leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {
    #"kernel": ["linear", "rbf"]
    "verbose": [False]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", LinearSVC(**fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=1, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/svc_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Name the classifier so the plot title is not the placeholder "idk Accuracy Over Time".
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="Linear SVC")

In [None]:
# XGBoost: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {
    "n_estimators": [1050],
    "max_depth": [5]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score",
                "feature_importances", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        # NOTE(review): XGBoost 2.x deprecates tree_method='gpu_hist' in favour of
        # tree_method='hist' with device='cuda' - confirm against the installed version.
        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", XGBClassifier(learning_rate = 0.1,
                                         verbose=None,
                                         eval_metric='logloss',
                                         tree_method='gpu_hist',
                                         **fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_tree_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=1, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/xg_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Summarise the XGBoost run, including averaged feature importances and the accuracy plot.
score_reporter(results, importances=True, moving_acc_plot=True, classifier_name="XGBoost")

In [None]:
# CatBoost: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {
    "n_estimators": [650],
    "max_depth": [6]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score",
                "feature_importances", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", CatBoostClassifier(
            learning_rate=0.1,
            loss_function='Logloss',
            task_type="GPU",
            silent=True,
            **fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_tree_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=1, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/cb_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Summarise the CatBoost run (feature importances not printed) and plot the running accuracy.
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="CatBoost")

In [None]:
# LightGBM: leave-one-group-out CV for every parameter combination, then pickle.
folder = './joblib_memmap'
try:
    os.mkdir(folder)
except FileExistsError:
    pass

# Write X and y to disk and reload them with mmap_mode='r' before handing them to the workers.
X_path = os.path.join(folder, 'X')
y_path = os.path.join(folder, 'y')
dump(X, X_path)
X_memmap = load(X_path, mmap_mode='r')
dump(y, y_path)
y_memmap = load(y_path, mmap_mode='r')

params = {
    "n_estimators": [1000],
    "max_depth": [5]
}
param_grid = ParameterGrid(params)
print(f'Fitting with {len(param_grid)} different parameter combinations')
results = {}
# per-fold values copied from each worker result, in storage order
fold_metrics = ("accuracy", "sensitivity", "specificity", "precision", "f1_score",
                "feature_importances", "subject_acc")

with tqdm(param_grid) as pbar:
    for fit_params in pbar:
        pbar.set_description(f'Fitting parameter combination: {fit_params}')

        scaler_step = ("scaler", StandardScaler())
        clf_step = ("clf", lightgbm.LGBMClassifier(
            learning_rate=0.1,
            verbose=-1,
            device="cuda",
            **fit_params))
        pipe = Pipeline([scaler_step, clf_step])

        fold_jobs = (
            delayed(fit_tree_parallel)(pipe, X_memmap, y_memmap, train, test, **fit_params)
            for (train, test) in splits
        )
        fold_results = list(tqdm(
            Parallel(n_jobs=6, return_as="generator")(fold_jobs),
            desc="Fitting on each fold",
            total=len(splits),
        ))

        # Flatten to split{i}_{metric} keys, the layout score_reporter reads.
        current_results = {
            f"split{i}_{metric}": result[metric]
            for i, result in enumerate(fold_results)
            for metric in fold_metrics
        }
        current_results['folds'] = len(splits)
        results[tuple(sorted(fit_params.items()))] = current_results

with open(f'saved_results_{current_weight}_{recent_size}/lg_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Summarise the LightGBM run (feature importances not printed) and plot the running accuracy.
score_reporter(results, importances=False, moving_acc_plot=True, classifier_name="LightGBM")