# Imports

In [None]:
import numpy as np
import pandas as pd
import json
import os
import glob
import random
import gc
import keras
import mcfly
from keras import backend as K
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import auc, roc_curve, precision_recall_curve, brier_score_loss
from sklearn.calibration import calibration_curve
from datetime import datetime 
from matplotlib import pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import math
import scipy.stats as st
from scipy import stats
import csv

In [None]:
# Set to True to train the models during cross-validation,
# or to False to load the previously trained cross-validation models.
train_models = False

# Use the class weights computed from the original training split.
set_weights = True 

# Number of cross-validation folds.
n_folds = 5

# Path to the JSON file holding the sample IDs of the train, val and test splits.
split_path = "400_dumped/Final_Data/split/train_val_test.json"

# Paths to the labels array and to the folder with one .npy file per sample.
labels_path = "400_dumped/Final_Data/labels/labels.npy"
samples_path = "400_dumped/Final_Data/samples/"

# Path prefixes used to store and retrieve model types, architectures and hyperparameters.
archi_path = "400_dumped/Models_Final_Data/architecture/architecture_"
params_path = "400_dumped/Models_Final_Data/parameters/params_"
type_path = "400_dumped/Models_Final_Data/type/type_"

# Choose the model to load and train (appended to the path prefixes above).
model_name = "Model6.json"

# Path prefix of the trained cross-validation models (round number and ".json" are appended)
# and path of the CSV file that receives the per-round metrics.
cv_trained_path = "400_dumped/CrossValidation/Model6/Round" 
cv_t_test_path = "400_dumped/CrossValidation/Model6/t_test_data.csv"

# Paths to the original JSON data, used to check the filter types.
path_negative = "AnonymisedECGs_json/negative"
path_positive = "AnonymisedECGs_json/positive"

# Set the seeds.
random.seed(0) #generation of train, val, test sets
np.random.seed(0) #mcfly models
tf.random.set_seed(0) #keras training

# Dictionary with sample id and label

In [None]:
# Build the lookup {sample id: integer label} from the stored array, whose
# rows are read as (id, label) pairs.
labels_array = np.load(labels_path)
labels = {row[0]: int(row[1]) for row in labels_array}

# The raw array is no longer needed once the dictionary exists.
del labels_array
gc.collect()

# Train, val, test split

In [None]:
# Read the original train/val/test split; it is needed later to compute the
# class weights of the original training set.
with open(split_path, mode="r") as fp:
    train_val_test_dict = json.loads(fp.read())

In [None]:
# Number of samples in the original train and val splits combined.
len(train_val_test_dict["train"]) + len(train_val_test_dict["val"])

In [None]:
# Number of samples in the original val split.
len(train_val_test_dict["val"])

# Cross Validation Set Generation

In [None]:
# Pool the original train and validation IDs into one list for cross-validation.
train = list(train_val_test_dict["train"])
val = list(train_val_test_dict["val"])

# cv_samples is the same list object as train, so train grows (and is
# shuffled) together with it.
cv_samples = train
cv_samples += val

# Shuffle with a dedicated, fixed-seed generator so the order is reproducible
# and the global random state is left untouched.
random.Random(10).shuffle(cv_samples)


# Check filter distribution in cv samples dataset

In [None]:
# Collect the filter settings (high pass, low pass, AC) of every "Rhythm"
# waveform of every cross-validation sample, separately per class.
# The per-sample frames are gathered in lists and concatenated once at the end
# instead of growing a DataFrame inside the loop.
pos_frames = []
neg_frames = []

for elem in cv_samples:

    # The first character of the sample ID selects the source folder:
    # "1" -> negative folder, "2" -> positive folder.
    if elem.startswith("1"):
        label = "negative"
        directory = path_negative + "/" + elem + ".json"
    elif elem.startswith("2"):
        label = "positive"
        directory = path_positive + "/" + elem + ".json"
    else:
        # Fail loudly instead of silently re-reading the previous sample's file.
        raise ValueError("Sample ID does not start with 1 or 2: " + repr(elem))

    # Context manager guarantees that the file handle is closed again.
    with open(directory) as f:
        data = json.load(f)

    ecg = data["RestingECG"]
    waveform = pd.DataFrame(ecg["Waveform"])
    waveform_rhythm = pd.DataFrame(waveform[waveform["WaveformType"]=="Rhythm"])

    temp = pd.DataFrame(
    {
        "high_pass": waveform_rhythm["HighPassFilter"],
        "low_pass": waveform_rhythm["LowPassFilter"],
        "ac": waveform_rhythm["ACFilter"],
        "label": label
    })

    if label == "positive":
        pos_frames.append(temp)
    else:
        neg_frames.append(temp)

# pd.concat rejects an empty list, so fall back to an empty frame.
pos_filters = pd.concat(pos_frames) if pos_frames else pd.DataFrame()
neg_filters = pd.concat(neg_frames) if neg_frames else pd.DataFrame()
    

In [None]:
def analyse_filter_dist(df):
    """Summarise how often each filter combination occurs per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "high_pass", "low_pass", "ac" and "label".

    Returns
    -------
    pandas.DataFrame
        One row per (high_pass, low_pass, ac, label) group with the extra
        columns "Count", "percentage_by_class" (share of the group within its
        label, in percent) and "combination" (the three filter values as a
        tuple), sorted by label and percentage in descending order.
    """
    group_keys = ["high_pass", "low_pass", "ac", "label"]
    counts = df.groupby(group_keys).size().reset_index(name="Count")

    # Total number of rows per label, broadcast back onto every group row.
    label_totals = counts.groupby("label")["Count"].transform("sum")
    counts["percentage_by_class"] = 100 * counts["Count"] / label_totals

    counts["combination"] = list(
        zip(counts["high_pass"], counts["low_pass"], counts["ac"])
    )

    return counts.sort_values(by=["label", "percentage_by_class"], ascending=False)

In [None]:
# Per-class summary of the filter combinations (counts and percentages).
p_filter_combo = analyse_filter_dist(pos_filters)
n_filter_combo = analyse_filter_dist(neg_filters)

In [None]:
# Display the filter combinations of the positive class.
p_filter_combo

In [None]:
# Display the filter combinations of the negative class.
n_filter_combo

In [None]:
# Reorder the positive-class table so that its rows follow the order of the
# combinations in the negative-class table.
# NOTE(review): reindexing keeps only the combinations listed in
# n_filter_combo; combinations that occur only in the positive class are
# dropped here, and combinations missing from the positive class become NaN
# rows. The later outer merge into diff1 therefore cannot show
# positive-only combinations - confirm this is intended.
p_filter_combo = p_filter_combo.set_index("combination")
p_filter_combo = p_filter_combo.reindex(index = n_filter_combo["combination"])
p_filter_combo = p_filter_combo.reset_index()

In [None]:
# Display the positive-class table after alignment to the negative-class order.
p_filter_combo

In [None]:
# Display the negative-class table for side-by-side comparison.
n_filter_combo

In [None]:
# Keep the first five rows of each table and line the positive rows up with
# the combinations of the negative rows.
n_filter_combo_head = n_filter_combo.head(5)
p_filter_combo_head = (
    p_filter_combo.head(5)
    .set_index("combination")
    .reindex(index=n_filter_combo_head["combination"])
    .reset_index()
)

In [None]:
# Grouped bar chart: for each of the top filter combinations, one bar for the
# negative class and one for the positive class.
ind = np.arange(len(p_filter_combo_head))
width = 0.35

fig, ax = plt.subplots(figsize=(20, 12.5))

# Shift the two series half a bar width to either side of each tick.
rects_neg = ax.bar(
    ind - width/2,
    n_filter_combo_head["percentage_by_class"],
    width,
    label="Negative",
)
rects_pos = ax.bar(
    ind + width/2,
    p_filter_combo_head["percentage_by_class"],
    width,
    label="Positive",
)

ax.set(
    ylabel="Percentage of samples",
    title="Top 5 percentage of samples per filter combination per class",
)

# Despite the name, these are the tick labels of the x axis.
y_labels = n_filter_combo_head["combination"].tolist()
ax.set_xticks(ind)
ax.set_xticklabels(y_labels)
ax.legend()


In [None]:
# Join the per-class percentages on the filter combination and compute the
# difference (negative minus positive) in percentage points.
diff1 = n_filter_combo[["combination", "percentage_by_class"]].merge(
    p_filter_combo[["combination", "percentage_by_class"]],
    how="outer",
    left_on=["combination"],
    right_on=["combination"],
    suffixes=["_neg", "_pos"],
)

# A combination missing in one class counts as 0 percent there.
diff1 = diff1.fillna(0)
diff1["difference"] = diff1["percentage_by_class_neg"] - diff1["percentage_by_class_pos"]

diff1

In [None]:
# The distribution is similar to that of the training set, hence no correction for filters is necessary.
# The percentage of (16, 150, 50) increased since it was not corrected in val, but val is too small to have a significant effect on train.

# Functions

In [None]:
#make n sets for n-fold crossval
def cv_splits(cv_samples, n_folds):
    """Cut ``cv_samples`` into ``n_folds`` consecutive slices.

    Every fold has ``round(len(cv_samples) / n_folds)`` elements, except
    where the sequence runs out.

    NOTE: when the length is not a multiple of ``n_folds`` the rounding either
    leaves the trailing samples out of every fold (rounded down) or makes the
    last fold shorter, possibly empty (rounded up). The boundaries are kept
    as they are because the stored per-round models were produced with them.

    Parameters
    ----------
    cv_samples : list
        Samples to distribute, in the order they should be sliced.
    n_folds : int
        Number of folds to create.

    Returns
    -------
    dict
        Maps the fold index (0 .. n_folds - 1) to its slice of ``cv_samples``.
    """
    fold_size = round(len(cv_samples) / n_folds)
    # Slicing past the end is clamped, so no explicit upper bound is needed.
    return {
        fold: cv_samples[fold * fold_size : (fold + 1) * fold_size]
        for fold in range(n_folds)
    }

In [None]:
#function to create validation and test sets and store in memory
def set_generation(val_or_test, cv_train_val_test_dict, labels, dim = (2500, 8)):
    """Load all samples of one split into memory.

    Parameters
    ----------
    val_or_test : str
        Key of the split to load from ``cv_train_val_test_dict``.
    cv_train_val_test_dict : dict
        Maps split names to lists of sample IDs.
    labels : dict
        Maps a sample ID to its class.
    dim : tuple
        Shape of a single sample; the first two entries are used.

    Returns
    -------
    X : numpy.ndarray
        Array of shape (number of samples, dim[0], dim[1]) with the samples
        read from ``samples_path + ID + ".npy"``.
    y : numpy.ndarray
        Integer array with the class of every sample, in the same order.
    """
    sample_ids = cv_train_val_test_dict[val_or_test]
    n_samples = len(sample_ids)

    # Pre-allocate the buffers and fill them row by row.
    X = np.empty((n_samples, dim[0], dim[1]))
    y = np.empty(n_samples, dtype=int)

    for row, sample_id in enumerate(sample_ids):
        X[row] = np.load(samples_path + sample_id + ".npy")
        y[row] = labels[sample_id]

    return X, y

In [None]:
#generate batches of data
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads samples from disk one batch at a time.

    Every sample is read from ``samples_path + ID + ".npy"`` (``samples_path``
    is a module-level setting) and its class is looked up in ``labels``.
    Only complete batches are produced: samples that do not fill a whole
    batch at the end of the (possibly shuffled) order are left out of that
    epoch.
    """

    def __init__(self, list_IDs, labels, batch_size = 32, dim = (2500,), n_channels = 8, n_classes=2, shuffle = True):
        """Store the configuration and create the first sample order.

        Parameters
        ----------
        list_IDs : list
            IDs of the samples this generator iterates over.
        labels : dict
            Maps a sample ID to its integer class.
        batch_size : int
            Number of samples per batch.
        dim : tuple
            Shape of one sample without the channel axis.
        n_channels : int
            Size of the trailing channel axis.
        n_classes : int
            Number of classes used for the one-hot encoding.
        shuffle : bool
            Whether to reshuffle the sample order after every epoch.
        """
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of complete batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, one-hot y)``."""
        # Positions (in the current order) that belong to this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample IDs.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it if shuffling is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the samples of ``list_IDs_temp`` and one-hot encode their classes."""
        # Size the buffers by the IDs actually requested so that no
        # uninitialised rows can be returned.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        y = np.empty(n_samples, dtype = int)

        for i, ID in enumerate(list_IDs_temp):
            #store sample
            X[i,] = np.load(samples_path + ID +".npy")

            #store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes = self.n_classes)

In [None]:
#calculate class imbalance in original training set to keep same model
def calculate_original_weights(train_val_test_dict):
    """Derive class weights from the class counts of the "train" split.

    The more frequent class gets weight 1 and the less frequent class gets
    the ratio majority / minority; equal counts give weight 1 for both.
    Classes are looked up in the module-level ``labels`` dictionary.

    Parameters
    ----------
    train_val_test_dict : dict
        Must contain the key "train" with a list of sample IDs.

    Returns
    -------
    dict
        ``{0: weight of class 0, 1: weight of class 1}``.

    Raises
    ------
    ZeroDivisionError
        If exactly one of the two classes has no sample in the split.
    """
    train_ids = train_val_test_dict["train"]
    zeroes = sum(1 for ID in train_ids if labels[ID] == 0)
    ones = sum(1 for ID in train_ids if labels[ID] == 1)

    if ones == zeroes:
        return {0: 1., 1: 1.}
    if ones < zeroes:
        return {0: 1., 1: zeroes/ones}
    return {0: ones/zeroes, 1: 1.}

In [None]:
def model_training(cv_round, cv_train_val_test_dict, labels, class_weights, X_val, y_val, cv_trained_path):
    """Train one cross-validation model and save it.

    The architecture is read from ``archi_path + model_name`` (module-level
    settings), compiled with categorical cross-entropy, the Adam optimizer and
    the accuracy metric, and trained for at most 20 epochs. Training stops
    early when the validation loss has not improved for 5 epochs, in which
    case the best weights are restored. The wall-clock time is printed before
    and after training.

    Parameters
    ----------
    cv_round : int
        Index of the cross-validation round; used in the file name.
    cv_train_val_test_dict : dict
        Must contain the key "train" with the training sample IDs.
    labels : dict
        Maps a sample ID to its class.
    class_weights : dict
        Class weights forwarded to ``model.fit``.
    X_val, y_val : array-like
        Validation inputs and targets forwarded to ``model.fit``.
    cv_trained_path : str
        Path prefix; the model is saved to
        ``cv_trained_path + str(cv_round) + ".json"``.

    Returns
    -------
    The trained model.
    """
    #define parameters
    params = {"dim" : (2500,),
             "batch_size": 32,
             "n_classes": 2,
             "n_channels":8,
             "shuffle" :True}

    #Generators
    training_generator = DataGenerator(cv_train_val_test_dict["train"], labels, **params)

    #load model architecture
    with open(archi_path + model_name, "r") as f:
        model_loaded = json.load(f)
    model = keras.models.model_from_json(model_loaded)

    #set metrics
    metric = ["accuracy"]
    model.compile(loss="categorical_crossentropy", optimizer = "adam", metrics = metric)

    #print start time
    print(datetime.now().strftime("%H:%M:%S"))

    #train; callbacks are passed as a list, the documented form for model.fit
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 5, restore_best_weights = True)
    model.fit(training_generator,
              validation_data = (X_val, y_val),
              epochs = 20,
              class_weight = class_weights,
              callbacks = [early_stopping],
              verbose = True)

    #print end time
    print(datetime.now().strftime("%H:%M:%S"))

    #save the model under the same name that the loading branch of cross_val expects
    # NOTE(review): how model.save treats a ".json" suffix depends on the
    # installed Keras version - confirm before upgrading.
    cv_path = cv_trained_path + str(cv_round) + ".json"
    model.save(cv_path)

    return model
    

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def performance_metrics(y_true, y_pred, y_proba, metrics):
    """Print binary classification metrics and append them to ``metrics``.

    Parameters
    ----------
    y_true : array-like
        True classes, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted classes, encoded as 0 and 1.
    y_proba : array-like
        Predicted scores for class 1, used for the Brier score and the
        ROC / precision-recall curves.
    metrics : list
        Receives one new row: [tn, fp, fn, tp, matthews, precision, recall,
        specificity, fnr, fpr, accuracy, f1, auc, auprc, brier].

    Returns
    -------
    None
        Ratios with a zero denominator are reported as NaN.
    """
    # Fix the label set so the matrix is always 2x2, even when only one class
    # occurs, and cast to int so that 0.0/1.0 encoded targets index correctly.
    conf_mat = confusion_matrix(np.asarray(y_true).astype(int),
                                np.asarray(y_pred).astype(int),
                                labels=[0, 1])
    print("Confusion matrix: ")
    print(conf_mat)
    # Python ints have arbitrary precision, so the product of the four
    # marginals in the MCC denominator cannot overflow.
    tn, fp, fn, tp = (int(count) for count in conf_mat.ravel())
    print("tn: ", tn," fp: ", fp," fn: ", fn," tp: ", tp)

    print("")
    mcc_denominator = math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    matthews = _safe_ratio((tp*tn) - (fp*fn), mcc_denominator)
    print("Matthews Correlation Coefficient: ", matthews)

    print("")
    print(classification_report(y_true, y_pred))

    print("")
    precision_bis = _safe_ratio(tp, tp+fp) #positive predictive value
    recall_bis = _safe_ratio(tp, tp+fn)
    f1 = _safe_ratio(2*precision_bis*recall_bis, precision_bis+recall_bis)
    specificity = _safe_ratio(tn, tn+fp) #true negative rate
    fnr = _safe_ratio(fn, fn+tp)
    accuracy = _safe_ratio(tp+tn, tp+tn+fp+fn)
    FPR = _safe_ratio(fp, fp+tn)

    print("precision/positive predictive value: ", precision_bis)
    print("recall/sensitivity: ", recall_bis)
    print("specificity/true negative rate: ", specificity)
    print("False negative rate: ", fnr)
    print("False positive rate: ", FPR)
    print("accuracy: ", accuracy)
    print("f1 score: ", f1)


    print("")
    brier = brier_score_loss(y_true, y_proba)
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc_coef = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    auprc = auc(recall, precision)
    print("brier score: ", brier )
    print("auc: ", auc_coef)
    print("auprc: ", auprc)

    metrics.append([tn, fp, fn, tp, matthews, precision_bis, recall_bis, specificity, fnr, FPR, accuracy, f1, auc_coef, auprc, brier])

    return


#predicts on cv test set and gets performance stats
def predictions(model, X_test, y_test, metrics):
    """Predict on the test set and record the resulting performance metrics.

    Parameters
    ----------
    model
        Object with a ``predict`` method returning one score column per class.
    X_test : array-like
        Test inputs passed to ``model.predict``.
    y_test : array-like
        One-hot encoded targets; column 1 marks the positive (BrS) class.
    metrics : list
        Receives the metrics row appended by ``performance_metrics``.
    """
    class_scores = model.predict(X_test)

    # Column 1 of the one-hot targets is 1 for BrS and 0 otherwise.
    true_classes = y_test[:, 1]

    # Score of the positive class and the class with the highest score.
    positive_scores = class_scores[:, 1]
    predicted_classes = class_scores.argmax(axis=-1)

    performance_metrics(true_classes, predicted_classes, positive_scores, metrics)

    return

In [None]:
#iterates over crossval sets, makes train, val and test sets
#trains model
def cross_val(cv_samples, n_folds, labels, set_weights, train_val_test_dict, cv_trained_path):
    """Run the n-fold cross-validation and collect the metrics of every round.

    In round ``r`` fold ``r`` is held out: half of it (drawn with
    ``random.sample``) becomes the validation set, the other half the test
    set, and all remaining folds form the training set. Depending on the
    module-level flag ``train_models`` the model is trained, or loaded from
    ``cv_trained_path + str(r) + ".json"``; it is then evaluated on the test
    set.

    Parameters
    ----------
    cv_samples : list
        Sample IDs to split into folds.
    n_folds : int
        Number of folds and therefore of rounds.
    labels : dict
        Maps a sample ID to its class.
    set_weights : bool
        If true, use the class weights of the original training split;
        otherwise weight both classes equally.
    train_val_test_dict : dict
        Original split, used only to compute the class weights.
    cv_trained_path : str
        Path prefix of the stored cross-validation models.

    Returns
    -------
    list
        One metrics row per round, as appended by ``performance_metrics``.
    """
    #make n different sets from data
    cv_sets = cv_splits(cv_samples, n_folds)

    #calculate weights
    if set_weights:
        class_weights = calculate_original_weights(train_val_test_dict)
    else:
        class_weights = {0: 1., 1: 1.}

    #declare list to store performance metrics
    metrics = []

    #cross validation, n rounds, train or load model & calculate performance metrics
    for cv_round in range(n_folds):
        held_out = cv_sets[cv_round]

        #half of the samples in the held-out fold go to val
        val = random.sample(held_out, round(len(held_out) * 0.5))

        #the rest of the fold goes to test; iterating over the fold itself
        #(instead of a set difference) keeps the order reproducible
        val_ids = set(val)
        test = [sample for sample in dict.fromkeys(held_out) if sample not in val_ids]

        #all other folds become train
        train = []
        for fold_idx in range(n_folds):
            if fold_idx != cv_round:
                train.extend(cv_sets[fold_idx])

        cv_train_val_test_dict = {"train": train, "val": val, "test": test}

        if train_models:
            #generate val set and save to memory
            X_val, y_val = set_generation("val", cv_train_val_test_dict, labels, (2500, 8))
            y_val = keras.utils.to_categorical(y_val, 2)

            #train model
            model = model_training(cv_round, cv_train_val_test_dict, labels, class_weights, X_val, y_val, cv_trained_path)

            #free memory
            del X_val, y_val
            gc.collect()

        else:
            #load trained model
            cv_path = cv_trained_path + str(cv_round) + ".json"
            model = keras.models.load_model(cv_path)

        #generate test set and save to memory
        X_test, y_test = set_generation("test", cv_train_val_test_dict, labels, (2500, 8))
        y_test = keras.utils.to_categorical(y_test, 2)

        #predict and get performance metrics
        predictions(model, X_test, y_test, metrics)

        #free memory
        del X_test, y_test
        gc.collect()

    return metrics
    

In [None]:
# Run the cross-validation; metrics holds one row of performance values per round.
metrics = cross_val(cv_samples, n_folds, labels, set_weights, train_val_test_dict, cv_trained_path)


In [None]:
#calculates 95% CI
def conf_int(metrics):
    """Compute mean, standard error and 95% confidence interval per column.

    The interval is ``mean +/- t * se`` with the two-sided 95% quantile of
    Student's t distribution. The degrees of freedom are taken from the
    number of rows actually passed in (rows - 1).

    Parameters
    ----------
    metrics : array-like
        Two-dimensional: one row per cross-validation round, one column per
        metric.

    Returns
    -------
    means : list
        Mean of every column.
    ses : list
        Standard error of every column (sample standard deviation divided by
        the square root of the number of rows).
    conf_ints : list
        ``[lower, upper]`` bounds for every column.
    """
    metrics = np.array(metrics)
    n_rounds = metrics.shape[0]

    # The quantile only depends on the number of rows, so compute it once.
    t_value = stats.t.ppf(1-0.025, n_rounds-1)

    means = []
    ses = []
    conf_ints = []

    for col in range(metrics.shape[1]):
        column = metrics[:, col]
        mean = np.mean(column)
        se = np.std(column, ddof = 1)/np.sqrt(n_rounds)

        means.append(mean)
        ses.append(se)
        conf_ints.append([mean - t_value * se, mean + t_value * se])

    return means, ses, conf_ints

In [None]:
#tn, fp, fn, tp, matthews, precision_bis, recall_bis, specificity, fnr, fpr, accuracy, f1, auc_coef, auprc, brier
mean, se, conf_int = conf_int(metrics)

In [None]:
mean

In [None]:
se

In [None]:
#tn, fp, fn, tp, matthews, precision_bis, recall_bis, specificity, fnr, fpr, accuracy, f1, auc_coef, auprc, brier
conf_int

In [None]:
# save data to csv: one row per cross-validation round
metrics = np.array(metrics)
# newline="" lets the csv module control the line endings (avoids blank rows
# on some platforms); the context manager closes the file even on errors.
with open(cv_t_test_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(metrics)