# Imports

In [None]:
import numpy as np
import pandas as pd
import json
import os
import glob
import random
import gc
import keras
import mcfly
from keras import backend as K
import tensorflow as tf
from tensorflow.keras import layers, models
from keras.utils.vis_utils import plot_model
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, concatenate
import tensorflow_addons as tfa
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import auc, roc_curve, precision_recall_curve, brier_score_loss
from sklearn.calibration import calibration_curve
from datetime import datetime 
from matplotlib import pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import math
from collections import Counter 

In [None]:
#choose model to load and train
ecg_model_name = "Model6.json"
prs_model_name = "PRS1"
comb_idx = str(8) #iteration of combination of ECG model x and PRS model y

#set to True if want to train the model (trained model will be saved), False to load trained model
train_model = False

#set to true if modify and save the PRS model architecture
save_PRS = False 
#set to true if want to modify and save combined model architecture
save_combined = False
#set to true if want to save plots
save_plots = False

#set to True if want to add weights to loss function based on training class imbalance
add_weights = False
#set to True if want to force val to have same class imbalance as train
balance_val = False

#set number of epochs and patience
n_epochs = 100
n_patience = 3

"""
#path to file with indexes of files split into training, val and test
split_path = "Removed_no_genetic/Final_Indep_Data/split/train_val_test.json"

#paths to the labels and the data
labels_path = "Removed_no_genetic/Final_Indep_Data/labels/labels.npy"
samples_path = "Removed_no_genetic/Final_Indep_Data/samples/"

"""

#path to file with indexes of files split into training, val and test
split_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/split/train_val_test.json"

#paths to the labels and the data
labels_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/labels/labels.npy"
samples_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/samples/"


#paths to store and retrieve model types, architectures 
archi_path = "Removed_no_genetic/Data_Fusion/architecture/architecture_"
type_path = "Removed_no_genetic/Data_Fusion/type/type_"

#path to store trained model
combined_model_name = ecg_model_name.split(".", 1)[0] + "_"+ prs_model_name + "_"+ comb_idx + ".json"
trained_path = "Removed_no_genetic/Data_Fusion/trained/" + combined_model_name

#path to save plots
plot_path = "Removed_no_genetic/Data_Fusion/plots/" + combined_model_name.split(".", 1)[0] + "/"
test_plot_path = "Removed_no_genetic/Data_Fusion/test_plots/" + combined_model_name.split(".", 1)[0] + "/"

#path to BRS PRS
genetic_data_path = "GeneticData/available_BRS_PRS.txt"

#set the seed 
random.seed(0) #generation of train, val, test sets
np.random.seed(0) #mcfly models
tf.random.set_seed(0) 

# Create both output directories up front: validation figures are written to
# plot_path and test-set figures to test_plot_path. exist_ok avoids the
# check-then-create race of os.path.exists + os.makedirs.
for _plot_dir in (plot_path, test_plot_path):
    os.makedirs(_plot_dir, exist_ok=True)

# Dictionary with sample id and label

In [None]:
labels_array = np.load(labels_path)

# Each row of the array is (sample id, class); keep the class as a plain int.
labels = {row[0]: int(row[1]) for row in labels_array}

# The raw array is no longer needed once the dictionary exists.
del labels_array
gc.collect()

#  PRS data

In [None]:
PRS = pd.read_csv(genetic_data_path, header=0, sep=",")

# Train, val, test split

In [None]:
#to open dictionary
with open(split_path, "r") as fp:
    train_val_test_dict = json.load(fp)    

In [None]:
#getting PRS statistics for normalisation
# Only training patients contribute, so the val/test scores never leak into
# the mean/std used for z-scoring.
train_ecg_ids = train_val_test_dict["train"]
# Sample ids are split on "_" and the first part is the integer patient id;
# dict.fromkeys removes repeated patients while keeping first-seen order.
train_ids = list(dict.fromkeys(int(elem.split("_")[0]) for elem in train_ecg_ids))

# One indexed lookup instead of one full DataFrame scan per patient.
# .loc raises a KeyError naming any training patient without a PRS row.
# NOTE(review): assumes one row per anonymous_id in the PRS file - confirm.
PRS_train_for_norm = PRS.set_index("anonymous_id")["SCORE"].loc[train_ids].to_numpy(dtype=float)
train_PRS_mean = np.mean(PRS_train_for_norm)
train_PRS_std = np.std(PRS_train_for_norm, ddof=0)

print("mean train PRS: ", train_PRS_mean , ", standard dev of train PRS: ", train_PRS_std)

In [None]:
#function to create validation set and store in memory
def set_generation(val_or_test, train_val_test_dict, labels, PRS, train_PRS_mean, train_PRS_std, dim = (2500, 8)):
    """Load one complete split (ECG arrays, PRS feature, class) into memory.

    Parameters
    ----------
    val_or_test : key of ``train_val_test_dict`` naming the split to load.
    train_val_test_dict : dict mapping split name to a list of sample ids;
        the part of each id before the first "_" is the integer patient id.
    labels : dict mapping sample id to its integer class.
    PRS : DataFrame with the columns "anonymous_id" and "SCORE".
    train_PRS_mean, train_PRS_std : statistics used to z-score the PRS.
    dim : shape of a single ECG sample as stored in its .npy file.

    Returns
    -------
    (X_ECG, X_PRS, y) with shapes (n, dim[0], dim[1]), (n,) and (n,).

    Raises
    ------
    KeyError if a sample's patient has no row in ``PRS``.

    The ECG files are read from the module-level ``samples_path``.
    """
    sample_ids = train_val_test_dict[val_or_test]
    n_samples = len(sample_ids)

    #Initialise
    X_ECG = np.empty((n_samples, dim[0], dim[1]))
    X_PRS = np.empty((n_samples), dtype = float)
    y = np.empty((n_samples), dtype = int)

    # Build the patient -> score lookup once instead of filtering the whole
    # DataFrame for every sample. If a patient appears on several rows the
    # last one is used.
    score_by_patient = dict(zip(PRS["anonymous_id"], PRS["SCORE"]))

    #Generate data
    for i, ID in enumerate(sample_ids):
        p_id = int(ID.split("_", 1)[0])

        #store ECG sample
        X_ECG[i,] = np.load(samples_path + ID +".npy")

        #store PRS sample: z-score rounded to the nearest 0.5
        if p_id not in score_by_patient:
            raise KeyError("no PRS score for patient {} (sample {})".format(p_id, ID))
        z_score = (score_by_patient[p_id] - train_PRS_mean) / train_PRS_std
        X_PRS[i] = np.round(z_score * 2) / 2

        #store class
        y[i] = labels[ID]

    return X_ECG, X_PRS, y

In [None]:
X_val_ECG, X_val_PRS, y_val = set_generation("val", train_val_test_dict, labels, PRS, train_PRS_mean, train_PRS_std, (2500, 8))
y_val = keras.utils.to_categorical(y_val, 2)

# Checks

In [None]:
print("train samples: ", len(train_val_test_dict["train"]))
print("val samples: ", len(train_val_test_dict["val"]))
print("test samples: ", len(train_val_test_dict["test"]))
print("total samples: ", len(train_val_test_dict["train"]) + len(train_val_test_dict["val"]) + len(train_val_test_dict["test"]))

In [None]:
#data proportions
n_train = len(train_val_test_dict["train"])
n_val = len(train_val_test_dict["val"])
n_test = len(train_val_test_dict["test"])
n_tot = n_train + n_val + n_test
print("proportion of train, val, test")
print(n_train*100/n_tot, n_val*100/n_tot, n_test*100/n_tot)

# Match val class imbalance to train's

In [None]:
def calculate_imbalance(data):
    """Count the two classes from the id prefix and return their imbalance.

    Ids starting with "1" are counted as negatives and ids starting with "2"
    as positives; any other prefix is ignored.

    Returns
    -------
    (neg, pos, ratio) where ratio is majority count / minority count, so it
    is always >= 1. When the minority class is empty the ratio is ``inf``
    (or ``nan`` if both classes are empty) instead of raising
    ZeroDivisionError.
    """
    pos = 0
    neg = 0

    for elem in data:
        if elem.startswith("1"):
            neg += 1
        elif elem.startswith("2"):
            pos += 1

    majority, minority = max(neg, pos), min(neg, pos)
    if minority == 0:
        ratio = float("inf") if majority else float("nan")
    else:
        ratio = majority / minority

    return neg, pos, ratio

neg_train, pos_train, imb_train = calculate_imbalance(train_val_test_dict["train"])
neg_val, pos_val, imb_val = calculate_imbalance(train_val_test_dict["val"])


if balance_val: 
    # imb_train is majority/minority in train. Samples are removed from the
    # val majority class until val shows the same ratio.
    if neg_train >= pos_train:
        n_remove_from_val = round(neg_val - imb_train * pos_val)       
        val_majority = [i for i in train_val_test_dict["val"] if i.startswith('1')]   
    else: 
        n_remove_from_val = round(pos_val - imb_train * neg_val)        
        val_majority = [i for i in train_val_test_dict["val"] if i.startswith('2')]        

    # random.sample rejects a negative or too-large sample size; a negative
    # value means val is already less imbalanced than train, so nothing can
    # be removed from the majority class to match it.
    if n_remove_from_val < 0:
        print("val is already less imbalanced than train; no samples removed")
    n_remove_from_val = max(0, min(n_remove_from_val, len(val_majority)))

    removed_samples = set(random.sample(val_majority, n_remove_from_val))
    # Filter in place of a set difference so that the remaining ids keep
    # their original (reproducible) order.
    train_val_test_dict["val"] = [i for i in train_val_test_dict["val"] if i not in removed_samples]

    print("Remaining samples in new val: ", len(train_val_test_dict["val"]))                                 
    print("number of samples in new val majority class, number of samples in new val minority class, class imbalance: ")
    print(calculate_imbalance(train_val_test_dict["val"]))

    #new data proportions
    n_train = len(train_val_test_dict["train"])
    n_val = len(train_val_test_dict["val"])
    n_test = len(train_val_test_dict["test"])
    n_tot = n_train + n_val + n_test

    print("proportion of data in train, new val, test: ")
    print(n_train*100/n_tot, n_val*100/n_tot, n_test*100/n_tot)

    # Reload the in-memory validation arrays so they match the reduced id list.
    X_val_ECG, X_val_PRS, y_val = set_generation("val", train_val_test_dict, labels, PRS, train_PRS_mean, train_PRS_std, (2500, 8)) 
    y_val = keras.utils.to_categorical(y_val, 2)     

# Data Loader

In [None]:
class DataGenerator(keras.utils.Sequence):    
    """Keras Sequence that yields ``([X_ECG, X_PRS], one_hot_y)`` batches.

    ECG samples are read from ``samples_path + ID + ".npy"`` and the PRS
    score of the sample's patient is taken from the module-level ``PRS``
    DataFrame, z-scored with the statistics given to the constructor and
    rounded to the nearest 0.5.

    Parameters
    ----------
    list_IDs : sample ids; the part before the first "_" is the patient id.
    labels : dict mapping sample id to its integer class.
    train_PRS_mean, train_PRS_std : statistics used to z-score the PRS.
    batch_size : number of samples per batch.
    dim_ECG, n_channels_ECG : an ECG batch has shape
        (batch_size, *dim_ECG, n_channels_ECG).
    dim_PRS, n_channels_PRS : a PRS batch has shape (batch_size, *dim_PRS),
        with a trailing n_channels_PRS axis only when n_channels_PRS > 1.
    n_classes : width of the one-hot target.
    shuffle : reshuffle the sample order at the end of every epoch.
    seed : seed of the private numpy random generator used for shuffling.
    """

    def __init__(self, list_IDs, labels, train_PRS_mean, train_PRS_std, batch_size = 32, dim_ECG = (2500,), n_channels_ECG = 8, dim_PRS = (1,), n_channels_PRS = 1, n_classes=2, shuffle = True, seed = None):
        #"Initialization"
        super().__init__()
        self.dim_ECG = dim_ECG
        self.dim_PRS = dim_PRS
        self.batch_size = batch_size
        self.labels = labels
        self.train_PRS_mean = train_PRS_mean
        self.train_PRS_std = train_PRS_std        
        self.list_IDs = list_IDs
        self.n_channels_ECG = n_channels_ECG
        self.n_channels_PRS = n_channels_PRS
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.rng = np.random.default_rng(seed=seed)
        # patient id -> raw score, built once so that a batch does not scan
        # the whole DataFrame for each of its samples
        self._score_by_patient = dict(zip(PRS["anonymous_id"], PRS["SCORE"]))
        self.on_epoch_end()      

    def __len__(self):
        #number of batches per epoch (a trailing partial batch is dropped)
        return int(np.floor(len(self.list_IDs)/self.batch_size))

    def __getitem__(self, index):
        #Generates indexes of one batch of data
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        #find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        #Generate data
        X, y = self.__data_generation(list_IDs_temp)        
        return X, y

    def on_epoch_end(self):
        #updates indexes after each epoch
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            self.rng.shuffle(self.indexes)

    def _normalised_prs(self, p_id):
        #z-score of the patient's PRS, rounded to the nearest 0.5
        if p_id not in self._score_by_patient:
            raise KeyError("no PRS score for patient {}".format(p_id))
        # use the statistics handed to this generator, not the notebook globals
        z_score = (self._score_by_patient[p_id] - self.train_PRS_mean) / self.train_PRS_std
        return np.round(z_score * 2) / 2

    def __data_generation(self, list_IDs_temp):
        #Generates data containing batch_size samples

        #Initialise
        X_ECG = np.empty((self.batch_size, *self.dim_ECG, self.n_channels_ECG))

        if self.n_channels_PRS > 1:
            X_PRS = np.empty((self.batch_size, *self.dim_PRS, self.n_channels_PRS))
        else:  
            X_PRS = np.empty((self.batch_size, *self.dim_PRS))

        y = np.empty((self.batch_size), dtype = int)

        for i, ID in enumerate(list_IDs_temp):
            #store sample
            X_ECG[i,] = np.load(samples_path + ID +".npy")

            p_id = int(ID.split("_", 1)[0])  

            #the scalar broadcasts over whatever per-sample PRS shape was allocated
            X_PRS[i] = self._normalised_prs(p_id)

            #store class
            y[i] = self.labels[ID]         

        return [X_ECG, X_PRS], keras.utils.to_categorical(y, num_classes = self.n_classes)
            

# Load ECG Model

In [None]:
with open(archi_path + ecg_model_name, "r") as f:
    model_loaded = json.load(f)
    model = keras.models.model_from_json(model_loaded)

In [None]:
with open(type_path + ecg_model_name, "r") as f:
    model_type = json.load(f)    
    print(model_type)

In [None]:
model.summary()

In [None]:
ECG_model = Model(inputs = model.input, outputs = model.output)

# Make PRS Model

In [None]:
PRS_input = Input(shape = (1,))
x = Dense(1, activation = "relu")(PRS_input)
PRS_model = Model(PRS_input, x)

In [None]:
if save_PRS: 
    with open(archi_path + prs_model_name + ".json", "w") as f:
                    json.dump(PRS_model.to_json(), f)

In [None]:
with open(archi_path + prs_model_name + ".json", "r") as f:
    prs_loaded = json.load(f)
    prs = keras.models.model_from_json(prs_loaded)
    prs.summary()

# Make combined Model

In [None]:
if save_combined: 
    combined_input = concatenate([ECG_model.output, PRS_model.output])
    z = Dense(3, activation = "relu")(combined_input)
    z = Dense(2, activation = "softmax")(z)
    combined_model = Model(inputs = [ECG_model.inputs, PRS_model.input], outputs = z)
    
    with open(archi_path + combined_model_name, "w") as f:
                    json.dump(combined_model.to_json(), f)
            
    plot_model(combined_model, to_file = plot_path + "combined_model.png", show_shapes = True, show_layer_names = True)

In [None]:
with open(archi_path + combined_model_name, "r") as f:
    comb_loaded = json.load(f)
    combined_model = keras.models.model_from_json(comb_loaded)
    combined_model.summary()

# Train / Load 

In [None]:
if train_model: 
    #define parameters passed to the training DataGenerator
    params = {"dim_ECG" : (2500,),
              "dim_PRS" : (1,),
             "batch_size": 32,
             "n_classes": 2,
             "n_channels_ECG":8,
             "n_channels_PRS":1,
             "shuffle" :True,
             "seed": 0}

    #Generators 
    training_generator = DataGenerator(train_val_test_dict["train"], labels, train_PRS_mean, train_PRS_std, **params)  

    metric = ["accuracy"]
    combined_model.compile(loss="categorical_crossentropy", optimizer = "adam", metrics = metric)

    # Default: both classes contribute equally to the loss.
    class_weights = {0: 1., 1: 1.}

    if add_weights:      
        #calculate class imbalance on the training ids
        label_counts = Counter(labels[ID] for ID in train_val_test_dict["train"])
        zeroes = label_counts[0]
        ones = label_counts[1]

        if zeroes == 0 or ones == 0:
            # a ratio is undefined when one class is absent; keep equal weights
            print("add_weights ignored: one class has no training samples")
        elif ones < zeroes:
            # up-weight the minority class by the majority/minority ratio
            class_weights = {0: 1., 1: zeroes/ones}
        elif zeroes < ones:
            class_weights = {0: ones/zeroes, 1: 1.}

    #print time    
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(current_time)

    #train; early stopping restores the weights of the best val_loss epoch
    callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = n_patience, restore_best_weights = True)
    history = combined_model.fit(training_generator,
              validation_data = ([X_val_ECG, X_val_PRS ], y_val), 
              epochs = n_epochs,
              class_weight = class_weights, 
              callbacks = [callback],
              verbose = True)

    #print time
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print(current_time)

    #save the model
    combined_model.save(trained_path)

else:
    #load the model
    combined_model = keras.models.load_model(trained_path)

# Predictions

In [None]:
pred_probas = combined_model.predict([X_val_ECG, X_val_PRS])

In [None]:
#no BrS would appear as 0, hence transformed to [1,0] => the first column returns 1 if no BrS, 0 otherwise
no_BrS = y_val[:, 0]

#BrS appears as 1, hence transformed to [0,1] => the second column returns 1 if BrS, 0 otherwise
BrS = y_val[:,1]

# Performance

In [None]:
BrS_probas = pred_probas[:,1]
BrS_predictions = pred_probas.argmax(axis = -1)
BrS_predictions

In [None]:
def performance_metrics(y_true, y_pred, y_proba):
    """Print binary classification metrics for hard and soft predictions.

    Parameters
    ----------
    y_true : true 0/1 labels.
    y_pred : predicted 0/1 labels.
    y_proba : predicted score of the positive class.

    Prints the confusion matrix, Matthews correlation coefficient, the
    sklearn classification report, precision, recall, specificity, false
    negative/positive rate, accuracy, F1, Brier score, ROC AUC and the area
    under the precision-recall curve. Returns None.

    Rates whose denominator is zero are reported as nan; the Matthews
    coefficient is reported as 0.0 when its denominator is zero.
    """
    def _ratio(numerator, denominator):
        # nan instead of an exception/warning when a rate is undefined
        return numerator / denominator if denominator else float("nan")

    # labels=[0, 1] guarantees a 2x2 matrix even if only one class occurs
    conf_mat = confusion_matrix(y_true, y_pred, labels = [0, 1])
    print("Confusion matrix: ")
    print(conf_mat)
    # Python ints: the four-factor product below can exceed the int64 range
    tn, fp, fn, tp = (int(count) for count in conf_mat.ravel())
    print("tn: ", tn," fp: ", fp," fn: ", fn," tp: ", tp)

    print("")
    mcc_denominator = math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    matthews = ((tp*tn) - (fp*fn)) / mcc_denominator if mcc_denominator else 0.0
    print("Matthews Correlation Coefficient: ", matthews)

    print("")
    print(classification_report(y_true, y_pred))

    print("")           
    precision_bis = _ratio(tp, tp+fp) #positive predictive value
    recall_bis = _ratio(tp, tp+fn)
    f1 = _ratio(2*precision_bis*recall_bis, precision_bis+recall_bis)
    specificity = _ratio(tn, tn+fp) #true negative rate
    fnr = _ratio(fn, fn+tp)
    FPR = _ratio(fp, fp+tn)
    accuracy = _ratio(tp+tn, tp+tn+fp+fn)

    print("precision/positive predictive value: ", precision_bis)
    print("recall/sensitivity: ", recall_bis)
    print("specificity/true negative rate: ", specificity)
    print("False negative rate: ", fnr)
    print("False positive rate: ", FPR)
    print("accuracy: ", accuracy)    
    print("f1 score: ", f1) 

    print("")
    brier = brier_score_loss(y_true, y_proba)
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc_coef = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    auprc = auc(recall, precision)
    print("brier score: ", brier )
    print("auc: ", auc_coef)
    print("auprc: ", auprc)

    return

In [None]:
performance_metrics(BrS, BrS_predictions, BrS_probas)

# Plots

In [None]:
if train_model:
    #plot train and validation loss
    training_loss = history.history["loss"]
    validation_loss = history.history["val_loss"]

    #number of epochs
    epoch_count = range(1, len(training_loss) +1)

    #visualise loss history
    f, ax = plt.subplots(figsize=(6,6))      
    ax.plot(epoch_count, training_loss, "r--", label="Training Loss")
    ax.plot(epoch_count, validation_loss, "b-", label="Validation Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title("Training and Validation Loss Over the Epochs")
    ax.legend()
    plt.savefig(plot_path + "Loss.png")

In [None]:
#ROC curve
fpr, tpr, thresholds = roc_curve(BrS, BrS_probas)
auc_coef = round(auc(fpr, tpr),3)
f, ax = plt.subplots(figsize=(6,6))
ax.plot(fpr, tpr, marker=".", label = model_type["type"] + " - AUC: " + str(auc_coef))
ax.plot([0,1], [0,1], transform = ax.transAxes, linestyle="--", label="Random Classifier")
ax.set_ylim(bottom=0, top = 1)
ax.set_xlim([0,1])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
#ax.set_title("ROC")
ax.legend()
if save_plots:
    plt.savefig(plot_path + "ROC.png")

In [None]:
#Precision Recall curve
precision, recall, thresholds = precision_recall_curve(BrS, BrS_probas)
auprc = round(auc(recall, precision),3)
f, ax = plt.subplots(figsize=(6,6))
ax.plot(recall, precision, marker=".", label = model_type["type"] + " - AUPRC: " + str(auprc))
ax.set_xlabel("Recall (Positive label: Brugada)")
ax.set_ylabel("Precision (Positive label: Brugada)")
#ax.set_title("AUPRC")
ax.set_ylim(bottom=0, top = 1)
ax.set_xlim([0,1])
ax.legend()

if save_plots:
    plt.savefig(plot_path + "PrecisionRecallCurve.png")

In [None]:
#Calibration
# bin data and normalise counts
def counts_to_percentages(probabilities):
    """Bin scores into ten 0.1-wide bins and return each bin's share in percent.

    The first bin collects everything below 0.1 and the last bin everything
    from 0.9 upwards. The result is an array of ten values summing to 100.
    """
    # Exclusive upper bounds of the first nine bins.
    upper_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    tallies = [0] * 10

    for score in probabilities:
        for slot, bound in enumerate(upper_bounds):
            if score < bound:
                tallies[slot] += 1
                break
        else:
            # not below any bound: belongs to the last bin
            if score >= 0.9:
                tallies[-1] += 1

    return tallies / np.sum(tallies) * 100
    
    
#plot calibration plot and histogram together
def calibration_together (BrS, BrS_probas, plot_path, per_patient = False):        
    """Draw a calibration curve and a score histogram in one two-row figure.

    Parameters
    ----------
    BrS : true 0/1 labels.
    BrS_probas : predicted score of the positive class.
    plot_path : directory prefix (ending in a path separator) for the png.
    per_patient : selects the file name "Calibration_PP.png" instead of
        "Calibration.png".

    The figure is written to disk only when the module-level ``save_plots``
    flag is true. The legend label comes from the module-level
    ``model_type["type"]``. Returns None.
    """
    print("plot curves and save in one png file")

    #both plots go into one figure (and one png file)
    fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(7, 12))   

    #calibration curve: calibration_curve returns (fraction of positives, mean predicted value)
    frac_positive, mean_predicted = calibration_curve(BrS, BrS_probas, n_bins=10)

    # colour is given once; combining a 'C0' format string with color=
    # makes matplotlib warn about a redundant colour definition
    ax1.plot(mean_predicted, frac_positive, marker='o', linewidth=1, label= model_type["type"], color = "darkturquoise") 
    ax1.set(xlabel= 'Predicted score', ylabel= 'True probability in each bin')

    ax1.set_ylim(bottom=-0.2, top = 1.2)
    ax1.set_xlim([-0.2,1.2])
    line = mlines.Line2D([0, 1], [0, 1], color='black', linestyle='--', linewidth=0.9, label= "Perfectly calibrated")
    transform = ax1.transAxes
    line.set_transform(transform)
    ax1.add_line(line)     
    ax1.legend(loc="upper left")  

    #HISTOGRAM
    # One representative value per bin, weighted by that bin's percentage.
    # Bin centres are used so that no value sits exactly on a bin edge.
    bin_centres = np.arange(10) / 10 + 0.05
    bin_percentages = counts_to_percentages(BrS_probas)   #if instead of % want values in [0,1], divide by 100
    ax2.hist(bin_centres, range = [0,1], bins=10, weights = bin_percentages, label= model_type["type"],
                 histtype="step", lw=3.5, color = "darkturquoise")

    ax2.set_xlabel("Mean predicted score")
    ax2.set_ylabel("Percentage of counts")
    ax2.legend(loc="upper center", ncol=5)
    ax2.set_ylim([0,101]) #if instead of % want probabilities, change to [0,1]     

    # Save before plt.show(), which may release the figure.
    if save_plots:
        os.makedirs(plot_path, exist_ok=True)
        file_name = "Calibration_PP.png" if per_patient else "Calibration.png"
        fig.savefig(plot_path + file_name)

    plt.show()

    return

calibration_together(BrS, BrS_probas, plot_path)


In [None]:
#Discrimination
def distribution(BrS, BrS_probas, plot_path, per_patient = False):
    """Plot the density of the predicted scores separately for each true class.

    Parameters
    ----------
    BrS : array of true 0/1 labels.
    BrS_probas : predicted score of the positive class.
    plot_path : directory prefix (ending in a path separator) for the png.
    per_patient : selects the file name "Discrimination_PP.png" instead of
        "Discrimination.png".

    The figure is written to disk only when the module-level ``save_plots``
    flag is true. Returns None.
    """
    #probabilities distributions graphs
    scores = pd.DataFrame(BrS_probas, columns=['Predicted probabilities'])
    scores['labels'] = BrS.tolist()

    # boolean masks replace the copy-then-drop sequence
    true_1 = scores.loc[scores['labels'] != 0, 'Predicted probabilities']
    true_0 = scores.loc[scores['labels'] != 1, 'Predicted probabilities']

    sns.distplot(true_1, hist = False, kde = True,
                 kde_kws = {'shade': True, 'linewidth': 3,"color": "r"}, label = 'Class 1')
    sns.distplot(true_0, hist = False, kde = True,
                     kde_kws = {'shade': True, 'linewidth': 3, "color": "g"}, label = 'Class 0')
    plt.ylabel('Density')
    plt.xlabel('Predicted score')
    # legend entries follow the drawing order: positive class first
    plt.legend(labels=["BrP","No BrP"])

    # Save before plt.show()/plt.clf(), which clear the current figure.
    if save_plots:
        os.makedirs(plot_path, exist_ok=True)
        file_name = "Discrimination_PP.png" if per_patient else "Discrimination.png"
        plt.savefig(plot_path + file_name)

    plt.show()
    plt.clf()    
    return

distribution(BrS, BrS_probas, plot_path)

# Check if overfitting on the PRS
## Check if predictions that were right/wrong came from patients that were also on the training set

In [None]:
train_p_ids = [int(ID.split("_", 1)[0]) for ID in train_val_test_dict["train"]]      

In [None]:
train_p_ids_counts = Counter(train_p_ids) #number of occurences of a patient id in the list

In [None]:
patient_in_train = []
val_ids = train_val_test_dict["val"]

# X_val_ECG / X_val_PRS were generated from this same id list, in the same
# order, so the whole split can be predicted in one call instead of loading
# and predicting every sample separately.
if len(val_ids) != len(X_val_ECG):
    raise ValueError("X_val_ECG is out of sync with train_val_test_dict['val']; regenerate the validation arrays first")

val_pred_probas = combined_model.predict([X_val_ECG, X_val_PRS])
val_predictions = val_pred_probas.argmax(axis = -1)

for ID, predicted in zip(val_ids, val_predictions):
    p_id = int(ID.split("_", 1)[0])

    #True label
    mini_BrS = labels[ID]

    correct_prediction = bool(mini_BrS == int(predicted))
    print(mini_BrS, predicted, correct_prediction)

    # number of training samples from the same patient (0 if the patient is not in train)
    n_train_samples = train_p_ids_counts.get(p_id, 0)
    patient_in_train.append([ID, n_train_samples > 0, n_train_samples, correct_prediction])

In [None]:
patient_in_train = pd.DataFrame(patient_in_train, columns = ["ECG_id", "patient_in_train", "patient_samples_in_train", "correct_prediction"])

In [None]:
#check number of TT/tot in train, TF/tot in train, FT/tot not in train, FF//tot not in train
TT = patient_in_train[patient_in_train['patient_in_train'] & patient_in_train['correct_prediction']].shape[0]
TF = patient_in_train[patient_in_train['patient_in_train'] & (patient_in_train['correct_prediction'] == False)].shape[0]
FT = patient_in_train[(patient_in_train['patient_in_train'] == False) & patient_in_train['correct_prediction']].shape[0]
FF = patient_in_train[~(patient_in_train['patient_in_train'] | patient_in_train['correct_prediction'])].shape[0]

In [None]:
TT+TF+FT+FF

In [None]:
in_train = TT + TF
not_in_train = FT + FF

In [None]:
def _share(part, whole):
    # nan when the group is empty; exact (no epsilon bias) otherwise
    return part / whole if whole else float("nan")

print("proportion of val samples from patients found in train that were correctly classified: ", _share(TT, in_train))
print("proportion of val samples from patients found in train that were incorrectly classified: ", _share(TF, in_train))
print("proportion of val samples from patients NOT found in train that were correctly classified: ", _share(FT, not_in_train))
print("proportion of val samples from patients NOT found in train that were incorrectly classified: ", _share(FF, not_in_train))

# Testing

In [None]:
X_test_ECG, X_test_PRS, y_test = set_generation("test", train_val_test_dict, labels, PRS, train_PRS_mean, train_PRS_std, (2500, 8))
y_test = keras.utils.to_categorical(y_test, 2)

In [None]:
pred_probas = combined_model.predict([X_test_ECG, X_test_PRS])
#BrS appears as 1, hence transformed to [0,1] => the second column returns 1 if BrS, 0 otherwise
BrS = y_test[:,1]
BrS_probas = pred_probas[:,1]
BrS_predictions = pred_probas.argmax(axis = -1)

performance_metrics(BrS, BrS_predictions, BrS_probas)

In [None]:
#ROC curve
fpr, tpr, thresholds = roc_curve(BrS, BrS_probas)
auc_coef = round(auc(fpr, tpr),3)
f, ax = plt.subplots(figsize=(6,6))
ax.plot(fpr, tpr, marker=".", label = model_type["type"] + " - AUC: " + str(auc_coef))
ax.plot([0,1], [0,1], transform = ax.transAxes, linestyle="--", label="Random Classifier")
ax.set_ylim(bottom=0, top = 1)
ax.set_xlim([0,1])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
#ax.set_title("ROC")
ax.legend()
if save_plots:
    plt.savefig(test_plot_path + "ROC.png")

In [None]:
#Precision Recall curve
precision, recall, thresholds = precision_recall_curve(BrS, BrS_probas)
auprc = round(auc(recall, precision),3)
f, ax = plt.subplots(figsize=(6,6))
ax.plot(recall, precision, marker=".", label = model_type["type"] + " - AUPRC: " + str(auprc))
ax.set_xlabel("Recall (Positive label: Brugada)")
ax.set_ylabel("Precision (Positive label: Brugada)")
#ax.set_title("AUPRC")
ax.set_ylim(bottom=0, top = 1)
ax.set_xlim([0,1])
ax.legend()

if save_plots:
    plt.savefig(test_plot_path + "PrecisionRecallCurve.png")

In [None]:
calibration_together(BrS, BrS_probas, test_plot_path)

In [None]:
distribution(BrS, BrS_probas, test_plot_path)

# Testing Per Patient

In [None]:
test = train_val_test_dict["test"].copy()

In [None]:
stripped= []
for p in test:
     stripped.append(p.split("_", 1)[0]) #remove everythin after "_"
stripped = list(dict.fromkeys(stripped))

In [None]:
# test_df is not needed by the loop below; it is kept in case another cell
# refers to it.
test_df = pd.DataFrame(test)
test_df.columns = ["ecg_id"]
test_df = pd.Series(test_df.ecg_id) 
dim = (2500, 8)
# single-sample input buffers reused for every prediction
mini_ecg_x = np.empty((1, dim[0], dim[1]))
mini_prs_x = np.empty((1,1))


file_id_conf_mat = {"TN":[], "TP":[], "FN": [], "FP":[]}
p_id_reprod = {}

# Group the samples by exact patient id (the text before the first "_").
# A substring search would also pick up other patients whose sample ids
# merely contain the same digits.
samples_by_patient = {}
for s in test:
    samples_by_patient.setdefault(s.split("_", 1)[0], []).append(s)

for p in stripped:
    all_samples = samples_by_patient[p]
    tally = {"TN": 0, "FP": 0, "TP": 0, "FN": 0}

    # the PRS feature is the same for all samples of one patient
    patient_scores = PRS[PRS["anonymous_id"] == int(p)]["SCORE"]
    mini_prs_x[0,] = round(((patient_scores - train_PRS_mean) /train_PRS_std)*2)/2

    for s in all_samples:

        mini_ecg_x[0,] = np.load(samples_path + s +".npy")

        #true class: 1 is BrS, 0 is no BrS
        mini_BrS = labels[s] 

        #predict: column 1 of the output is the BrS score
        mini_pred_probas = combined_model.predict([mini_ecg_x, mini_prs_x])
        predicted = int(mini_pred_probas.argmax(axis = -1)[0])

        if mini_BrS == 0:
            outcome = "TN" if predicted == 0 else "FP"
        else:
            outcome = "TP" if predicted == 1 else "FN"

        tally[outcome] += 1
        file_id_conf_mat[outcome].append(s)

    # the patient's label is taken from its last sample
    p_id_reprod[p] = [labels[all_samples[-1]], tally["TN"], tally["FP"], tally["TP"], tally["FN"]]

In [None]:
data = p_id_reprod
reprod = pd.DataFrame.from_dict(data, orient='index',
                       columns=['label', 'TN', 'FP', 'TP', 'FN'])

In [None]:
reprod["number_of_samples"] = reprod["TN"] + reprod["TP"] + reprod["FN"] + reprod["FP"]
reprod["fraction_correct_labels"] = (reprod["TN"] + reprod["TP"]) / reprod["number_of_samples"]
reprod["all_samples_correctly_predicted"] = np.where(reprod["fraction_correct_labels"]== 1, True, False)
reprod

In [None]:
print("number of different negative test patients: ", reprod[reprod["label"]==0].shape[0], " number of negative samples: ",sum(reprod[reprod["label"]==0]["number_of_samples"])) 
print("number negative samples per patient: ", sum(reprod[reprod["label"]==0]["number_of_samples"])/reprod[reprod["label"]==0].shape[0])

In [None]:
print("number of different positive test patients: ", reprod[reprod["label"]==1].shape[0], " number of positive samples: ",sum(reprod[reprod["label"]==1]["number_of_samples"]))
print("number positive samples per patient: ", sum(reprod[reprod["label"]==1]["number_of_samples"])/reprod[reprod["label"]==1].shape[0])

In [None]:
print("number of samples for positive patients")
sum(reprod[reprod["label"]==1]["number_of_samples"])

In [None]:
print("number of samples for negative patients")
sum(reprod[reprod["label"]==0]["number_of_samples"])

## Drop patients with less than two samples

In [None]:
reprod = reprod[reprod["number_of_samples"]>=2]

In [None]:
print("number of different negative test patients: ", reprod[reprod["label"]==0].shape[0], " number of negative samples: ",sum(reprod[reprod["label"]==0]["number_of_samples"])) 
print("number negative samples per patient: ", sum(reprod[reprod["label"]==0]["number_of_samples"])/reprod[reprod["label"]==0].shape[0])

In [None]:
print("number of different positive test patients: ", reprod[reprod["label"]==1].shape[0], " number of positive samples: ",sum(reprod[reprod["label"]==1]["number_of_samples"]))
print("number positive samples per patient: ", sum(reprod[reprod["label"]==1]["number_of_samples"])/reprod[reprod["label"]==1].shape[0])

In [None]:
print("number of samples for positive patients")
sum(reprod[reprod["label"]==1]["number_of_samples"])

In [None]:
print("number of samples for negative patients")
sum(reprod[reprod["label"]==0]["number_of_samples"])

### Fraction of correct labels : within patient agreement

In [None]:
print("General fraction of correct labels")
print(np.mean(reprod["fraction_correct_labels"]), np.median(reprod["fraction_correct_labels"]))

In [None]:
fig = plt.figure(figsize =(10, 7))
ax = fig.add_axes([0, 0, 1, 1]) 
bp = ax.boxplot(reprod["fraction_correct_labels"]) 
ax.set_xticklabels(['All groups'])
plt.title("Distribution of fraction of correct labels")
plt.show()

In [None]:
data_1 = reprod[reprod["label"]==0]["fraction_correct_labels"]
data_2 = reprod[reprod["label"]==1]["fraction_correct_labels"]
df = [data_1, data_2]
fig = plt.figure(figsize =(10, 7)) 
ax = fig.add_axes([0, 0, 1, 1])
# Draw first, label afterwards: the boxplot call creates the two ticks the
# labels belong to (same order as the single-group plot in this notebook).
bp = ax.boxplot(df)
ax.set_xticklabels(['No BrP', 'BrP'])
plt.title("Distribution of fraction of correct labels for positive and negative samples")
plt.show()

In [None]:
print( "mean ratio of correct predictions per patient, positives: ", np.mean(reprod[reprod["label"]==1]["fraction_correct_labels"]),
     ", negatives: ", np.mean(reprod[reprod["label"]==0]["fraction_correct_labels"]))

In [None]:
print("number of samples per patient for patients for which at least one prediction was wrong")
print(np.mean(reprod[reprod["all_samples_correctly_predicted"]==False]["number_of_samples"]), 
      np.median(reprod[reprod["all_samples_correctly_predicted"]==False]["number_of_samples"]))

In [None]:
print("number of samples per patient for patients for which all predictions were right")
print(np.mean(reprod[reprod["all_samples_correctly_predicted"]==True]["number_of_samples"]), 
      np.median(reprod[reprod["all_samples_correctly_predicted"]==True]["number_of_samples"]))

In [None]:
correct = reprod[reprod["all_samples_correctly_predicted"]==True] #patients for which 100% within patient agreement was obtained
incorrect = reprod[reprod["all_samples_correctly_predicted"]==False] #patients for which less than 100% within patient agreement was obtained

In [None]:
print("propotion of samples that reached 100% within patient agreement ", correct.shape[0]/(correct.shape[0]+incorrect.shape[0]))

In [None]:
# Box plots of the number of samples per patient, grouped by whether every
# sample of the patient was predicted correctly and by the true label.
data_1 = correct[correct["label"] == 1]["number_of_samples"]
data_2 = correct[correct["label"] == 0]["number_of_samples"]
data_3 = incorrect[incorrect["label"] == 1]["number_of_samples"]
data_4 = incorrect[incorrect["label"] == 0]["number_of_samples"]

df = [data_1, data_2, data_3, data_4]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_axes([0.1, 0.1, 0.75, 0.75])
ax.boxplot(df)
# Tick labels are set after boxplot() so they attach to the four box ticks it
# creates. The `incorrect` group holds patients with at least one wrong
# prediction, hence ">=1" rather than ">1".
ax.set_xticklabels(["All correct and BrP", "All correct and no BrP", ">=1 incorrect and BrP", ">=1 incorrect and no BrP"])
ax.set_ylabel("Number of samples per patient ")
#plt.title("Distribution of number of samples with respect to whether all samples were correctly classified for one patient per true label")
#fig.savefig(test_plot_path + "BoxPlot_all_correct_at_least_one_wrong_per_class.png")
# plt.show() matches the other plotting cells; Figure.show() warns on non-GUI backends.
plt.show()

### New AUC

In [None]:
# Scatter plot of the fraction of correctly predicted labels against the
# number of samples per patient, one colour per true label.

# Font sizes are configured BEFORE the figure is created: matplotlib reads
# these rc values when axes, ticks and legends are built, so setting them after
# plotting would leave this figure's axis labels and legend at the old sizes.
# NOTE: plt.rc changes are global and also apply to figures created later.
plt.rc("axes", labelsize=20)
plt.rc("legend", fontsize=20)
plt.rc("xtick", labelsize=20)
plt.rc("ytick", labelsize=20)

fig, ax = plt.subplots(figsize=(10, 10))
colors = {0: "green", 1: "red"}
labels = {0: "no BrP", 1: "BrP"}
grouped = reprod.groupby("label")
for key, group in grouped:
    # Each label group is drawn onto the same axes with its own colour and legend entry.
    group.plot(ax=ax, kind="scatter", x="number_of_samples", y="fraction_correct_labels",
               label=labels[key], color=colors[key], s=50)
ax.set(xlabel="Samples available per patient", ylabel="Fraction of correctly predicted labels")
plt.savefig(test_plot_path + "scatter_fraction_correct_per_n_samples.png")




In [None]:
# Build a per-patient probability of BrP from the fraction of correct labels:
#   - positive label (1): probability of BrP is the fraction of correct labels
#   - negative label (0): probability of BrP is 1 - fraction of correct labels
# The column is initialised with NaN (float) rather than "" so that it stays
# numeric; an empty-string placeholder creates a string/object column that the
# float assignments below do not fit and that downstream code must re-cast.
# Rows whose label is neither 0 nor 1 are left as NaN.
reprod["new_probas"] = np.nan
reprod.loc[reprod["label"] == 1, "new_probas"] = reprod["fraction_correct_labels"]
reprod.loc[reprod["label"] == 0, "new_probas"] = 1 - reprod["fraction_correct_labels"]
reprod

In [None]:
# Derive hard predictions from the "new_probas" column and evaluate them
# against the "label" column.
original_labels = reprod["label"]
new_probas = reprod["new_probas"]

# Predict 1 when the probability reaches 0.5, otherwise 0.
new_predictions = [int(proba >= 0.5) for proba in new_probas]
# NOTE(review): presumably performance_metrics (defined outside this section)
# fills this list — confirm against its definition.
metrics = []
performance_metrics(original_labels, new_predictions, new_probas)

In [None]:
# ROC curve of new_probas against original_labels, with the AUC in the legend.
fpr, tpr, thresholds = roc_curve(original_labels, new_probas)
auc_coef = round(auc(fpr, tpr), 3)
f, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, marker=".", label=f'{model_type["type"]} - AUC: {auc_coef}')
# Dashed diagonal, drawn in axes coordinates, as the random-classifier reference.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, linestyle="--", label="Random Classifier")
ax.set(xlabel="False Positive Rate", ylabel="True Positive Rate")
#ax.set_title("ROC")
ax.legend()
#plt.savefig(test_plot_path + "ROC_PP.png")

In [None]:
# Precision-recall curve of new_probas against original_labels, with the area
# under the curve (AUPRC) in the legend.
precision, recall, thresholds = precision_recall_curve(original_labels, new_probas)
auprc = round(auc(recall, precision), 3)
f, ax = plt.subplots(figsize=(6, 6))
ax.plot(recall, precision, marker=".", label=f'{model_type["type"]} - AUPRC: {auprc}')
ax.set(
    xlabel="Recall (Positive label: Brugada)",
    ylabel="Precision (Positive label: Brugada)",
)
#ax.set_title("AUPRC")
ax.set_ylim((0.0, 1.05))
ax.legend()
#plt.savefig(test_plot_path + "AUPRC_PP.png")

In [None]:
# Pass the labels and new_probas, both cast to float, to calibration_together
# (defined outside this section; presumably draws the calibration plot and
# uses test_plot_path as its output location — confirm).
calibration_together(
    original_labels.astype(float),
    new_probas.astype(float),
    test_plot_path,
    per_patient=True,
)

In [None]:
# Pass the labels and the new_probas values (as a NumPy array, keyed by a
# display name) to distribution (defined outside this section; presumably
# plots the distribution of predicted probabilities — confirm).
distribution(
    original_labels,
    {"Predicted probabilities": np.array(new_probas)},
    test_plot_path,
    per_patient=True,
)

In [None]:
#export predictions to csv
#labels_and_predictions_p_sample = pd.DataFrame(list(zip(BrS, BrS_probas)), columns = ["label_per_sample", "prediction_per_sample"])
#labels_and_predictions_p_patient = pd.DataFrame(list(zip(original_labels, new_probas)), columns = ["label_per_sample", "prediction_per_sample"])
#labels_and_predictions_p_sample.to_csv("ecg_prs_predictions_per_sample.csv", index=False)
#labels_and_predictions_p_patient.to_csv("ecg_prs_predictions_per_patient.csv", index=False)