# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mcfly
from sklearn.model_selection import train_test_split
import gc
import os
from mcfly.find_architecture import train_models_on_samples
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from imblearn.over_sampling import SMOTE

%matplotlib widget
%matplotlib inline

# Read Processed File

In [None]:
# Load the preprocessed table produced by an earlier step.
csv_path = "Output/output.csv"
processed = pd.read_csv(csv_path)

# Total count of missing values over the whole frame (shown as the cell output).
processed.isna().sum().sum()

# Split Data into train, validation and test sets

In [None]:
# The first column is the outcome (cast to int); every other column is a feature.
y = processed.iloc[:, 0].astype(int)
X = processed.iloc[:, 1:]

# Drop the name bound to the full frame and run a collection pass.
del processed
gc.collect()

In [None]:
# One outcome value per patient: rows come in runs of 2500 ECG time points,
# so keep the label found on the first row of every run.
samples_per_patient = 2500
first_row_of_each_patient = np.arange(0, X.shape[0], samples_per_patient)
y = y[first_row_of_each_patient]

In [None]:
# Turn the flat table into a 3D array: one element per patient, each holding
# 2500 rows (time points) by one column per lead.
n_timepoints = 2500
n_rows, n_leads = X.shape
X = X.to_numpy().reshape(n_rows // n_timepoints, n_timepoints, n_leads)

In [None]:
# Hold out 20% of the patients (stratified on the outcome), then cut that
# hold-out in half to obtain the test and validation sets.
X_train_imb, X_holdout, y_train_imb, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0, stratify=y_holdout
)
# The intermediate hold-out is only a stepping stone; do not keep it around.
del X_holdout, y_holdout

In [None]:
# The unsplit data is not used again below; drop both names and collect.
del X
del y
gc.collect()

In [None]:
# Report how many samples ended up in each split.
for set_name, subset in (
    ("train set size", X_train_imb),
    ("validation set size", X_val),
    ("test set size", X_test),
):
    print(set_name, subset.shape[0])

In [None]:
# Count class 0 and class 1 in the (still imbalanced) training labels.
train_labels = y_train_imb.tolist()
print("Class imbalance")
for class_value in (0, 1):
    print(train_labels.count(class_value))

# SMOTE on training set

In [None]:
# Collapse the last two axes so that every sample becomes a single flat row;
# the 3D layout is restored after resampling.
nsamples, nx, ny = X_train_imb.shape
print(nsamples, nx, ny)

X_train_imb = np.reshape(X_train_imb, (nsamples, nx * ny))
X_train_imb.shape

In [None]:
# Resample the flattened training split with SMOTE, targeting the minority class.
# Validation and test sets are left untouched.
smote = SMOTE(random_state=0, sampling_strategy="minority")
resampled = smote.fit_resample(X_train_imb, y_train_imb)
X_train, y_train = resampled

In [None]:
# Per-class sample counts after resampling.
class_counts = y_train.value_counts()
print("Class imbalance solved?", class_counts)

In [None]:
# Bring X_train back to 3D: one element per patient, 2500 rows (time points)
# each, with the remaining axis holding the leads.
print("new size of X_train", X_train.shape)
n_train_rows, n_flat_features = X_train.shape
X_train = X_train.reshape(n_train_rows, 2500, n_flat_features // 2500)
print("Restored dimensions of X_train: ", X_train.shape)

# Put in McFly format

In [None]:
def two_cols_per_label(y):
    """One-hot encode a binary outcome series into two columns.

    Input value 1 is named "BRS" and input value 0 is named "no_BRS".
    Column 0 of the result marks "BRS" and column 1 marks "no_BRS"
    (alphabetical order of the class names). The column layout is fixed,
    so a series that contains only one of the classes still produces two
    columns and every data split is encoded identically.

    Parameters
    ----------
    y : pandas.Series
        Outcome values, expected to be 0 or 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(y), 2) with exactly one 1.0 per row.

    Raises
    ------
    ValueError
        If y contains a value that is neither 0 nor 1.
    """
    # Fixed class order; index in this list == column in the output.
    class_names = ["BRS", "no_BRS"]

    y = y.replace({0: "no_BRS", 1: "BRS"})

    print(y.head())

    # integer mapping against the fixed class list (unknown values get -1)
    integer_encoded = np.asarray(
        pd.Categorical(y, categories=class_names).codes, dtype=np.int64
    )
    if (integer_encoded < 0).any():
        raise ValueError(
            "two_cols_per_label expects only the outcome values 0 and 1"
        )
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

    # one hot encoding: pick row k of the identity matrix for class code k
    onehot_encoded = np.eye(len(class_names))[integer_encoded[:, 0]]
    print(integer_encoded)
    print(" ")
    print(onehot_encoded)
    print("-------")

    return onehot_encoded

In [None]:
# One-hot encode the labels of all three splits (train, test, validation, in that order).
y_train, y_test, y_val = map(two_cols_per_label, (y_train, y_test, y_val))

# Plots

In [None]:
def plot_leads(row, df):
    """Plot eight leads of one recording on a 4x2 grid with shared axes.

    row: index along the first axis of ``df`` selecting the recording.
    df: array indexed as ``df[row, :, lead]``; lead indices 0-7 are plotted.
    """
    # Panel titles in plotting order; position k is drawn from df[row, :, k].
    lead_titles = (
        "Lead I", "Lead II",
        "Lead V1", "Lead V2",
        "Lead V3", "Lead V4",
        "Lead V5", "Lead V6",
    )

    fig, axis = plt.subplots(4, 2, sharex=True, sharey=True, figsize=(6, 12))

    # axis.flat walks the grid row by row: (0,0), (0,1), (1,0), ...
    for lead_idx, (panel, title) in enumerate(zip(axis.flat, lead_titles)):
        panel.plot(np.array(df[row, :, lead_idx]))
        panel.set_title(title)

    plt.show()

In [None]:
# Visual sanity check on a single training recording.
example_row = 7000
plot_leads(example_row, X_train)

# Model Generation

In [None]:
# The number of classes equals the width of the one-hot encoded labels.
metric = "accuracy"
num_classes = y_train.shape[1]

# Ask mcfly for four candidate models matching the training data shape.
models = mcfly.modelgen.generate_models(
    X_train.shape,
    number_of_classes=num_classes,
    number_of_models=4,
    metrics=[metric],
)

In [None]:
# Indices of the candidates to describe; by default all of them.
models_to_print = range(len(models))

separator = "-----------------------------------------------------------"
for i, (model, params, model_types) in enumerate(models):
    if i not in models_to_print:
        continue
    print(separator)
    print("Model " + str(i))
    print(" ")
    print("Hyperparameters: ")
    print(params)
    print(" ")
    print("Model description: ")
    model.summary()
    print(" ")
    print("Model type: ")
    print(model_types)
    print(" ")

In [None]:
# Display the generated candidates; each entry unpacks as (model, params, model_types).
models

# Model Comparison

In [None]:
# Directory that receives the comparison results and the saved model.
resultpath = os.path.join("Mcfly_output", "models3")
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(resultpath, exist_ok=True)

In [None]:
# Train every candidate for a limited number of epochs on a subset of the
# training data and record the training history in a JSON file.
outputfile = os.path.join(resultpath, "modelcomparison.json")
comparison_options = {
    "nr_epochs": 40,
    "subset_size": 500,
    "early_stopping_patience": 10,
    "verbose": True,
    "batch_size": 50,
    "outputfile": outputfile,
}
histories, val_accuracies, val_losses = train_models_on_samples(
    X_train, y_train, X_val, y_val, models, **comparison_options
)

print("Details of the training process were stored in ", outputfile)

In [None]:
# Show the path of the JSON file that was passed as `outputfile` to the training run.
outputfile

# Performance

In [None]:
metric = "accuracy"
val_metric = "val_{}".format(metric)

# Value reached at the last recorded epoch, per model, for each tracked quantity.
last_epoch = [
    {key: history.history[key][-1] for key in (metric, "loss", val_metric, "val_loss")}
    for history in histories
]

# One row per candidate: its hyperparameters, its type and the final-epoch numbers.
modelcomparisons = pd.DataFrame({
    "model": [str(entry[1]) for entry in models],
    "model-type": [str(entry[2]) for entry in models],
    "train_{}".format(metric): [row[metric] for row in last_epoch],
    "train_loss": [row["loss"] for row in last_epoch],
    val_metric: [row[val_metric] for row in last_epoch],
    "val_loss": [row["val_loss"] for row in last_epoch],
})
modelcomparisons.to_csv(os.path.join(resultpath, "modelcomparisons.csv"))

modelcomparisons

# Retrain

In [None]:
# The candidate with the highest validation accuracy is taken as the best one.
best_model_index = np.argmax(val_accuracies)
best_entry = models[best_model_index]
best_model, best_params, best_model_types = best_entry
print("Model type and parameters of the best model")
print(best_model_types, best_params, sep="\n")

In [None]:
# Display the complete entry of the selected model (unpacked elsewhere as model, params, model types).
models[best_model_index]

In [None]:
# Train only the selected candidate for longer: up to 300 epochs with early
# stopping, and no subset limit (subset_size=None).
# NOTE(review): this reuses the model object that was already fitted during the
# comparison run, and the same outputfile -- confirm both are intended.
retrain_options = {
    "nr_epochs": 300,
    "subset_size": None,
    "batch_size": 100,
    "early_stopping_patience": 15,
    "verbose": True,
    "outputfile": outputfile,
    "metric": metric,
}
_, _, _ = train_models_on_samples(
    X_train, y_train, X_val, y_val, [models[best_model_index]], **retrain_options
)

In [None]:
# Save the selected model into the result directory.
modelname = "my_bestmodel.h5"
model_path = os.path.join(resultpath, modelname)
best_model.save(model_path)

In [None]:
# Reload the saved model and verify that every weight array equals the one
# held by the in-memory model (the cell output is the overall verdict).
model_reloaded = tf.keras.models.load_model(model_path)
in_memory_weights = best_model.get_weights()
reloaded_weights = model_reloaded.get_weights()
weights_match = [
    np.all(in_memory == reloaded)
    for in_memory, reloaded in zip(in_memory_weights, reloaded_weights)
]
np.all(weights_match)

# Investigate predictions

In [None]:
# Class probabilities on the validation set from the selected (and retrained)
# model -- not from whichever candidate happens to be first in `models`.
datasize = X_val.shape[0]
probs = best_model.predict(X_val[:datasize, :, :], batch_size = 1)

In [None]:
#columns: predictions, rows: true values
# Class names by one-hot column index: column 0 is "BRS", column 1 is "no_BRS".
labels = ["BRS", "no_BRS"]

predicted = probs.argmax(axis=1)
y_index = y_val.argmax(axis=1)
confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))
confusion_matrix.index = [labels[i] for i in confusion_matrix.index]
confusion_matrix.columns = [labels[i] for i in confusion_matrix.columns]
# reindex returns a new frame, so keep the result. Reindexing both axes keeps
# the matrix square even when a class never occurs in the truth or predictions.
confusion_matrix = confusion_matrix.reindex(index=labels, columns=labels, fill_value=0)
confusion_matrix