In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mcfly
from sklearn.model_selection import train_test_split
import gc
import os
from mcfly.find_architecture import train_models_on_samples
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from keras import backend as K
import keras
import json
import glob
import datetime
import seaborn as sns
%matplotlib widget
%matplotlib inline

ECG_only = False #set to true if want to use ECG train/val/split (=> no PRS distribution per set)
split_path = "400_dumped/Final_Data/split/train_val_test.json"

#split_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/split/train_val_test.json"

# Check number of patients in final train, val and test set whose samples were taken on the same day

In [None]:
#open original train, val, test split to calculate original weights
with open(split_path, "r") as fp:
    train_val_test_dict = json.load(fp)

In [None]:
def filter_patients(path_to_files, ecg_ids_and_dates):
    """Collect the ID and acquisition date of every ECG file in a directory.

    Every file directly inside ``path_to_files`` is parsed as JSON. When the
    document contains ``RestingECG -> TestDemographics -> AcquisitionDate``, the
    pair ``[ecg_id, date]`` is appended to ``ecg_ids_and_dates``; files lacking
    either key are skipped.

    Parameters
    ----------
    path_to_files : str
        Directory holding the ECG JSON files.
    ecg_ids_and_dates : list
        Accumulator that is extended in place, so the function can be called
        once per directory with the same list.

    Returns
    -------
    list
        The same ``ecg_ids_and_dates`` object, with one ``[ecg_id, date]`` entry
        added per usable file. ``ecg_id`` is the file name without its directory
        and without ".json"; ``date`` is formatted as "%m-%d-%Y".
    """
    # sorted() makes the row order reproducible; glob order depends on the filesystem
    for name in sorted(glob.glob(os.path.join(path_to_files, "*"))):
        # the context manager closes the handle even when json.load raises
        with open(name) as f:
            data = json.load(f)

        # basename() drops the directory on every platform; stripping only "\\"
        # left a leading "/" in the ID on POSIX systems
        id_ecg = os.path.basename(name).replace(".json", "")

        resting_ecg = data["RestingECG"]
        if "TestDemographics" not in resting_ecg:
            continue
        t_demographics = resting_ecg["TestDemographics"]
        if "AcquisitionDate" not in t_demographics:
            continue

        #put datetime in "%m-%d-%Y" format
        test_date = pd.to_datetime(str(t_demographics["AcquisitionDate"])).strftime("%m-%d-%Y")
        ecg_ids_and_dates.append([id_ecg, test_date])

    return ecg_ids_and_dates

In [None]:
ecg_ids_and_dates = []
path_negative = "AnonymisedECGs_json/negative"
path_positive = "AnonymisedECGs_json/positive"
ecg_ids_and_dates = filter_patients(path_positive, ecg_ids_and_dates)
ecg_ids_and_dates = filter_patients(path_negative, ecg_ids_and_dates)

In [None]:
ecg_ids_and_dates = pd.DataFrame(ecg_ids_and_dates, columns=["ecg_id", "ecg_date"])

In [None]:
train = train_val_test_dict["train"].copy()
val = train_val_test_dict["val"].copy()
test = train_val_test_dict["test"].copy()

In [None]:
# Keep only the ECGs that belong to each split. A vectorised isin() mask replaces the
# row-by-row scan of the ID lists, and an empty split still yields a frame that has the
# "ecg_id" / "ecg_date" columns. .copy() gives each split its own data, so columns can be
# rewritten later without touching ecg_ids_and_dates.
train_ecg_ids_dates = ecg_ids_and_dates[ecg_ids_and_dates["ecg_id"].isin(train)].copy()
val_ecg_ids_dates = ecg_ids_and_dates[ecg_ids_and_dates["ecg_id"].isin(val)].copy()
test_ecg_ids_dates = ecg_ids_and_dates[ecg_ids_and_dates["ecg_id"].isin(test)].copy()

In [None]:
train_ecg_ids_dates = pd.DataFrame(train_ecg_ids_dates)
val_ecg_ids_dates = pd.DataFrame(val_ecg_ids_dates)
test_ecg_ids_dates = pd.DataFrame(test_ecg_ids_dates)

In [None]:
# Reduce every ECG ID to its patient ID, i.e. the text before the first underscore.
for split_frame in (train_ecg_ids_dates, val_ecg_ids_dates, test_ecg_ids_dates):
    split_frame["ecg_id"] = split_frame["ecg_id"].map(lambda ecg_id: ecg_id.split("_")[0])

In [None]:
print( "n patients in train, val, test: ", train_ecg_ids_dates["ecg_id"].nunique(), val_ecg_ids_dates["ecg_id"].nunique(), test_ecg_ids_dates["ecg_id"].nunique() )

## Train

In [None]:
print("Patients with duplicate dates: ")
# Group on explicit keys: one row per (patient, date) with the number of ECGs recorded.
# Grouping on columns.tolist() made the result depend on whichever columns the frame
# happens to carry when the cell runs (a later cell adds a "label" column).
a = train_ecg_ids_dates.groupby(["ecg_id", "ecg_date"], as_index = False).size()
a[a["size"]>=2]

In [None]:
a[a["size"]==1]

In [None]:
print("mean number of ecgs on the same day for the same patient in train: ", np.mean(a["size"]), 
      ", median: ", np.median(a["size"]),
     ", min: ", np.min(a["size"]),
     ", max: ", np.max(a["size"]))

In [None]:
ax = plt.scatter(a["ecg_id"], a["size"])
plt.xlabel("Patients")
plt.ylabel("Number of ECGs in a particular date")
plt.xticks([])

In [None]:
# Tag each sample with its class: IDs whose first character is "1" are "No BrP".
train_ecg_ids_dates["label"] = train_ecg_ids_dates["ecg_id"].map(
    lambda patient_id: "No BrP" if patient_id[0] == "1" else "BrP"
)
# One row per patient with the number of samples that patient contributes.
m = (
    train_ecg_ids_dates
    .groupby(["ecg_id", "label"])
    .size()
    .reset_index(name="samples_per_patient")
)
sns.set(font_scale=2.5)
sns.set_style("white")
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.boxplot(data=m, x="label", y="samples_per_patient", ax=ax)
ax.set(xlabel="True label", ylabel="Samples per patient")
#plt.savefig("ECG_PRS_Samples_pp_boxplot_train.png")
plt.savefig("ECG_Samples_pp_boxplot_train.png")

In [None]:
print("mean number of ecgs for the same patient in train: ", np.mean(m["samples_per_patient"]), 
      ", median: ", np.median(m["samples_per_patient"]),
     ", min: ", np.min(m["samples_per_patient"]),
     ", max: ", np.max(m["samples_per_patient"]))

## Val

In [None]:
#val
print("Patients with duplicate dates: ")
# Group on explicit keys: one row per (patient, date) with the number of ECGs recorded.
# Grouping on columns.tolist() made the result depend on whichever columns the frame
# happens to carry when the cell runs (a later cell adds a "label" column).
a = val_ecg_ids_dates.groupby(["ecg_id", "ecg_date"], as_index = False).size()
a[a["size"]>=2]

In [None]:
a[a["size"]==1]

In [None]:
print("mean number of ecgs on the same day for the same patient in val: ", np.mean(a["size"]), 
      ", median: ", np.median(a["size"]),
     ", min: ", np.min(a["size"]),
     ", max: ", np.max(a["size"]))
ax = plt.scatter(a["ecg_id"], a["size"])
plt.xlabel("Patients")
plt.ylabel("Number of ECGs in a particular date")
plt.xticks([])

In [None]:
# Tag each sample with its class: IDs whose first character is "1" are "No BrP".
val_ecg_ids_dates["label"] = val_ecg_ids_dates["ecg_id"].map(
    lambda patient_id: "No BrP" if patient_id[0] == "1" else "BrP"
)
# One row per patient with the number of samples that patient contributes.
m = (
    val_ecg_ids_dates
    .groupby(["ecg_id", "label"])
    .size()
    .reset_index(name="samples_per_patient")
)
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above; boxplot returns those same axes.
ax = sns.boxplot(data=m, x="label", y="samples_per_patient", ax=ax)
ax.set(xlabel="True label", ylabel="Samples per patient")
#plt.savefig("ECG_PRS_Samples_pp_boxplot_val.png")
plt.savefig("ECG_Samples_pp_boxplot_val.png")

In [None]:
print("mean number of ecgs for the same patient in val: ", np.mean(m["samples_per_patient"]), 
      ", median: ", np.median(m["samples_per_patient"]),
     ", min: ", np.min(m["samples_per_patient"]),
     ", max: ", np.max(m["samples_per_patient"]))

## Test

In [None]:
#test
print("Patients with duplicate dates: ")
# Group on explicit keys: one row per (patient, date) with the number of ECGs recorded.
# Grouping on columns.tolist() made the result depend on whichever columns the frame
# happens to carry when the cell runs (a later cell adds a "label" column).
a = test_ecg_ids_dates.groupby(["ecg_id", "ecg_date"], as_index = False).size()
a[a["size"]>=2]

In [None]:
a[a["size"] == 1]

In [None]:
print("mean number of ecgs on the same day for the same patient in test: ", np.mean(a["size"]), 
      ", median: ", np.median(a["size"]),
     ", min: ", np.min(a["size"]),
     ", max: ", np.max(a["size"]))

ax = plt.scatter(a["ecg_id"], a["size"])
plt.xlabel("Patients")
plt.ylabel("Number of ECGs in a particular date")
plt.xticks([])

In [None]:
# Tag each sample with its class: IDs whose first character is "1" are "No BrP".
test_ecg_ids_dates["label"] = test_ecg_ids_dates["ecg_id"].map(
    lambda patient_id: "No BrP" if patient_id[0] == "1" else "BrP"
)
# One row per patient with the number of samples that patient contributes.
m = (
    test_ecg_ids_dates
    .groupby(["ecg_id", "label"])
    .size()
    .reset_index(name="samples_per_patient")
)
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above; boxplot returns those same axes.
ax = sns.boxplot(data=m, x="label", y="samples_per_patient", ax=ax)
ax.set(xlabel="True label", ylabel="Samples per patient")
#plt.savefig("ECG_PRS_Samples_pp_boxplot_test.png")
plt.savefig("ECG_Samples_pp_boxplot_test.png")

In [None]:
print("mean number of ecgs for the same patient in test: ", np.mean(m["samples_per_patient"]), 
      ", median: ", np.median(m["samples_per_patient"]),
     ", min: ", np.min(m["samples_per_patient"]),
     ", max: ", np.max(m["samples_per_patient"]))

# Check that there is no intersection between the test set and the other sets

In [None]:
#open original train, val, test split to calculate original weights
with open(split_path, "r") as fp:
    train_val_test_dict = json.load(fp)

In [None]:
train = train_val_test_dict["train"].copy()
val = train_val_test_dict["val"].copy()
test = train_val_test_dict["test"].copy()

In [None]:
def get_p_id(df):
    """Return the unique patient IDs found in an iterable of sample IDs.

    The patient ID is the part of each sample ID that precedes the first "_".
    IDs are returned as strings, in order of first appearance.
    """
    patient_ids = (sample_id.split("_", 1)[0] for sample_id in df)
    # dict keys de-duplicate while keeping insertion order
    return list(dict.fromkeys(patient_ids))

In [None]:
train_clean = get_p_id(train)
val_clean = get_p_id(val)
test_clean = get_p_id(test)

In [None]:
list(set(train_clean).intersection(set(val_clean)))

In [None]:
print("Number of patients that appear both on train and val")
len(list(set(train_clean).intersection(set(val_clean))))

In [None]:
print("Number of patients that appear both on train and test")
len(list(set(train_clean).intersection(set(test_clean))))

In [None]:
print("Number of patients that appear both on val and test")
len(list(set(val_clean).intersection(set(test_clean))))

In [None]:
set(train_clean).intersection(set(val_clean))

In [None]:
# Patients that contribute samples to both the train and the val split.
train_val_intersection = list(set(train_clean).intersection(set(val_clean)))
# Count the val rows whose patient is in that overlap. This expects the "ecg_id" column
# of val_ecg_ids_dates to already hold patient IDs (it is cut at the first "_" earlier
# in the notebook).
val_samples_from_train_patients = int(
    val_ecg_ids_dates["ecg_id"].isin(train_val_intersection).sum()
)
print("Number of validation samples that come from patients that also have samples in train: ", val_samples_from_train_patients)

In [None]:
print("Number of samples in validation set: ", val_ecg_ids_dates.shape[0])

In [None]:
if ECG_only:
    raise Exception("No PRS data for ECG model, stop notebook")

# Different values in BRS PRS

In [None]:
genetic_data_path = "GeneticData/available_BRS_PRS.txt"

In [None]:
PRS = pd.read_csv(genetic_data_path, header=0, sep=",")

In [None]:
PRS

In [None]:
sns.displot(PRS, x=PRS["SCORE"])

In [None]:
min(PRS["SCORE"])

In [None]:
max(PRS["SCORE"])

In [None]:
len(set(PRS["SCORE"]))

In [None]:
PRS["SCORE"].shape[0]

In [None]:
#open original train, val, test split to calculate original weights
with open(split_path, "r") as fp:
    train_val_test_dict = json.load(fp)

In [None]:
#get PRS from patients in train and val sets
keys = ['train', 'val']
# Flatten both splits into one list so that every sample gets its own slot in X_PRS.
# Restarting the index for each split made the val scores overwrite the first train
# scores and left the tail of the np.empty buffer uninitialised.
sample_ids = [ID for elem in keys for ID in train_val_test_dict[elem]]
n_samples = len(sample_ids)
X_PRS = np.empty((n_samples), dtype = float)

# Index the scores by patient once instead of filtering the whole table per sample;
# .loc raises KeyError for a patient that has no PRS.
score_by_patient = PRS.set_index("anonymous_id")["SCORE"]

for i, ID in enumerate(sample_ids):
    p_id = int(ID.split("_", 1)[0])

    #store PRS sample
    X_PRS[i] = score_by_patient.loc[p_id]

In [None]:
len(X_PRS)

In [None]:
len(train_val_test_dict['train']) + len(train_val_test_dict['val'])

In [None]:
#get PRS from patients in train and val sets
train_samples = len(train_val_test_dict['train'])
val_samples = len(train_val_test_dict['val'])


def _patient_id_and_prs(sample_ids, prs_table):
    """Return an (n_samples, 2) float array with one [patient id, PRS score] row per sample.

    The patient id is the integer before the first "_" of each sample ID. The score is
    looked up in ``prs_table`` through its "anonymous_id" and "SCORE" columns; a patient
    missing from the table raises KeyError.
    """
    # Index once, then read a scalar per sample. This avoids scanning the table for
    # every sample and avoids assigning a one-element Series to a scalar array slot.
    score_by_patient = prs_table.set_index("anonymous_id")["SCORE"]
    rows = np.empty(shape=(len(sample_ids), 2), dtype=float)
    for i, ID in enumerate(sample_ids):
        p_id = int(ID.split("_", 1)[0])

        #store PRS sample
        rows[i, 0] = p_id
        rows[i, 1] = score_by_patient.loc[p_id]
    return rows


X_PRS_t = _patient_id_and_prs(train_val_test_dict['train'], PRS)
X_PRS_val = _patient_id_and_prs(train_val_test_dict['val'], PRS)

In [None]:
print(len(X_PRS_t), len(X_PRS_val))

In [None]:
print(len(np.unique(X_PRS_t, axis=0)), len(np.unique(X_PRS_val, axis=0)))
#should be the same as number of unique patient id in train and val

In [None]:
print(len(np.unique(X_PRS_t[:,1], axis=0)), len(np.unique(X_PRS_val[:,1], axis=0)))
#number of different PRS scores in train and val => goal: check if there are fewer unique PRS than unique patients (then PRS can be used because it is not a perfect predictor)

In [None]:
train = train_val_test_dict["train"].copy()
val = train_val_test_dict["val"].copy()
test = train_val_test_dict["test"].copy()


# get_p_id is defined earlier in the notebook (intersection-check section); it is reused
# here rather than redefined, so the helper has a single definition.
train_clean = get_p_id(train)
val_clean = get_p_id(val)
test_clean = get_p_id(test)


In [None]:
print(len(np.unique(train_clean)), len(train_val_test_dict['train']))
#number of train patients, and number of train samples

In [None]:
print(len(np.unique(val_clean)), len(train_val_test_dict['val']))
#number of val patients, and number of val samples

In [None]:
"""
want to check number of patients that appear both in train and val 
and compare to number of prs that appear both in trian and val
"""

In [None]:
len(list(set(train_clean).intersection(set(val_clean))))
#149 patients appear both on train and val

In [None]:
len(np.intersect1d(np.unique(X_PRS_t[:,1]), (X_PRS_val[:,1])))
#111 PRS appear both on train and val < number of patients that appear on both train and val

## Check the train, val and test distributions of PRS before and after processing

In [None]:
labels_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/labels/labels.npy"
samples_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/samples/"
labels_array = np.load(labels_path)

# Map the first entry of every row to the integer value of its second entry.
labels = {row[0]: int(row[1]) for row in labels_array}

# Drop the raw array and run a garbage-collection pass.
del labels_array
gc.collect()

In [None]:
#getting PRS statistics for normalisation
train_ecg_ids = train_val_test_dict["train"]
train_ids = [int(elem.split("_")[0]) for elem in train_ecg_ids]
# de-duplicate patients while keeping their order of first appearance
train_ids = list(dict.fromkeys(train_ids))
# One score per train patient, read through an index. The previous list of one-element
# Series only worked while every patient matched exactly one PRS row; here a missing
# patient raises KeyError and a duplicated one trips the assertion below.
PRS_train_for_norm = PRS.set_index("anonymous_id").loc[train_ids, "SCORE"].to_numpy(dtype=float)
assert len(PRS_train_for_norm) == len(train_ids), "expected exactly one PRS row per train patient"
train_PRS_mean = np.mean(PRS_train_for_norm)
# ddof=0: population standard deviation
train_PRS_std = np.std(PRS_train_for_norm, ddof=0)

print("mean train PRS: ", train_PRS_mean , ", standard dev of train PRS: ", train_PRS_std)

In [None]:
PRS

In [None]:
PRS["normalised_PRS"] = (PRS["SCORE"] - train_PRS_mean) /train_PRS_std

In [None]:
PRS["rounded_normalised_PRS"] = round(PRS["normalised_PRS"] * 2) /2

In [None]:
PRS

In [None]:
PRS["SCORE"].nunique()

In [None]:
PRS["rounded_normalised_PRS"].nunique()

In [None]:
print("min, max raw PRS, normalized PRS, rounded PRS: ", min(PRS["SCORE"]), max(PRS["SCORE"]),
     min(PRS["normalised_PRS"]), max(PRS["normalised_PRS"]),
     min(PRS["rounded_normalised_PRS"]), max(PRS["rounded_normalised_PRS"]))

In [None]:
# Rows of PRS for patients in the train split. .copy() gives PRS_train its own data, so
# the column added to it later does not raise SettingWithCopyWarning.
PRS_train = PRS[PRS["anonymous_id"].isin(train_ids)].copy()

In [None]:
PRS_train

In [None]:
PRS_train.nunique()

In [None]:
a = PRS_train.groupby(["SCORE"]).size().reset_index(name="Count")

In [None]:
print(min(a["Count"]), np.mean(a["Count"]), np.median(a["Count"]), max(a["Count"]))

In [None]:
# Label 0 when the first digit of anonymous_id is 1, label 1 otherwise.
# assign() returns a new frame instead of writing a column into a filtered selection
# of PRS, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
PRS_train = PRS_train.assign(
    label=[0 if int(str(elem)[0]) == 1 else 1 for elem in PRS_train["anonymous_id"]]
)

In [None]:
PRS_train

In [None]:
b = PRS_train.groupby(["SCORE", "label"]).size().reset_index(name="Count")

In [None]:
b

In [None]:
b_0 = b[b["label"]==0]
b_1 = b[b["label"]==1]

In [None]:
len(list(set(b_0["SCORE"]).intersection(set(b_1["SCORE"]))))

In [None]:
print("min, max raw PRS, normalized PRS, rounded PRS in train: ", min(PRS_train["SCORE"]), max(PRS_train["SCORE"]),
     min(PRS_train["normalised_PRS"]), max(PRS_train["normalised_PRS"]),
     min(PRS_train["rounded_normalised_PRS"]), max(PRS_train["rounded_normalised_PRS"]))

In [None]:
sns.set(font_scale= 3)
sns.set_style("white")
sns.displot(PRS_train, x=PRS_train["SCORE"],height=10, aspect=1)
plt.xlabel("PRS")
plt.savefig("raw_PRS_dist.png")

In [None]:
sns.displot(PRS_train, x=PRS_train["rounded_normalised_PRS"], bins= np.arange(-3, 3.5, 0.5), height=10, aspect=1)
plt.xlabel("Rounded normalised PRS")
plt.savefig("rounded_normalised_PRS_dist.png")

In [None]:
(0.000069 - 0.0000344) /12

In [None]:
np.arange(0.0000344, 0.00007, 0.000002883333333333333)

In [None]:
val_ecg_ids = train_val_test_dict["val"]
val_ids = [int(elem.split("_")[0]) for elem in val_ecg_ids]
val_ids = list(dict.fromkeys(val_ids))

PRS_train[PRS_train["anonymous_id"].isin(val_ids)]

In [None]:
scores_in_train_and_val = PRS_train[PRS_train["anonymous_id"].isin(val_ids)]["SCORE"]

In [None]:
# scores_in_train_and_val: PRS of the patients that appear in both train and val.
# a: for every PRS value, the number of train patients that share it.
# Question: does seeing a PRS in train perfectly predict a val sample?
# It does not if a PRS that appears in val can come from several train patients.
print("Min and max  repeated PRS scores in train for patients in the validation set that appear in the training set")
shared_score_counts = a.loc[a["SCORE"].isin(scores_in_train_and_val), "Count"]
print(min(shared_score_counts), max(shared_score_counts))