In [None]:
import pandas as pd
import numpy as np
import json
import datetime
import glob
from base64 import b64decode
import matplotlib.pyplot as plt
import seaborn as sns
import random
np.random.seed(1)
random.seed(1)

In [None]:
#translation tables linking each anonymous (pseudo) patient id to the original patient id
negative_key_table_path = "BrS_negatives_translation_table.csv"
positive_key_table_path = "BrS_positives_translation_table.csv"
#directories holding the ECG json files of each class
path_negative = "AnonymisedECGs_json/negative"
path_positive = "AnonymisedECGs_json/positive"
#SPSS file with the provocation (ajmaline test) date per patient
test_dates_path = "20210305Ajmaline_AI.sav"
#table whose "anonymous_id" column lists the patients with genetic data
genetic_data_path = "GeneticData/available_BRS_PRS.txt"

#output locations written in section VII
split_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/split/train_val_test.json"
labels_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/labels/labels.npy"
#must end with "/": file names are appended to it directly when samples are loaded
samples_path = "Removed_no_genetic/Data_Final_Same_Test_Balanced_Val/samples/"


#train/val/test split used for the ECG-only network; its test set is reused in section II
ecg_only_split_path = "400_dumped/Final_Data/split/train_val_test.json"

remove_no_dna = True #set to true if want to filter out patients not genetically tested
to_val = False #set to true if want the removed samples from the training set to go to the validation

# I. Read Data and Check  Data Structure

In [None]:
#Read test dates and genetic data
test_dates = pd.read_spss(test_dates_path)
genetic = pd.read_csv(genetic_data_path, header=0, sep=",")
negative = pd.read_csv(negative_key_table_path, header=0)
positive = pd.read_csv(positive_key_table_path, header=0)

In [None]:
#put all keys into one df
keys = pd.concat([negative, positive])

In [None]:
keys.columns = keys.columns.str.replace(" ", "") #remove spaces in column names

In [None]:
#map every patient ID to its provocation date, written in the same format as in the ECG files
test_dates["Provocation_date"] = pd.to_datetime(test_dates["Provocation_date"])

#rows are visited top to bottom, so a later row for the same ID replaces an earlier one
dates = {
    int(raw_id): datetime.date.strftime(provocation_date, "%m-%d-%Y")
    for raw_id, provocation_date in zip(test_dates["ID"], test_dates["Provocation_date"])
}

In [None]:
#list of patients whose genetic data is available
genetic_tested = np.array(genetic["anonymous_id"])

In [None]:
def filter_patients(path_to_files, ajmaline_tested, not_tested, no_dna, diff):
    """Sort every ECG json file in a directory into one of four lists of ECG ids.

    Relies on the notebook-level names ``keys``, ``dates``, ``genetic_tested``
    and ``remove_no_dna``.

    Parameters
    ----------
    path_to_files : str
        Directory with the ECG json files, named "<pseudo id>_<suffix>.json".
    ajmaline_tested : list
        Receives ECGs whose acquisition date equals the patient's provocation date.
    not_tested : list
        Receives ECGs acquired on any other date.
    no_dna : list
        Receives ECGs of patients without genetic data (only if ``remove_no_dna``).
    diff : list
        Receives ECGs without a TestDemographics / AcquisitionDate field.

    Returns
    -------
    tuple
        The same four lists (they are extended in place), in parameter order.

    Raises
    ------
    ValueError
        If a pseudo id does not map to exactly one original patient id.
    KeyError
        If a file has no "RestingECG" entry or the patient has no entry in ``dates``.
    """
    #iterate through all file names in the directory
    for file_path in glob.glob(path_to_files + "/*"):
        #load file as json; the context manager closes the handle again
        with open(file_path) as f:
            data = json.load(f)

        #file name without directory (either path separator) and without extension
        file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
        id_ecg = file_name.replace(".json", "")
        pseudo_id = int(id_ecg.partition("_")[0])

        matches = keys.loc[keys["anonymous_id"] == pseudo_id, "original_patient_id"]
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one original patient id for pseudo id "
                + str(pseudo_id) + ", found " + str(len(matches))
            )
        patient_id = int(matches.iloc[0])

        #patients without genetic data are set aside and not classified any further
        if remove_no_dna and pseudo_id not in genetic_tested:
            no_dna.append(id_ecg)
            continue

        #ECGs without an acquisition date are stored as having a different format
        resting_ecg = data["RestingECG"]
        if "TestDemographics" not in resting_ecg:
            diff.append(id_ecg)
            continue
        t_demographics = resting_ecg["TestDemographics"]
        if "AcquisitionDate" not in t_demographics:
            diff.append(id_ecg)
            continue

        #put the acquisition date in "%m-%d-%Y" format; the formatted value has to be
        #kept, otherwise the raw string is what gets compared with the provocation date
        test_date = datetime.date.strftime(
            pd.to_datetime(str(t_demographics["AcquisitionDate"])), "%m-%d-%Y"
        )

        #same date as the ajmaline test of this patient: ajmaline tested,
        #any other date: not tested, usable for training
        if test_date == dates[patient_id]:
            ajmaline_tested.append(id_ecg)
        else:
            not_tested.append(id_ecg)

    return ajmaline_tested, not_tested, no_dna, diff

In [None]:
ajmaline_tested = []
not_tested = []
no_dna = []
diff = []

ajmaline_tested, not_tested, no_dna, diff = filter_patients(path_positive, ajmaline_tested, not_tested, no_dna, diff)

In [None]:
print(len(ajmaline_tested))
print(len(not_tested))
print(len(no_dna))
print(len(diff))

In [None]:
ajmaline_tested, not_tested, no_dna, diff = filter_patients(path_negative, ajmaline_tested, not_tested, no_dna, diff)

In [None]:
print(len(ajmaline_tested))
print(len(not_tested))
print(len(no_dna))
print(len(diff))

In [None]:
no_dna.sort()
no_dna

In [None]:
len(ajmaline_tested) + len(not_tested) + len(no_dna)

## Check that no ECG id is in more than one list

In [None]:
print(list(set(not_tested).intersection(set(ajmaline_tested))))
print(list(set(not_tested).intersection(set(no_dna))))
print(list(set(no_dna).intersection(set(ajmaline_tested))))

## Check there's 8 leads pp & check filter type

In [None]:
def get_lead_data(lead_data, diff_str, filename):
    """Decode the eight stored leads of one ECG and subtract their offsets.

    Parameters
    ----------
    lead_data : indexable
        ``lead_data[1]`` must hold the lead entries; entries 0-7 are read in the
        order I, II, V1, V2, V3, V4, V5, V6. Each entry provides
        "LeadOffsetFirstSample" and base64-encoded int16 "WaveFormData".
    diff_str : list
        Receives ``{filename: message}`` for every lead whose length is neither
        2500 nor 5000 samples (extended in place).
    filename : str
        Used only as the key of the messages added to ``diff_str``.

    Returns
    -------
    tuple
        The eight decoded leads in the order above, followed by ``diff_str``.
    """
    accepted_lengths = (2500, 5000)
    decoded_leads = []

    for position in range(8):
        entry = lead_data[1][position]

        #a missing (NaN) offset counts as no offset
        offset = float(entry["LeadOffsetFirstSample"])
        if np.isnan(offset):
            offset = 0

        raw_samples = np.frombuffer(b64decode(entry["WaveFormData"]), dtype=np.int16)
        samples = np.array(raw_samples) - offset
        decoded_leads.append(samples)

        #any other length is reported as a file with a different structure
        if len(samples) not in accepted_lengths:
            diff_str.append({filename: "different sampling rate: " + str(len(samples))})

    return (*decoded_leads, diff_str)

In [None]:
#downsample leads measured at double speed
#keeps values at every other index
def downsample(lead):
    """Halve a 5000-sample lead by keeping the samples at even positions.

    Leads of any other length are returned unchanged (the same object).
    """
    if len(lead) != 5000:
        return lead
    return lead[np.arange(0, 5000, 2)]

In [None]:
#store file name with corresponding directory
def create_directories(json_file_names):
    """Prefix every file name with the directory of its class.

    Names starting with "1" get the negative directory and names starting with
    "2" the positive directory; names with any other first character are left out.
    Returns the resulting paths in input order.
    """
    directory_by_prefix = {"1": path_negative, "2": path_positive}
    return [
        directory_by_prefix[file_name[0]] + "/" + file_name
        for file_name in json_file_names
        if file_name[0] in directory_by_prefix
    ]

In [None]:
def check_and_process_data(list_of_ecg_ids):
    """Check the structure of the given ECG files and collect their filter settings.

    Parameters
    ----------
    list_of_ecg_ids : iterable of str
        ECG ids (file names without ".json").

    Returns
    -------
    diff_str : list of dict
        One ``{filename: problem}`` entry per problem found, over all files.
    filters : pandas.DataFrame
        Columns "id" (file path), "high_pass", "low_pass", "ac" and "label" for
        every file that passed the structure checks; empty if none did.
    transformed_files : int
        Number of files that passed the structure checks.
    """
    json_ids = [sub + ".json" for sub in list_of_ecg_ids]
    directories = create_directories(json_ids)
    transformed_files = 0
    #created once, before the loop: problems of every file are kept, not only the last file's
    diff_str = []
    filter_frames = []

    for filename in directories:
        with open(filename) as f:
            data = json.load(f)

        ########## checks structure ##########
        if "RestingECG" not in data:
            diff_str.append({filename: "no RestingECG"})
            continue
        ecg = data["RestingECG"]

        if "Waveform" not in ecg:
            diff_str.append({filename: "no Waveform"})
            continue
        waveform = pd.DataFrame(ecg["Waveform"])

        ########## gets lead data ##########
        if "WaveformType" not in waveform:
            diff_str.append({filename: "no WaveformType"})
            continue

        waveform_rhythm = pd.DataFrame(waveform[waveform["WaveformType"] == "Rhythm"])
        if waveform_rhythm.empty:
            diff_str.append({filename: "no Rhythm values"})
            continue

        lead_data = waveform_rhythm["LeadData"]
        if lead_data.empty:
            diff_str.append({filename: "no LeadData"})
            continue

        #only the length check matters here; the decoded leads themselves are not used
        *_, diff_str = get_lead_data(lead_data, diff_str, filename)

        ########### checks label pos or neg ##########
        if "positive" in filename:
            label = "positive"
        elif "negative" in filename:
            label = "negative"
        else:
            label = ""

        ########### saves filter type ##########
        filter_frames.append(pd.DataFrame(
            {
                "id": filename,
                "high_pass": waveform_rhythm["HighPassFilter"],
                "low_pass": waveform_rhythm["LowPassFilter"],
                "ac": waveform_rhythm["ACFilter"],
                "label": label
            }))

        transformed_files = transformed_files + 1

    #one concat at the end instead of growing the frame file by file
    filters = pd.concat(filter_frames) if filter_frames else pd.DataFrame()

    return diff_str, filters, transformed_files
        

In [None]:
diff_str, filters, transformed_files = check_and_process_data(not_tested)

In [None]:
print(diff_str, transformed_files) #diff_str is empty hence all samples have 2500 or 5000 points

## Data Exploration
### Pos and Neg number of samples

In [None]:
mini_filters = filters.loc[:, filters.columns != "id"]
m = mini_filters.groupby(["label"]).size().reset_index(name="Count")

In [None]:
#number of positive samples, negative samples, and total
#counts are looked up by label, not by row position: a missing class gives 0 and
#rows with any other label cannot be mistaken for one of the two classes
samples_per_label = m.set_index("label")["Count"]
tot_negatives = samples_per_label.get("negative", 0)
tot_positives = samples_per_label.get("positive", 0)
tot = tot_negatives + tot_positives
print(tot_negatives, tot_positives, tot)

In [None]:
counts = m["Count"]
y_pos = np.arange(len(m["label"]))
plt.bar(y_pos, counts)
plt.xticks(y_pos, m["label"])

### Number of patients per class AND Number of samples per patient per class

In [None]:
#patient pseudo id of every ECG: everything before the first "_" of the ECG id
stripped_not_tested = [ecg_id.split("_", 1)[0] for ecg_id in not_tested]

In [None]:
#one row per ECG with the patient pseudo id and the class given by the first digit of the id
#ids starting with anything else are left out, as create_directories does for such files
label_by_prefix = {"1": "negative", "2": "positive"}

#built in one go; concatenating a one-row frame per ECG is quadratic in the number of ECGs
samples_per_patient = pd.DataFrame(
    [
        {"id": patient, "label": label_by_prefix[patient[0]]}
        for patient in stripped_not_tested
        if patient[0] in label_by_prefix
    ],
    columns=["id", "label"],
)

In [None]:
samples_per_patient

In [None]:
m = samples_per_patient.groupby(["id", "label"]).size().reset_index(name="Count")
m

In [None]:
m.groupby("label")["Count"].mean()

In [None]:
m.groupby("label")["Count"].median()

In [None]:
m[m["label"]=="positive"].shape

In [None]:
m[m["label"]=="negative"].shape

In [None]:
sns.boxplot(x="label", y="Count", data = m)

### Count and Percentage per class per (high, low, ac) filter combination

In [None]:
m = mini_filters.groupby(["high_pass", "low_pass", "ac", "label"]).size().reset_index(name="Count")
m["percentage_by_class"] = 100 * m["Count"] / m.groupby("label")["Count"].transform("sum")
m["combination"] = list(zip(m.high_pass, m.low_pass, m.ac))
m.sort_values(by=["label", "percentage_by_class"], ascending=False)

### Count and Percentage per class per high pass filter

In [None]:
m = mini_filters.groupby(["high_pass", "label"]).size().reset_index(name="Count")
m["percentage_by_class"] = 100 * m["Count"] / m.groupby("label")["Count"].transform("sum")
m.sort_values(by=["label", "percentage_by_class"], ascending=False)

### Count and Percentage per class per low pass filter

In [None]:
m = mini_filters.groupby(["low_pass", "label"]).size().reset_index(name="Count")
m["percentage_by_class"] = 100 * m["Count"] / m.groupby("label")["Count"].transform("sum")
m.sort_values(by=["label", "percentage_by_class"], ascending=False)

### Count and Percentage per class per ac filter

In [None]:
m = mini_filters.groupby(["ac", "label"]).size().reset_index(name="Count")
m["percentage_by_class"] = 100 * m["Count"] / m.groupby("label")["Count"].transform("sum")
m.sort_values(by=["label", "percentage_by_class"], ascending=False)

# II. Make Independent Test Set

In [None]:
# load old train_val_test dict and keep test samples that have genetic data

#open train, val, test split used to train ECG only network
with open(ecg_only_split_path, "r") as fp:
    original_train_val_test_dict = json.load(fp)
    
original_test = original_train_val_test_dict["test"].copy()

In [None]:
#original test samples that are still available after filtering
#the original list is filtered in order: list(set(...)) would return the ids in an order
#that changes between interpreter runs, which the random seeds do not control
available_ecgs = set(not_tested)
test = [ecg_id for ecg_id in dict.fromkeys(original_test) if ecg_id in available_ecgs]

In [None]:
print("samples in original test set: ", len(original_test), ", samples in new test set: ", len(test))

In [None]:
#test patient pseudo ids per class, unique and in order of first appearance
pos_test = list(dict.fromkeys(str(elem.split("_")[0]) for elem in test if elem[0] == "2"))
neg_test = list(dict.fromkeys(str(elem.split("_")[0]) for elem in test if elem[0] == "1"))
test_ids = pos_test + neg_test
print("number of positive test patients: ", len(pos_test),", number of negative test patients: ", len(neg_test))

In [None]:
pos_test_ecg_id = [elem for elem in test if elem[0]==str(2)]
neg_test_ecg_id = [elem for elem in test if elem[0]==str(1)]
print("number of positive test samples: ", len(pos_test_ecg_id),", number of negative test samples: ", len(neg_test_ecg_id))


# III. Make Train and Val set

In [None]:
m = samples_per_patient.groupby(["id", "label"]).size().reset_index(name="Count")
m

In [None]:
pos_ecgs = m[m["label"] == "positive"]
neg_ecgs = m[m["label"] == "negative"]

In [None]:
#turn the ECG ids into a Series named "ecg_id" so the pandas string methods can be used on them
#built directly (no DataFrame detour), which also works for an empty list
not_tested = pd.Series(list(not_tested), name="ecg_id", dtype=object)

In [None]:
pos_train_val_ids = pos_ecgs[~pos_ecgs["id"].isin(pos_test)]["id"] #get all id's that are not in the test set
neg_train_val_ids = neg_ecgs[~neg_ecgs["id"].isin(neg_test)]["id"]

In [None]:
len(pos_ecgs["id"]) + len(neg_ecgs["id"])

In [None]:
len(pos_train_val_ids) + len(neg_train_val_ids)

In [None]:
len(pos_test) + len(neg_test)

In [None]:
len(pos_train_val_ids) + len(neg_train_val_ids) + len(pos_test) + len(neg_test)

In [None]:
#get all ECG ids that belong to a positive patient that is not in the test set
#an ECG id is "<patient pseudo id>_<suffix>", so the part before the first "_" is compared
#exactly; a substring search also matches ECGs of other patients whose id merely contains
#this id, and it treats the id as a regular expression
ecg_patient_ids = not_tested.str.split("_", n=1).str[0]

pos_train_val_ecg_id = []
for patient_id in pos_train_val_ids:
    own_ecgs = not_tested.loc[ecg_patient_ids == str(patient_id)]
    pos_train_val_ecg_id.extend(own_ecgs.values)

In [None]:
#get all ECG ids that belong to a negative patient that is not in the test set
#the part of the ECG id before the first "_" is compared exactly; a substring search also
#matches ECGs of other patients whose id merely contains this id
ecg_patient_ids = not_tested.str.split("_", n=1).str[0]

neg_train_val_ecg_id = []
for patient_id in neg_train_val_ids:
    own_ecgs = not_tested.loc[ecg_patient_ids == str(patient_id)]
    neg_train_val_ecg_id.extend(own_ecgs.values)

In [None]:
print(len(pos_train_val_ecg_id), len(neg_train_val_ecg_id))

In [None]:
#validation size per class: 5% of all train/val ECGs (positives and negatives together),
#so the validation set gets the same number of positive and negative ECGs
#random.sample raises ValueError if a class has fewer ECGs than this number
#NOTE(review): ECGs are drawn individually, so one patient can have ECGs in both train and val - confirm this is intended
n_pos_or_neg_val = round((len(pos_train_val_ecg_id) + len(neg_train_val_ecg_id))*0.05)
pos_val = random.sample(pos_train_val_ecg_id, n_pos_or_neg_val)
neg_val = random.sample(neg_train_val_ecg_id, n_pos_or_neg_val)

In [None]:
#training set = train/val ECGs that were not drawn for validation (duplicates removed)
#filtered in list order instead of with a set operation: the order of list(set(...)) changes
#between interpreter runs, which would make later seeded sampling from these lists irreproducible
pos_val_lookup = set(pos_val)
neg_val_lookup = set(neg_val)
pos_train = [ecg_id for ecg_id in dict.fromkeys(pos_train_val_ecg_id) if ecg_id not in pos_val_lookup]
neg_train = [ecg_id for ecg_id in dict.fromkeys(neg_train_val_ecg_id) if ecg_id not in neg_val_lookup]

In [None]:
print("train , val, test proportion out of total positive samples: ")
print(len(pos_train)/tot_positives, len(pos_val)/tot_positives, len(pos_test_ecg_id)/tot_positives)
print("")
print("train , val, test proportion out of total neagtive samples: ")
print(len(neg_train)/tot_negatives, len(neg_val)/tot_negatives, len(neg_test_ecg_id)/tot_negatives)

# IV. Check Filter Distribution in Train set

In [None]:
diff_str, pos_train_filters, transformed_files = check_and_process_data(pos_train)
diff_str, neg_train_filters, transformed_files = check_and_process_data(neg_train)

In [None]:
def analyse_filter_dist(df):
    """Count the samples per (high_pass, low_pass, ac) filter combination and class.

    ``df`` needs the columns "high_pass", "low_pass", "ac" and "label". The result
    has one row per combination and label with the columns "Count",
    "percentage_by_class" (share of the label's samples, in percent) and
    "combination" (the three filter values as a tuple), sorted by label and
    percentage, both descending.
    """
    counts = (
        df.groupby(["high_pass", "low_pass", "ac", "label"])
        .size()
        .reset_index(name="Count")
    )

    samples_in_class = counts.groupby("label")["Count"].transform("sum")
    counts["percentage_by_class"] = 100 * counts["Count"] / samples_in_class
    counts["combination"] = list(zip(counts["high_pass"], counts["low_pass"], counts["ac"]))

    return counts.sort_values(by=["label", "percentage_by_class"], ascending=False)

In [None]:
p_filter_combo = analyse_filter_dist(pos_train_filters)
n_filter_combo = analyse_filter_dist(neg_train_filters)

In [None]:
n_filter_combo_head = n_filter_combo.head(5)

#positive-class rows for the same five combinations, in the order of the negative top 5
#the combinations are looked up in the full positive table: reindexing only the positive
#top 5 leaves NaN for a combination positives do have but that is not in their own top 5
p_filter_combo_head = (
    p_filter_combo
    .set_index("combination")
    .reindex(index = n_filter_combo_head["combination"])
    .reset_index()
)

In [None]:
n_filter_combo

In [None]:
p_filter_combo

In [None]:
n_filter_combo_head

In [None]:
p_filter_combo_head

## Top 5 filter combos per class

In [None]:
ind = np.arange(p_filter_combo_head.shape[0])
width = 0.35

fig, ax = plt.subplots(figsize=(20, 12.5))
rects_neg = ax.bar(ind - width/2, n_filter_combo_head["percentage_by_class"], width, label = "Negative")
rects_pos = ax.bar(ind + width/2, p_filter_combo_head["percentage_by_class"], width, label = "Positive")
ax.set_ylabel("Percentage of samples")
ax.set_title("Top 5 percentage of samples per filter combination per class")
ax.set_xticks(ind)
y_labels = list(n_filter_combo_head["combination"])
ax.set_xticklabels(y_labels)
ax.legend()

## Difference in distributions

In [None]:
#percentage per filter combination for both classes side by side; a combination that
#occurs in only one class counts as 0% in the other
diff1 = (
    n_filter_combo[["combination", "percentage_by_class"]]
    .merge(
        p_filter_combo[["combination", "percentage_by_class"]],
        how = "outer",
        on = "combination",
        suffixes = ["_neg", "_pos"],
    )
    .fillna(0)
)
diff1["difference"] = diff1["percentage_by_class_neg"] - diff1["percentage_by_class_pos"]

diff1

In [None]:
#deliberate stop for "Run All": review the differences between the classes in the table above
#and adjust the settings of section V before running the remaining cells
raise RuntimeError("stop here")

# V. Modify filter distribution in training set - change this if removing samples with no DNA data

In [None]:
#biggest difference in (16,150, 50)
#difference is approx 16%, want percentage differences to be at most 5% 
#decrease 16 150 50 until it's 13%
#
#removing x samples of this combination changes its share of the negative training set to
#(count - x) / (total - x); setting that equal to goal and solving for x gives the formula below

goal = 0.13 #change this according to distribution differences found between classes in previous section
total =  sum(n_filter_combo["Count"]) 
combo_counts = n_filter_combo[n_filter_combo["combination"] == ("16", "150", "50")]["Count"].values
if len(combo_counts) == 0:
    raise ValueError("filter combination ('16', '150', '50') does not occur in the negative training set")
n_sixteen_150_fifty = combo_counts[0]

#never negative: if the share is already at or below goal, nothing has to be removed
x = max(0, round((n_sixteen_150_fifty - goal*total) / (1-goal)))

In [None]:
mask = (neg_train_filters["high_pass"] == "16") & (neg_train_filters["low_pass"] == "150") & (neg_train_filters["ac"] == "50")
sixteen_150_fifty = neg_train_filters[mask]

In [None]:
remove_from_train = random.sample(list(sixteen_150_fifty["id"]), x)

In [None]:
#strip the negative directory and the ".json" extension so only the ECG id is left
substr = path_negative + "/"
remove_from_train_id = [
    file_path.split(substr, 1)[1].split(".json", 1)[0]
    for file_path in remove_from_train
]

In [None]:
#negative training set without the removed samples
#a plain difference in list order: symmetric_difference would add any removed id that is not
#in neg_train, and list(set(...)) returns the ids in an order that changes between runs
removed_lookup = set(remove_from_train_id)
neg_train_final = [ecg_id for ecg_id in dict.fromkeys(neg_train) if ecg_id not in removed_lookup]

In [None]:
#with to_val the samples removed from training are added to the negative validation set
neg_val_final = neg_val + remove_from_train_id if to_val else neg_val

In [None]:
print(len(neg_train), x, len(neg_train_final), len(neg_val), len(neg_val_final))

## Check data proportions per set

In [None]:
#denominator for the negative proportions printed below: samples that were removed from
#training and not moved to validation are no longer part of any set, so they are subtracted
#NOTE: this cell overwrites tot_negatives, so running it more than once subtracts x again;
#re-run the cell that computes tot_negatives first
if to_val:
    tot_negatives = tot_negatives
else:
    tot_negatives = tot_negatives - x

In [None]:
#proportion train/val/test per class
print(len(pos_train)/tot_positives, len(pos_val)/tot_positives, len(pos_test_ecg_id)/tot_positives)
print(len(neg_train_final)/tot_negatives, len(neg_val_final)/tot_negatives, len(neg_test_ecg_id)/tot_negatives)

In [None]:
n = tot_positives + tot_negatives

#proportion train/val/test
print((len(pos_train)+len(neg_train_final))/n, 
      (len(pos_val)+len(neg_val_final))/n,
      (len(pos_test_ecg_id) + len(neg_test_ecg_id))/n)


In [None]:
#positive and negative samples in train val test
print(len(pos_train),len(neg_train_final), 
      len(pos_val),len(neg_val_final),
      len(pos_test_ecg_id), len(neg_test_ecg_id))

In [None]:
#class imbalance before filter cleaning
print(len(neg_train)/len(pos_train), len(neg_val)/len(pos_val), len(neg_test_ecg_id)/len(pos_test_ecg_id))
#class imbalance after filter cleaning
print(len(neg_train_final)/len(pos_train), len(neg_val_final)/len(pos_val))

In [None]:
n

# VI. Check Filter Distribution in Training Set after Redistribution

In [None]:
#store file name with corresponding directory
diff_str, neg_train_filters, transformed_files = check_and_process_data(neg_train_final)

In [None]:
#get df with number of samples per filter combo
n_filter_combo = analyse_filter_dist(neg_train_filters)

In [None]:
#get the 5 highest proportions of the negative class and the positive proportions of the
#same combinations, in the same order
n_filter_combo_head = n_filter_combo.head(5)

#the combinations are looked up in the full positive table: reindexing only the positive
#top 5 leaves NaN for a combination positives do have but that is not in their own top 5
p_filter_combo_head = (
    p_filter_combo
    .set_index("combination")
    .reindex(index = n_filter_combo_head["combination"])
    .reset_index()
)

In [None]:
n_filter_combo

In [None]:
p_filter_combo

In [None]:
n_filter_combo_head

In [None]:
p_filter_combo_head

## Top 5 filter combos per class

In [None]:
ind = np.arange(p_filter_combo_head.shape[0])
width = 0.35

fig, ax = plt.subplots(figsize=(20, 12.5))
rects_neg = ax.bar(ind - width/2, n_filter_combo_head["percentage_by_class"], width, label = "Negative")
rects_pos = ax.bar(ind + width/2, p_filter_combo_head["percentage_by_class"], width, label = "Positive")
ax.set_ylabel("Percentage of samples")
ax.set_title("Top 5 percentage of samples per filter combination per class")
ax.set_xticks(ind)
y_labels = list(n_filter_combo_head["combination"])
ax.set_xticklabels(y_labels)
ax.legend()

## Difference in distributions

In [None]:
#percentage per filter combination for both classes side by side after the redistribution;
#a combination that occurs in only one class counts as 0% in the other
diff2 = (
    n_filter_combo[["combination", "percentage_by_class"]]
    .merge(
        p_filter_combo[["combination", "percentage_by_class"]],
        how = "outer",
        on = "combination",
        suffixes = ["_neg", "_pos"],
    )
    .fillna(0)
)
diff2["difference"] = diff2["percentage_by_class_neg"] - diff2["percentage_by_class_pos"]

diff2

# VII. Data Processing - from here on, same code if delete samples with no DNA data

## Get lead data, store as npy files, store labels, store dictionary with train/val/test split

In [None]:
#downsample leads measured at double speed
#keeps values at every other index
def downsample(lead):
    """Return every other sample of a 5000-sample lead; other leads are returned as they are.

    Same function as the ``downsample`` defined earlier in this notebook.
    """
    needs_halving = len(lead) == 5000
    if not needs_halving:
        return lead
    even_positions = np.arange(0, 5000, 2)
    return lead[even_positions]

In [None]:
def write_to_npy(file_id, file_directory, array):
    """Save ``array`` as "<file_directory>/<file_id>.npy". Returns nothing."""
    np.save(file_directory + "/" + str(file_id) + ".npy", array)

In [None]:
def process_save_data(list_of_ecg_ids, labels, pos_or_neg, samples_path):
    """Decode the given ECGs, save each as an .npy file and record its label.

    Parameters
    ----------
    list_of_ecg_ids : iterable of str
        ECG ids (file names without ".json") that all belong to one class.
    labels : list
        Receives one ``[ecg_id, pos_or_neg]`` entry per ECG (extended in place).
    pos_or_neg : int
        0 for the negative class, 1 for the positive class.
    samples_path : str
        Directory the .npy files are written to.

    Returns
    -------
    list
        ``labels``, including the new entries.

    Raises
    ------
    ValueError
        If ``pos_or_neg`` is not 0 or 1, or a file is not in the directory of that class.
    """
    if pos_or_neg == 0:
        substr = path_negative + "/"
    elif pos_or_neg == 1:
        substr = path_positive + "/"
    else:
        raise ValueError("pos_or_neg must be 0 (negative) or 1 (positive), got " + repr(pos_or_neg))

    json_ids = [sub + ".json" for sub in list_of_ecg_ids]
    directories = create_directories(json_ids)

    for filename in directories:
        #the directory comes from the first digit of the id; it has to agree with the label
        if not filename.startswith(substr):
            raise ValueError(filename + " is not in the directory of class " + str(pos_or_neg))

        with open(filename) as f:
            data = json.load(f)

        #strip directory and ".json" to get the ECG id
        ecg_id = filename[len(substr):].split(".json", 1)[0]

        ecg = data["RestingECG"]
        waveform = pd.DataFrame(ecg["Waveform"])
        waveform_rhythm = pd.DataFrame(waveform[waveform["WaveformType"]=="Rhythm"])
        lead_data = waveform_rhythm["LeadData"]
        *decoded_leads, diff_str = get_lead_data(lead_data, [], filename)

        #report leads that are neither 2500 nor 5000 samples long instead of dropping the finding
        if diff_str:
            print("Warning: unexpected lead length in sample ", filename)

        #downsample all leads with 5000 measurements to 2500 measurements
        #and put different time stamps as different rows
        leads = np.array([downsample(lead) for lead in decoded_leads]).T

        #save label in different list
        labels.append([ecg_id, pos_or_neg])

        #check NaNs
        if np.isnan(leads).any():
            print("Warning: NaNs in sample ", filename)

        #write to npy file
        write_to_npy(ecg_id, samples_path, leads)

    return labels

In [None]:
#save every set of both classes; positives are labelled 1, negatives 0
labels = []
for ecg_ids, class_label in (
    (pos_train, 1),
    (pos_val, 1),
    (pos_test_ecg_id, 1),
    (neg_train_final, 0),
    (neg_val_final, 0),
    (neg_test_ecg_id, 0),
):
    labels = process_save_data(ecg_ids, labels, class_label, samples_path)

In [None]:
#save labels as npy file
labels = np.array(labels)
np.save(labels_path, labels) 

In [None]:
#ECG ids of each set, positives first
train_val_test_dict = dict(
    train=pos_train + neg_train_final,
    val=pos_val + neg_val_final,
    test=pos_test_ecg_id + neg_test_ecg_id,
)

#save the train, val and test ids for future use; the with-block closes the file
with open(split_path, "w") as fp:
    json.dump(train_val_test_dict, fp)


# VIII. Example of Loading Data

In [None]:
example = np.load(samples_path + "20109_2.npy")
print(example.shape)
example

# IX. Visualization

In [None]:
def plot_leads(file_id_nr):
    """Plot the eight leads of one saved sample in a 4 x 2 grid with shared axes.

    ``file_id_nr`` is the file name without ".npy" inside ``samples_path``. The
    columns of the stored array are plotted, in order, as leads I, II and V1-V6.
    """
    df = np.load(samples_path + str(file_id_nr) + ".npy")

    t = np.arange(df.shape[0])
    fig, axis = plt.subplots(4, 2, sharex=True, sharey=True, figsize=(25, 20))

    #axis.flat walks the grid row by row, the same order in which the leads are stored
    lead_names = ("I", "II", "V1", "V2", "V3", "V4", "V5", "V6")
    for column, (panel, lead_name) in enumerate(zip(axis.flat, lead_names)):
        panel.plot(t, df[:, column])
        panel.set_title("Lead " + lead_name)

    name = "plot_" + str(file_id_nr)
    #fig.savefig(name)

    return

In [None]:
plot_leads("20109_2")