# Imports

In [None]:
from base64 import b64decode
import numpy as np
import pandas as pd
import csv
import json
import os
import glob
import datetime
from pathlib import Path
import matplotlib.pyplot as plt

# Paths to data

In [None]:
# Clinical data: SPSS (.sav) file, loaded further down with pd.read_spss
path_clinical_data = "20210305Ajmaline_AI.sav"

# BrS ECG data: one directory of JSON records per test outcome
# (negative records are labelled 0, positive records 1 when processed)
path_negative = "Ajmaline_data_json/negative"
path_positive = "Ajmaline_data_json/positive"

# Directory that receives one "id-<n>.npy" sample file per kept ECG record
output_directory = "DataFilteredByDate/samples"

# NOTE: despite the name this is the path of the labels FILE (.npy), not a directory
label_directory = "DataFilteredByDate/labels/labels.npy"

# NOTE: despite the name this is the path of the info FILE (.csv) that stores
# the first and last positive sample ids, not a directory
info_directory = "DataFilteredByDate/info/info.csv"

# Functions

In [None]:
def dates(clinical):
    """Return the provocation dates in ``clinical`` as "MM-DD-YYYY" strings.

    The strings are intended for comparison with the acquisition date stored
    in each ECG record (assumed to use the same "MM-DD-YYYY" layout).

    Parameters
    ----------
    clinical : pandas.DataFrame
        Clinical table with a "Provocation_date" column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    list of str
        One formatted date per row that has a date, in row order. Rows whose
        date is missing (NaT) are skipped.

    Notes
    -----
    As before, the "Provocation_date" column of ``clinical`` is converted to
    datetime in place.
    """
    provocation = pd.to_datetime(clinical["Provocation_date"])
    clinical["Provocation_date"] = provocation

    # Vectorised formatting works on values, not index labels, so it is
    # independent of the DataFrame's index; NaT is dropped because it cannot
    # be formatted and can never match a record date anyway.
    return provocation.dropna().dt.strftime("%m-%d-%Y").tolist()

In [None]:
def is_ajmaline_tested(data, ajmaline_dates, file_name):
    """Tell whether the record's acquisition date is one of ``ajmaline_dates``.

    Only the date is compared; the patient number is not checked. When the
    date matches, the file name and the date are printed and True is
    returned. A record without "TestDemographics" or without
    "AcquisitionDate" yields False.
    """
    resting_ecg = data["RestingECG"]
    if "TestDemographics" not in resting_ecg:
        return False

    demographics = resting_ecg["TestDemographics"]
    if "AcquisitionDate" not in demographics:
        return False

    acquisition_date = str(demographics["AcquisitionDate"])
    if acquisition_date not in ajmaline_dates:
        return False

    # date found in the clinical data: report the record
    print(file_name)
    print(acquisition_date)
    print("---------")
    return True

In [None]:
def get_lead_data(data):
    """Decode the first eight leads of the rhythm ECG stored in ``data``.

    Parameters
    ----------
    data : dict
        Parsed ECG record. ``data["RestingECG"]["Waveform"]`` must hold the
        waveform entries, each with a "WaveformType" and a "LeadData" list
        whose items provide "LeadOffsetFirstSample" and base64-encoded
        "WaveFormData" (decoded here as 16-bit integers).

    Returns
    -------
    tuple of 8 numpy.ndarray
        The decoded samples of lead entries 0..7 with the lead's
        "LeadOffsetFirstSample" subtracted. The caller unpacks them as
        I, II, V1, V2, V3, V4, V5, V6; that order is assumed from the
        position in "LeadData" and is not checked against lead identifiers.

    Raises
    ------
    KeyError
        If the record contains no waveform of type "Rhythm".
    IndexError
        If the rhythm waveform has fewer than eight lead entries.
    """
    waveform = pd.DataFrame(data["RestingECG"]["Waveform"])

    # use rhythm ECG (not median ECG)
    rhythm_leads = waveform.loc[waveform["WaveformType"] == "Rhythm", "LeadData"]
    if rhythm_leads.empty:
        raise KeyError("no waveform with WaveformType 'Rhythm' in record")

    # Pick the first rhythm waveform by position. A label lookup such as
    # lead_data[1] only works when the rhythm waveform happens to be the
    # second entry of the record.
    lead_entries = rhythm_leads.iloc[0]

    decoded_leads = []
    for index in range(8):
        entry = lead_entries[index]
        offset = float(entry["LeadOffsetFirstSample"])
        samples = np.frombuffer(b64decode(entry["WaveFormData"]), dtype=np.int16)
        # subtracting a float yields a new float array, leaving the buffer untouched
        decoded_leads.append(samples - offset)

    return tuple(decoded_leads)

In [None]:
def downsample(lead):
    """Halve a lead that was recorded at double speed.

    A lead with exactly 5000 samples is reduced to the 2500 samples found at
    the even positions (0, 2, 4, ...). Any other lead is returned as is.
    """
    if len(lead) != 5000:
        return lead

    even_positions = np.arange(0, 5000, 2)
    return lead[even_positions]

In [None]:
def calculate_missing_leads(lead_I, lead_II):
    """Derive leads III, aVL, aVR and aVF from leads I and II.

    Returns the tuple (III, aVL, aVR, aVF) where
    III = II - I, aVL = (I - III) / 2, aVR = (I + II) / -2 and
    aVF = (II + III) / 2.
    """
    third = lead_II - lead_I

    augmented_left = (lead_I - third) / 2.0
    augmented_right = (lead_I + lead_II) / (-2.0)
    augmented_foot = (lead_II + third) / 2.0

    return third, augmented_left, augmented_right, augmented_foot

In [None]:
def normalize(lead):
    """Divide ``lead`` by its maximum value.

    When the maximum is exactly 0 a tiny positive divisor (1e-10) is used
    instead, so the division never divides by zero.
    """
    peak = max(lead)
    divisor = 0.0000000001 if peak == 0 else peak
    return lead / divisor

In [None]:
def write_to_npy(file_id, file_directory, array):
    """Save ``array`` as "id-<file_id>.npy" inside ``file_directory``.

    The directory is joined to the file name with a forward slash and must
    already exist. Nothing is returned.
    """
    target = "/".join((file_directory, "id-" + str(file_id) + ".npy"))
    np.save(target, array)

In [None]:
def process_data(directory, clinical, pos_or_neg, output_directory, file_id_index, labels):
    """Convert the ECG json records in ``directory`` to .npy samples.

    Every file in ``directory`` is loaded as json. A record is skipped when
    its acquisition date equals one of the provocation dates in ``clinical``
    (only the date is compared, not the patient). For every kept record the
    eight leads are decoded, leads with 5000 samples are downsampled to 2500,
    and the result is saved as "id-<n>.npy" in ``output_directory`` with one
    row per time stamp and one column per lead.

    Parameters
    ----------
    directory : str
        Directory holding the json records.
    clinical : pandas.DataFrame
        Clinical data, passed on to ``dates``.
    pos_or_neg : int
        Label stored for every record of this directory
        (0: negative test, 1: positive test).
    output_directory : str
        Existing directory that receives the sample files.
    file_id_index : int
        Id given to the first kept record; incremented per kept record.
    labels : list
        List that ``["id-<n>", pos_or_neg]`` entries are appended to
        (modified in place).

    Returns
    -------
    tuple
        ``(next unused id, labels)``.
    """
    ajmaline_dates = dates(clinical)

    # glob returns files in file-system dependent order; sorting makes the
    # assignment of sample ids reproducible between runs and machines
    for name in sorted(glob.glob(directory + "/*")):
        # the context manager closes the file even when parsing fails
        with open(name) as f:
            data = json.load(f)

        # only keep the record if its date does not appear in the clinical data
        if is_ajmaline_tested(data, ajmaline_dates, name):
            continue

        # decode the eight leads and downsample those with 5000 measurements
        # to 2500; transpose so that time stamps are rows and leads are columns
        leads = np.array([downsample(lead) for lead in get_lead_data(data)]).T

        # save label in separate list
        labels.append(["id-" + str(file_id_index), pos_or_neg])

        # check NaNs
        if np.isnan(leads).any():
            print("Warning: NaNs in sample ", file_id_index)

        # write to npy file
        write_to_npy(file_id_index, output_directory, leads)
        file_id_index = file_id_index + 1

    return file_id_index, labels

# Read and Process Data

In [None]:
#Read clinical data (SPSS file)
clinical = pd.read_spss(path_clinical_data)

#Read ECG data stored as json files
#Negative records are processed first (label 0, ids starting at 0). The id returned
#by the first call is where the positive records (label 1) start, and the second
#call keeps appending to the same labels list.
next_file_id, labels = process_data(path_negative, clinical, 0, output_directory, 0, [])
last_file_id, labels = process_data(path_positive, clinical, 1, output_directory, next_file_id, labels)

In [None]:
#save first and last positive sample indexes
#next_file_id is the first id given to a positive record; last_file_id is one past
#the last id written, hence the "- 1". If no positive record was kept, the second
#value ends up smaller than the first.
header = ["first_positive_sample_idx", "last_positive_sample_idx"]
data = [next_file_id, last_file_id -1]
with open(info_directory, "w", newline = "") as csv_file: 
        csv_writer = csv.writer(csv_file, delimiter = ",")
        csv_writer.writerow(header)
        csv_writer.writerow(data)

In [None]:
#save labels as npy file
#each entry is ["id-<n>", 0 or 1]; since the pairs mix str and int, np.array is
#expected to store both columns as strings (outcome read back as "0"/"1") - verify on load
labels = np.array(labels)
np.save(label_directory, labels) 

# Example of Loading Data

In [None]:
#load the saved sample with id 16 (assumes at least 17 records were kept and written)
example = np.load("DataFilteredByDate/samples/id-16.npy")

In [None]:
#show the loaded array as cell output
example

In [None]:
#show the array dimensions; samples are saved as (time stamps, leads)
example.shape

# Visualization

In [None]:
def plot_leads(file_id_nr):
    """Plot the eight leads of one saved sample and save the figure.

    Loads "DataFilteredByDate/samples/id-<file_id_nr>.npy", draws each of its
    first eight columns against the sample index in a 4x2 grid of subplots
    with shared axes, and saves the figure under the name
    "plot_<file_id_nr>". Nothing is returned.
    """
    samples = np.load("DataFilteredByDate/samples/id-" + str(file_id_nr) + ".npy")
    time_axis = np.arange(samples.shape[0])

    fig, axis = plt.subplots(4, 2, sharex=True, sharey=True, figsize=(25, 20))

    titles = (
        "Lead I",
        "Lead II",
        "Lead V1",
        "Lead V2",
        "Lead V3",
        "Lead V4",
        "Lead V5",
        "Lead V6",
    )

    # axis.flat walks the grid row by row, so column k of the sample lands in
    # panel (k // 2, k % 2)
    for column, (panel, title) in enumerate(zip(axis.flat, titles)):
        panel.plot(time_axis, samples[:, column])
        panel.set_title(title)

    fig.savefig("plot_" + str(file_id_nr))

In [None]:
#plot the sample with id 16 and save the figure as "plot_16"
plot_leads(16)