# Imports

In [None]:
from base64 import b64decode
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib widget
%matplotlib inline
import json
import os
import glob
import datetime
import gc
from scipy import signal

# Paths to data

In [None]:
# SPSS (.sav) file with the clinical records
path_clinical_data = "20210305Ajmaline_AI.sav"

# example ECG records, one directory per BrS test outcome
path_positive = "Example_data/positive"
path_negative = "Example_data/negative"

# Functions

In [None]:
# formats the dates in "clinical" as "%m-%d-%Y" strings so that they can be
# compared with the AcquisitionDate of an ECG record
# returns dictionary of patient id and test date
def id_w_date(clinical):
    """Map every patient ID to the date of its provocation test.

    Args:
        clinical: DataFrame with an "ID" column whose values can be passed to
            ``int`` and a "Provocation_date" column that ``pd.to_datetime``
            can parse. The frame is left unmodified.

    Returns:
        dict mapping the ID as a string (e.g. ``"12"``) to the test date
        formatted as ``"%m-%d-%Y"``. When an ID occurs more than once the
        last row wins.
    """
    # convert into a local Series rather than writing back into the caller's frame
    dates = pd.to_datetime(clinical["Provocation_date"])

    # zip pairs the two columns by position, so any DataFrame index works
    return {
        str(int(patient_id)): date.strftime("%m-%d-%Y")
        for patient_id, date in zip(clinical["ID"], dates)
    }

In [None]:
def is_ajmaline_tested(data, ajmaline_id_date):
    """Tell whether an ECG record was taken on the patient's ajmaline test date.

    Args:
        data: parsed ECG record; must contain a "RestingECG" mapping
            (a missing "RestingECG" key raises ``KeyError``).
        ajmaline_id_date: dict mapping patient ID strings to test-date strings.

    Returns:
        True only if the record carries both a PatientID and an
        AcquisitionDate and that pair is present in ``ajmaline_id_date``.
        Records lacking either field are assumed not to be ajmaline tested
        and yield False.
    """
    resting_ecg = data["RestingECG"]
    if "PatientDemographics" not in resting_ecg or "TestDemographics" not in resting_ecg:
        return False

    p_demographics = resting_ecg["PatientDemographics"]
    t_demographics = resting_ecg["TestDemographics"]
    if "PatientID" not in p_demographics or "AcquisitionDate" not in t_demographics:
        return False

    # the lookup keys are strings, so normalise the ID the same way as the date;
    # otherwise an ID stored as a JSON number could never match
    patient_id = str(p_demographics["PatientID"])
    test_date = str(t_demographics["AcquisitionDate"])

    return patient_id in ajmaline_id_date and ajmaline_id_date[patient_id] == test_date

In [None]:
def get_lead_data(data, base_name, pos_or_neg):
    """Decode the eight stored leads of the rhythm ECG of one record.

    Args:
        data: parsed ECG record; ``data["RestingECG"]["Waveform"]`` must hold
            the waveform entries, one of them with WaveformType "Rhythm".
        base_name: file name stored alongside every decoded lead.
        pos_or_neg: outcome label stored alongside every decoded lead.

    Returns:
        Tuple of eight object arrays ``[base_name, pos_or_neg, samples]``,
        one per entry 0..7 of the rhythm waveform's LeadData. ``samples`` is
        the base64-decoded int16 signal minus the lead's
        LeadOffsetFirstSample (a float array).
        NOTE(review): the original code names entries 0..7 as leads
        I, II, V1, V2, V3, V4, V5, V6; the stored order is not checked here.

    Raises:
        KeyError: if the record has no "Rhythm" waveform.
    """
    n_leads = 8

    #get waveform info
    waveform = pd.DataFrame(data["RestingECG"]["Waveform"])

    #use rhythm ECG (not median ECG)
    rhythm_ecg = waveform[waveform["WaveformType"] == "Rhythm"]
    if rhythm_ecg.empty:
        raise KeyError(f"no 'Rhythm' waveform found in {base_name}")

    #take the rhythm entry by position, so it does not matter at which
    #place of the Waveform list the rhythm ECG is stored
    lead_data = rhythm_ecg["LeadData"].iloc[0]

    #decode ECG data per lead and subtract the offset
    leads = []
    for index in range(n_leads):
        entry = lead_data[index]
        leadoffset = float(entry["LeadOffsetFirstSample"])
        raw = np.frombuffer(b64decode(entry["WaveFormData"]), dtype=np.int16)
        decoded = raw - leadoffset
        leads.append(np.array([base_name, pos_or_neg, decoded], dtype=object))

    return tuple(leads)

In [None]:
def read_files(directory, clinical, pos_or_neg):
    """Load every ECG json file in ``directory`` and collect its decoded leads.

    Records that ``is_ajmaline_tested`` recognises (patient ID and acquisition
    date match an entry of the clinical data) are skipped.

    Args:
        directory: folder whose entries are all read as json files.
        clinical: clinical DataFrame passed on to ``id_w_date``.
        pos_or_neg: outcome label stored with every record
            (0: negative test, 1: positive test).

    Returns:
        Tuple of eight lists, in the order returned by ``get_lead_data``.
        Each list holds one ``[base_name, pos_or_neg, samples]`` array per
        kept file.
    """
    ajmaline_id_date = id_w_date(clinical)

    #one accumulator per lead, in the order returned by get_lead_data
    n_leads = 8
    per_lead = [[] for _ in range(n_leads)]

    #glob returns names in arbitrary order; sorting keeps the row order reproducible
    for name in sorted(glob.glob(directory + "/*")):
        #the with-block closes the file as soon as it has been parsed
        with open(name) as f:
            data = json.load(f)

        #only keep file if not tested with ajmaline
        if is_ajmaline_tested(data, ajmaline_id_date):
            continue

        #get file name without directory
        base_name = os.path.basename(name)

        #extract ecg lead data and decode, save lead per array with base_name and test outcome
        decoded_leads = get_lead_data(data, base_name, pos_or_neg)
        for collected, lead in zip(per_lead, decoded_leads):
            collected.append(lead)

    return tuple(per_lead)

In [None]:
#list the distinct record lengths that occur in one lead
def number_of_measurements(lead):
    """Return the distinct sample counts of ``lead`` in order of first appearance.

    ``lead`` is a 2-D array indexed as ``lead[row, 2]``, where column 2 holds
    the samples of one record.
    """
    lengths = (len(lead[row, 2]) for row in range(lead.shape[0]))
    #dict keys keep insertion order and drop repeats
    return list(dict.fromkeys(lengths))


#halve records that hold 5000 samples
#keeps the values at every other index (0, 2, 4, ...)
def downsample(lead):
    """Reduce every 5000-sample record of ``lead`` to 2500 samples, in place.

    Records of any other length are left as they are. The same ``lead``
    array is returned for convenience.
    """
    every_other = np.arange(0, 5000, 2)
    for row in range(lead.shape[0]):
        samples = lead[row, 2]
        if len(samples) != 5000:
            continue
        lead[row, 2] = samples[every_other]
    return lead

In [None]:
#turns the array of records of one lead into a pandas DataFrame
def lead_to_df(lead):
    """Build a DataFrame with one row per record of ``lead``.

    Columns are "t0" ... "t2499" (the samples, taken from ``lead[:, 2]``)
    followed by "base_name" (``lead[:, 0]``) and "outcome" (``lead[:, 1]``).
    """
    sample_columns = [f"t{i}" for i in range(2500)]
    rows = [np.ravel(samples) for samples in lead[:, 2]]

    frame = pd.DataFrame(rows, columns=sample_columns)
    frame["base_name"] = lead[:, 0]
    frame["outcome"] = lead[:, 1]
    return frame

In [None]:
#re-attach "outcome" and "base_name" to a frame of derived samples
def _with_metadata(samples, template):
    """Return ``samples`` plus trailing "base_name" and "outcome" columns.

    ``samples`` and ``template`` are indexed by base_name; "outcome" is taken
    from ``template``.
    """
    lead = samples.assign(outcome=template["outcome"]).reset_index()
    #reset_index puts base_name first; move it right in front of "outcome"
    names = lead.pop("base_name")
    lead.insert(lead.shape[1] - 1, "base_name", names)
    return lead


#calculate leads III, aVL, aVR, aVF
def calculate_missing_leads(lead_I, lead_II):
    """Derive leads III, aVL, aVR and aVF from leads I and II.

    Args:
        lead_I, lead_II: DataFrames with sample columns plus a "base_name"
            and an "outcome" column. The inputs are not modified.

    Returns:
        Tuple ``(lead_III, lead_aVL, lead_aVR, lead_aVF)`` of DataFrames laid
        out as sample columns, "base_name", "outcome". "outcome" is copied
        from ``lead_I``. The leads are computed sample by sample as
        III = II - I, aVL = (I - III) / 2, aVR = -(I + II) / 2 and
        aVF = (II + III) / 2.
    """
    lead_I = lead_I.set_index("base_name")
    lead_II = lead_II.set_index("base_name")

    #with base_name as index, "outcome" is the only non-sample column left;
    #dropping it by name keeps every sample, including the last one
    samples_I = lead_I.drop(columns="outcome")
    samples_II = lead_II.drop(columns="outcome")

    #calculate leads
    samples_III = samples_II - samples_I
    samples_aVL = (samples_I - samples_III) / 2.0
    samples_aVR = (samples_I + samples_II) / (-2.0)
    samples_aVF = (samples_II + samples_III) / 2.0

    return (
        _with_metadata(samples_III, lead_I),
        _with_metadata(samples_aVL, lead_I),
        _with_metadata(samples_aVR, lead_I),
        _with_metadata(samples_aVF, lead_I),
    )

In [None]:
#normalise each row by its maximum value
def normalize(lead):
    """Divide the samples of every row by that row's maximum, in place.

    Args:
        lead: DataFrame whose last two columns are not samples; every other
            column is scaled.

    Returns:
        The same ``lead`` object, modified in place.
    """
    samples = lead.iloc[:, :-2]
    max_elements = samples.max(axis=1)

    #avoid division by zero: swap a zero maximum for a tiny divisor
    #(a loop that reassigns its loop variable would leave the Series unchanged)
    max_elements = max_elements.mask(max_elements == 0, 0.0000001)

    lead.iloc[:, :-2] = samples.div(max_elements, axis=0).to_numpy()
    return lead

In [None]:
"""
#calculate fourier transform (work in progress)
def short_term_fourier_transform(lead):
    transform = lead.copy()
    for row in np.arange(lead.shape[0]):
        f,t, transform[row, :-2] = signal.stft(lead[row])
    
    return transform
"""

In [None]:
def plot_leads(row_id, lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6):
    """Plot row ``row_id`` of all twelve leads in a 6x2 grid of subplots.

    Every lead frame is sliced with ``iloc[row_id, :-2]``, i.e. its last two
    columns are not plotted. The panels share both axes and are filled row by
    row in the order I, II, III, aVL, aVR, aVF, V1 ... V6.
    """
    panels = (
        ("Lead I", lead_I),
        ("Lead II", lead_II),
        ("Lead III", lead_III),
        ("Lead aVL", lead_aVL),
        ("Lead aVR", lead_aVR),
        ("Lead aVF", lead_aVF),
        ("Lead V1", lead_V1),
        ("Lead V2", lead_V2),
        ("Lead V3", lead_V3),
        ("Lead V4", lead_V4),
        ("Lead V5", lead_V5),
        ("Lead V6", lead_V6),
    )

    #x axis: one tick per sample column of lead I
    t = np.arange(lead_I.shape[1] - 2)
    fig, axis = plt.subplots(6, 2, sharex=True, sharey=True, figsize=(6, 12))

    #axis.flat walks the grid row by row, matching the order of panels
    for panel, (title, lead) in zip(axis.flat, panels):
        panel.plot(t, lead.iloc[row_id, :-2])
        panel.set_title(title)

    plt.show()

# Data Processing

In [None]:
#Read clinical data (SPSS file) into a DataFrame
clinical = pd.read_spss(path_clinical_data)

In [None]:
#Read ECG data stored as json files
#the last argument is the outcome label stored with every record (0: negative test, 1: positive test)
#each call returns one list per lead, in the order I, II, V1, V2, V3, V4, V5, V6
lead_I_n, lead_II_n, lead_V1_n, lead_V2_n, lead_V3_n, lead_V4_n, lead_V5_n, lead_V6_n = read_files(path_negative, clinical, 0)
lead_I_p, lead_II_p, lead_V1_p, lead_V2_p, lead_V3_p, lead_V4_p, lead_V5_p, lead_V6_p = read_files(path_positive, clinical, 1)

In [None]:
#stack the negative records on top of the positive ones, lead by lead
lead_I = np.concatenate((lead_I_n, lead_I_p), axis=0)
lead_II = np.concatenate((lead_II_n, lead_II_p), axis=0)
lead_V1 = np.concatenate((lead_V1_n, lead_V1_p), axis=0)
lead_V2 = np.concatenate((lead_V2_n, lead_V2_p), axis=0)
lead_V3 = np.concatenate((lead_V3_n, lead_V3_p), axis=0)
lead_V4 = np.concatenate((lead_V4_n, lead_V4_p), axis=0)
lead_V5 = np.concatenate((lead_V5_n, lead_V5_p), axis=0)
lead_V6 = np.concatenate((lead_V6_n, lead_V6_p), axis=0)

#the per-outcome lists are no longer needed: drop them and collect garbage
del (
    lead_I_n, lead_II_n, lead_V1_n, lead_V2_n, lead_V3_n, lead_V4_n, lead_V5_n, lead_V6_n,
    lead_I_p, lead_II_p, lead_V1_p, lead_V2_p, lead_V3_p, lead_V4_p, lead_V5_p, lead_V6_p,
)
gc.collect()

In [None]:
#print the distinct record lengths of every lead, one lead per line
print(
    *(
        number_of_measurements(lead)
        for lead in (lead_I, lead_II, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6)
    ),
    sep="\n",
)

In [None]:
#records have either 2500 or 5000 measurements, bring all of them to 2500
lead_I, lead_II, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6 = (
    downsample(lead)
    for lead in (lead_I, lead_II, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6)
)

In [None]:
#check if downsampling worked: every lead should now report a single length
print(
    *(
        number_of_measurements(lead)
        for lead in (lead_I, lead_II, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6)
    ),
    sep="\n",
)

In [None]:
#transform leads to dataframes
#each name is rebound right after its conversion, so the array it replaced
#is no longer referenced by that name while the next lead is converted
lead_I = lead_to_df(lead_I)
lead_II = lead_to_df(lead_II)
lead_V1 = lead_to_df(lead_V1)
lead_V2 = lead_to_df(lead_V2)
lead_V3 = lead_to_df(lead_V3)
lead_V4 = lead_to_df(lead_V4)
lead_V5 = lead_to_df(lead_V5)
lead_V6 = lead_to_df(lead_V6)

In [None]:
#calculate missing leads (III, aVL, aVR, aVF) from leads I and II
lead_III, lead_aVL, lead_aVR, lead_aVF = calculate_missing_leads(lead_I, lead_II)

In [None]:
#plot the first record (row 0) of all twelve leads before normalising
plot_leads(0, lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6)

In [None]:
#normalise all twelve leads
(
    lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF,
    lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6,
) = (
    normalize(lead)
    for lead in (
        lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF,
        lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6,
    )
)

# Data Visualisation

In [None]:
#plot the first record (row 0) of all twelve leads again, at this point in the notebook after normalising
plot_leads(0, lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6)