# Imports

In [None]:
from base64 import b64decode
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
import glob
import datetime
import gc
from scipy import signal
import csv
from pathlib import Path
%matplotlib widget
%matplotlib inline
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Paths to data

In [None]:
#clinical data (SPSS .sav file, loaded further down with pd.read_spss)
path_clinical_data = "20210305Ajmaline_AI.sav"

#negative and positive BrS examples
#(directories whose files are read as json ECG records by process_data)
path_negative = "Ajmaline_data_json/negative"
path_positive = "Ajmaline_data_json/positive"

# Functions

In [None]:
#fill csv file at specified directory
#action: "w" to write csv file, "a" to append new rows
def write_to_csv(data, file_path, action):
    """Write rows to a comma-separated file.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        file_path: Destination path, handed to ``open``.
        action: File mode for ``open``: "w" to (over)write, "a" to append.
    """
    #newline="" leaves line-ending handling to the csv module;
    #the with-statement closes the file, so no explicit close() is needed
    with open(file_path, action, newline="") as csv_file:
        csv.writer(csv_file, delimiter=",").writerows(data)

#specify directory to save csv file of data after processing
dir_path = Path("Output")
file_name = "output.csv"
file_path = dir_path.joinpath(file_name)

#make sure the output directory exists; opening the csv file for writing
#does not create missing directories
dir_path.mkdir(parents=True, exist_ok=True)

#make csv file and fill with column names
#one name per field of the rows that process_data appends:
#file name, outcome, time stamp, then the twelve leads
header_row = ["file_name", "outcome", "time_stamp", "lead_I", "lead_II", "lead_III", "lead_aVL", "lead_aVR",
              "lead_aVF", "lead_V1", "lead_V2", "lead_V3", "lead_V4", "lead_V5", "lead_V6"]
data = [header_row]
write_to_csv(data, file_path, "w")

In [None]:
#puts dates in "clinical" in same format as in ECG records
#returns dictionary of patient id and test date
def id_w_date(clinical):
    """Map each patient id to its provocation-test date string.

    Args:
        clinical: DataFrame with an "ID" column (numeric patient ids) and a
            "Provocation_date" column that ``pd.to_datetime`` can parse.

    Returns:
        dict mapping ``str(int(ID))`` to the date formatted as "%m-%d-%Y".
        When an id occurs in several rows, the last row wins.
    """
    #parse into a local Series so the caller's DataFrame is left untouched
    test_dates = pd.to_datetime(clinical["Provocation_date"])

    #pair the two columns positionally, so any index works
    #(not only the default 0..n-1 one)
    return {
        str(int(patient_id)): test_date.strftime("%m-%d-%Y")
        for patient_id, test_date in zip(clinical["ID"], test_dates)
    }

In [None]:
def is_ajmaline_tested(data, ajmaline_id_date):
    """Tell whether an ECG record matches a known ajmaline test.

    Returns True only when the record has both a PatientID and an
    AcquisitionDate, the PatientID is a key of ``ajmaline_id_date`` and the
    date stored for it equals the acquisition date (compared as a string).
    If any of those pieces is missing, the record is assumed not to be
    ajmaline tested and False is returned.
    """
    resting_ecg = data["RestingECG"]

    #both demographics sections are needed to identify patient and date
    if "PatientDemographics" not in resting_ecg or "TestDemographics" not in resting_ecg:
        return False
    patient_info = resting_ecg["PatientDemographics"]
    test_info = resting_ecg["TestDemographics"]

    if "PatientID" not in patient_info or "AcquisitionDate" not in test_info:
        return False

    #NOTE(review): the date is converted to str but the patient id is used as
    #stored - confirm PatientID is a string in the same form as the dict keys
    patient_id = patient_info["PatientID"]
    test_date = str(test_info["AcquisitionDate"])

    if patient_id not in ajmaline_id_date:
        return False
    return bool(ajmaline_id_date[patient_id] == test_date)

In [None]:
def get_lead_data(data):
    """Decode the eight recorded leads of the rhythm ECG in a record.

    Args:
        data: Parsed ECG record. ``data["RestingECG"]["Waveform"]`` holds the
            waveform entries, each with a "WaveformType" and a "LeadData"
            sequence whose items carry "LeadOffsetFirstSample" and
            base64-encoded "WaveFormData".

    Returns:
        Tuple of eight numpy arrays taken from LeadData positions 0-7, which
        the caller unpacks as leads I, II, V1, V2, V3, V4, V5, V6.

    Raises:
        KeyError: if the record contains no waveform of type "Rhythm".
    """
    n_leads = 8

    #get waveform info
    waveform = pd.DataFrame(data["RestingECG"]["Waveform"])

    #use rhythm ECG (not median ECG)
    rhythm_lead_data = waveform.loc[waveform["WaveformType"] == "Rhythm", "LeadData"]
    if rhythm_lead_data.empty:
        raise KeyError("no waveform with WaveformType 'Rhythm' in record")

    #pick the rhythm entry by position: after filtering, its index label
    #depends on where the entry sat in the Waveform list
    lead_entries = rhythm_lead_data.iloc[0]

    #find ECG data per lead and add offset
    decoded_leads = []
    for position in range(n_leads):
        entry = lead_entries[position]
        #NOTE(review): the offset is subtracted from every sample value -
        #confirm LeadOffsetFirstSample is an amplitude offset and not a
        #sample-index offset
        lead_offset = float(entry["LeadOffsetFirstSample"])
        #samples are stored as base64-encoded 16-bit integers
        samples = np.frombuffer(b64decode(entry["WaveFormData"]), dtype=np.int16)
        decoded_leads.append(samples - lead_offset)

    return tuple(decoded_leads)

In [None]:
#halve leads that hold 5000 samples
#keeps values at every other index
def downsample(lead):
    """Return every second sample of a 5000-sample lead.

    Leads of any other length are returned unchanged (the same object).
    """
    if len(lead) != 5000:
        return lead
    #even positions 0, 2, ..., 4998
    kept_positions = np.arange(0, 5000, 2)
    return lead[kept_positions]

In [None]:
#calculate leads III, aVL, aVR, aVF
def calculate_missing_leads(lead_I, lead_II):
    """Derive leads III, aVL, aVR and aVF from leads I and II.

    Returns the tuple (lead_III, lead_aVL, lead_aVR, lead_aVF).
    """
    #lead III is the difference of the two measured leads
    third = lead_II - lead_I
    #the augmented leads are halved sums/differences of the leads above
    augmented_left = (lead_I - third)/2.0
    augmented_foot = (lead_II + third)/2.0
    augmented_right = (lead_I + lead_II)/(-2.0)
    return third, augmented_left, augmented_right, augmented_foot

In [None]:
#normalise each row by its maximum value
def normalize(lead):
    """Scale a lead by its maximum sample value.

    Args:
        lead: Non-empty numpy array of samples.

    Returns:
        ``lead`` divided by its maximum. When the maximum is exactly 0,
        1e-10 is used as divisor instead so the division stays defined.
    """
    #np.max reduces the array in native code, whereas the builtin max()
    #walks it element by element in Python
    max_value = np.max(lead)
    if max_value == 0:
        #stand-in divisor that avoids a division by zero
        max_value = 0.0000000001
    #NOTE(review): the divisor is the signed maximum, not the largest absolute
    #value, so a lead whose maximum is negative or close to zero is flipped or
    #strongly amplified - confirm this is intended
    return lead/max_value

In [None]:
#fill csv file at specified directory
#action: "w" to write csv file, "a" to append new rows
#NOTE(review): this repeats the write_to_csv definition given earlier in the
#file - consider keeping a single copy
def write_to_csv(data, file_path, action):
    """Write rows to a comma-separated file.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        file_path: Destination path, handed to ``open``.
        action: File mode for ``open``: "w" to (over)write, "a" to append.
    """
    #newline="" leaves line-ending handling to the csv module;
    #the with-statement closes the file, so no explicit close() is needed
    with open(file_path, action, newline="") as csv_file:
        csv.writer(csv_file, delimiter=",").writerows(data)

#specify directory to save csv file of data after processing
dir_path = Path("Output")
file_name = "output.csv"
file_path = dir_path.joinpath(file_name)

#make sure the output directory exists; opening the csv file for writing
#does not create missing directories
dir_path.mkdir(parents=True, exist_ok=True)

#make csv file and fill with column names
#(opened with "w", so an existing output file is replaced)
header_row = ["file_name", "outcome", "time_stamp", "lead_I", "lead_II", "lead_III", "lead_aVL", "lead_aVR",
              "lead_aVF", "lead_V1", "lead_V2", "lead_V3", "lead_V4", "lead_V5", "lead_V6"]
data = [header_row]
write_to_csv(data, file_path, "w")

In [None]:
def process_data(directory, clinical, pos_or_neg):
    """Convert every ECG json file in a directory into rows of the output csv.

    For each file that is not matched as an ajmaline test record, the eight
    recorded leads are decoded, 5000-sample leads are downsampled, the four
    missing leads are calculated, all twelve leads are normalised and one csv
    row per sample is appended to the module-level ``file_path``.

    Args:
        directory: Directory containing the json ECG files.
        clinical: Clinical DataFrame, passed to ``id_w_date``.
        pos_or_neg: Outcome label stored with every row
            (0: negative test, 1: positive test).
    """
    ajmaline_id_date = id_w_date(clinical)

    #iterate through all file names in the directory
    #(sorted, because glob returns them in arbitrary order)
    for json_path in sorted(glob.glob(os.path.join(directory, "*"))):
        #load file as json; the with-statement closes the handle again
        with open(json_path) as json_file:
            data = json.load(json_file)

        #check if patient id is in list of patients that were tested with ajmaline
        #only keep file if not tested with ajmaline
        if is_ajmaline_tested(data, ajmaline_id_date):
            continue

        #get file name without directory
        base_name = os.path.basename(json_path)

        #extract and decode ecg lead data, then
        #downsample all leads with 5000 measurements to 2500 measurements
        measured_leads = [downsample(lead) for lead in get_lead_data(data)]
        lead_I, lead_II, lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6 = measured_leads

        #calculate missing leads
        lead_III, lead_aVL, lead_aVR, lead_aVF = calculate_missing_leads(lead_I, lead_II)

        #normalise, in the column order of the csv header
        ordered_leads = [lead_I, lead_II, lead_III, lead_aVL, lead_aVR, lead_aVF,
                         lead_V1, lead_V2, lead_V3, lead_V4, lead_V5, lead_V6]
        normalised_leads = [normalize(lead) for lead in ordered_leads]

        #make columns of same length as leads for file name, outcome, and time
        n_samples = len(normalised_leads[0])
        names = [base_name] * n_samples
        outcomes = [pos_or_neg] * n_samples
        time_stamps = ["t" + str(sample) for sample in range(n_samples)]

        #transpose so that every time stamp becomes its own row
        rows = np.array([names, outcomes, time_stamps, *normalised_leads]).T

        #write to csv
        write_to_csv(rows, file_path, "a")

# Read and Process Data

In [None]:
#Read clinical data
clinical = pd.read_spss(path_clinical_data)

#Read ECG data stored as json files
#third argument is the outcome stored with each row: 0: negative test, 1: positive test
process_data(path_negative, clinical, 0)
process_data(path_positive, clinical, 1)