In [1]:
import numpy as np
import pandas as pd
import wfdb
from tqdm.auto import tqdm
import os
import msgpack
import scipy.stats as stats
import datetime

In [2]:
rlist = []  # record names listed in the RECORDS index file
records = os.path.normpath('mit-bih-raw/RECORDS')
with open(records) as rfile:
    for record in rfile:
        # strip() removes the trailing newline without eating a real character
        # when the last line of the file has no newline at all.
        record = record.strip()
        if record:  # ignore blank lines instead of recording an empty name
            rlist.append(record)

In [3]:
# The six lists below are parallel: position i in each one refers to good_list[i].
samples = []    # value returned by wfdb.rdsamp for each subject: [0] is the signal data, [1] the header
good_list = []  # names of the subjects we successfully extracted
bad_list = []   # names of the subjects we failed to extract
qrs = []        # R-peak sample indices for each subject
atr_label = []  # rhythm annotation labels for each subject
atr_locs = []   # sample locations matching those labels, with the signal length appended

for x in tqdm(rlist):  # iterate through the records that we found above
    record_path = os.path.normpath('mit-bih-raw/'+x)
    try:
        # Read everything into locals first. Nothing is appended until every read has
        # succeeded, so a failure part-way through can never leave the parallel lists
        # with different lengths.
        samp = wfdb.rdsamp(record_path)  # signal data + header data

        qrs_tmp = wfdb.rdann(record_path, extension="qrs")  # QRS annotations
        qrs_locs = np.array(qrs_tmp.sample, dtype='int')    # just the locations of the R-peaks

        atr = wfdb.rdann(record_path, extension="atr")  # rhythm annotations over the whole signal
        rhythm_labels = atr.aux_note  # rhythm type - main two are '(N' for normal and '(AFIB' for AFIB
        # The signal length is appended so the last rhythm has a closing boundary.
        rhythm_locs = np.append(atr.sample, len(samp[0]))
    except Exception as exep:
        tqdm.write(str(exep))  # alert the user of the exception
        bad_list.append(x)
        continue

    samples.append(samp)
    qrs.append(qrs_locs)
    atr_label.append(rhythm_labels)
    atr_locs.append(rhythm_locs)
    good_list.append(x)

  0%|          | 0/25 [00:00<?, ?it/s]

sampto must be greater than sampfrom
sampto must be greater than sampfrom


In [4]:
atr_dics = []  # one {rhythm label: [[start, end], ...]} dictionary per subject

for subject_pos, labels in enumerate(atr_label):
    boundaries = atr_locs[subject_pos]
    ranges_by_label = {}
    for pos, label in enumerate(labels):
        # Each rhythm runs from its own annotation to the next boundary.
        span = [boundaries[pos], boundaries[pos+1]]
        ranges_by_label.setdefault(label, []).append(span)
    atr_dics.append(ranges_by_label)

In [5]:
full_dfs = {}  # subject name -> per-sample DataFrame


def _mask_from_ranges(ranges, length):
    """Return a boolean array of `length` that is True inside every [start, end) range."""
    mask = np.zeros(length, dtype=bool)
    for start, end in ranges:
        # Clamp at 0 so a negative bound cannot be read as "count from the end".
        mask[max(start, 0):max(end, 0)] = True
    return mask


for subject_pos, subject in enumerate(tqdm(good_list)):  # every subject we have complete data for
    signal = np.asarray(samples[subject_pos][0])
    n_samples = len(signal)
    rhythm_ranges = atr_dics[subject_pos]

    # Mark the samples covered by each rhythm directly in a boolean mask instead of
    # materialising every sample index in a Python list.
    normal = _mask_from_ranges(rhythm_ranges.get('(N', []), n_samples)
    afib = _mask_from_ranges(rhythm_ranges.get('(AFIB', []), n_samples)

    # The first two columns are the given signals.
    subj = pd.DataFrame({
        'Signal 1': signal[:, 0],
        'Signal 2': signal[:, 1],
    })
    # isin() is True for every row whose index appears in the R-peak list
    subj['R-Peak'] = subj.index.isin(qrs[subject_pos])
    subj['Normal'] = normal
    subj['AFIB'] = afib
    # We are classifying AFIB specifically, so "Other" is any sample in neither rhythm.
    subj['Other'] = ~(normal | afib)

    full_dfs[subject] = subj

  0%|          | 0/23 [00:00<?, ?it/s]

In [6]:
# When True, the save cells below rewrite their output files even if they already exist on disk.
reload_flag = True

In [7]:
# Create the output folder if needed (no error when it already exists).
os.makedirs('mit-bih-dataframes/', exist_ok=True)

for x in tqdm(good_list):
    out_path = os.path.normpath('mit-bih-dataframes/'+x+'.parquet')
    # Check and write the same path; skip files that already exist unless a reload is forced.
    if reload_flag or not os.path.exists(out_path):
        full_dfs[x].to_parquet(out_path)

# We save the complete list of subjects as well so that we can easily recreate the file names
np.savetxt(os.path.normpath("mit-bih-dataframes/subject_list.csv"), good_list, delimiter=",",  fmt='%s')

  0%|          | 0/23 [00:00<?, ?it/s]

In [8]:
# Create the output folder if needed (no error when it already exists).
os.makedirs('mit-bih-extracted/', exist_ok=True)

def encoder(obj):
    """`default` hook for msgpack.packb: convert values msgpack cannot pack natively.

    - datetime.time  -> {'__datetime__': True, 'as_str': 'HH:MM:SS.ffffff'}
    - NumPy integers -> {'__npint64__': True, 'as_int': <plain int>}

    Any other object is returned unchanged.
    """
    if isinstance(obj, datetime.time):
        return {'__datetime__': True, 'as_str': obj.strftime("%H:%M:%S.%f")}
    # Accept every NumPy integer width (int32, int64, ...), not just int64; the tag is
    # kept as '__npint64__' so the encoded layout is unchanged.
    if isinstance(obj, np.integer):
        return {'__npint64__': True, 'as_int': int(obj)}
    return obj

# Save the subject list, then each subject's signals, R-peaks, header and rhythm ranges.
np.savetxt("mit-bih-extracted/subject_list.csv", good_list, delimiter=",",  fmt='%s')

for idx, x in enumerate(tqdm(good_list)):
    stem = "mit-bih-extracted/" + x
    signals_file = stem + "_signals.parquet"
    rpeaks_file = stem + "_rpeaks.parquet"
    headers_file = stem + "_headers.msgpack"
    rhythms_file = stem + "_rhythms.msgpack"

    # Each piece is written when its file is missing or a reload is forced.
    if reload_flag or not os.path.exists(signals_file):
        signaldf = pd.DataFrame(np.array(samples[idx][0]), columns=["signal1", "signal2"])
        signaldf.to_parquet(os.path.normpath(signals_file))

    if reload_flag or not os.path.exists(rpeaks_file):
        rpeaksdf = pd.DataFrame(np.array(qrs[idx]), columns=["rpeaks"])
        rpeaksdf.to_parquet(os.path.normpath(rpeaks_file))

    if reload_flag or not os.path.exists(headers_file):
        with open(os.path.normpath(headers_file), 'wb') as outfile:
            packed_header = msgpack.packb(samples[idx][1], default=encoder)
            outfile.write(packed_header)

    if reload_flag or not os.path.exists(rhythms_file):
        with open(os.path.normpath(rhythms_file), 'wb') as outfile:
            packed_rhythms = msgpack.packb(atr_dics[idx], default=encoder)
            outfile.write(packed_rhythms)

  0%|          | 0/23 [00:00<?, ?it/s]

In [9]:
rlist = good_list  # from here on, only the successfully extracted subjects are used

rpeak_dfs = {}  # subject name -> R-peak table with the rhythm flags attached
rhythm_columns = ['Normal', 'AFIB', 'Other']
for record in tqdm(rlist):
    rpeak_path = os.path.normpath('mit-bih-extracted/'+record+'_rpeaks.parquet')
    df = pd.read_parquet(rpeak_path)

    # Take the per-sample rows whose index is an R-peak location, renumber them
    # 0..n-1 and keep only the rhythm flag columns.
    rows_at_peaks = full_dfs[record].filter(items = df['rpeaks'], axis=0)
    rhythms = rows_at_peaks.reset_index(drop=True)[rhythm_columns]

    # Put the flags side by side with the R-peak locations.
    df = pd.concat([df, rhythms], axis=1)
    rpeak_dfs[record] = df

  0%|          | 0/23 [00:00<?, ?it/s]

In [10]:
def extract_rmean(rrInts):
    """Return the running mean of a Series of RR intervals as a list.

    The first entry is the first interval itself; every later entry is
    0.75 * (previous running mean) + 0.25 * (current interval).
    An empty input gives an empty list.
    """
    rr_arr = rrInts.values
    smoothed = list(rr_arr[:1])  # seed with the first interval, if there is one
    for current in rr_arr[1:]:
        smoothed.append(0.75*smoothed[-1] + 0.25*current)
    return smoothed

In [11]:
def extract_diff(rrInts):
    """Return the absolute change between consecutive RR intervals as a list.

    The first entry is 0 (there is no previous interval); entry i is
    |rrInts[i] - rrInts[i-1]| by position. The result always has the same
    length as the input, so an empty input gives an empty list.
    """
    # Work on the underlying array so the lookup is positional and does not
    # depend on the Series having a default 0..n-1 index.
    rr_arr = rrInts.values
    if len(rr_arr) == 0:
        return []
    return [0] + [np.abs(current - previous)
                  for previous, current in zip(rr_arr[:-1], rr_arr[1:])]

In [12]:
def rr_int_df(rpeaks, full_df, max_rr_int=500):
    """Build the RR-interval feature table for one subject.

    Parameters
    ----------
    rpeaks : pandas.DataFrame
        One row per R-peak with the columns 'rpeaks' (location of the peak),
        'Normal' and 'AFIB' (truthy flags for the rhythm at that peak).
    full_df : pandas.DataFrame
        Not used; kept so existing callers keep working.
    max_rr_int : number, optional
        Intervals larger than this are discarded (default 500, the value
        that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns 'rr_int', 'rmean', 'diff', 'sqr_diff' and 'rhythm', with a
        fresh 0..n-1 index. 'rhythm' is 'N', 'A' or 'O' and is taken from the
        R-peak that opens each interval.
    """
    # Rhythm code for every R-peak: Normal wins over AFIB, anything else is 'O'.
    is_normal = rpeaks['Normal'].astype(bool).to_numpy()
    is_afib = rpeaks['AFIB'].astype(bool).to_numpy()
    peak_rhythm = pd.Series(
        np.where(is_normal, 'N', np.where(is_afib, 'A', 'O')),
        index=rpeaks.index,
        dtype=object,
    )

    # Pair every interval with its rhythm BEFORE filtering and renumbering. Assigning
    # the rhythm afterwards would align it on the new 0..n-1 index and give every row
    # after a dropped interval the rhythm of the wrong R-peak.
    raw = pd.DataFrame({
        'rr_int': rpeaks['rpeaks'].diff(),
        'rhythm': peak_rhythm.shift(1),  # rhythm of the peak that opens the interval
    }).dropna(subset=['rr_int'])
    filtered = raw[raw['rr_int'] <= max_rr_int].reset_index(drop=True)
    rhythm = filtered.pop('rhythm')  # re-added last to keep the original column order

    filtered['rmean'] = extract_rmean(filtered['rr_int'])
    filtered['diff'] = extract_diff(filtered['rr_int'])
    filtered['sqr_diff'] = filtered['diff']**2
    filtered['rhythm'] = rhythm

    return filtered

In [13]:
# Build the RR-interval table of every subject, keyed by subject name.
rr_int_dfs = {
    record: rr_int_df(rpeak_dfs[record], full_dfs[record])
    for record in tqdm(rlist)
}

  0%|          | 0/23 [00:00<?, ?it/s]

In [14]:
# Create the output folder if needed (no error when it already exists).
os.makedirs('mit-bih-rr-intervals/', exist_ok=True)

for record in tqdm(rlist):
    out_path = os.path.normpath('mit-bih-rr-intervals/'+record+'.parquet')
    # Check the file we are about to write (not the one in mit-bih-dataframes/);
    # skip it when it already exists unless a reload is forced.
    if reload_flag or not os.path.exists(out_path):
        rr_int_dfs[record].to_parquet(out_path)

  0%|          | 0/23 [00:00<?, ?it/s]