In [None]:
import copy as cp
import csv
import os
import pickle
import re
import statistics as stats

import numpy as np
import pandas as pd
import scipy.signal as signal
import wfdb
from matplotlib import pyplot as plt
from scipy import stats as scipy_stats
from sklearn import preprocessing
from tqdm import tqdm

In [None]:
# Read the record names, one per line, from the RECORDS index file.
records = os.path.normpath('mit-bih-raw/RECORDS')
with open(records) as rfile:  # 'with' closes the file automatically once the block is left
    # strip() removes the trailing newline without chopping the last character of a
    # final line that has no newline; blank lines are skipped so that they do not
    # turn into empty record names.
    rlist = [line.strip() for line in rfile if line.strip()]

In [None]:

###### Step 1: Initialize all arrays
# All of these lists are parallel: position i in each of them belongs to good_list[i].
samples = []    # (signal data, header) pair returned by wfdb.rdsamp for each subject
good_list = []  # names of the subjects we successfully extracted
bad_list = []   # names of the subjects we failed to extract
qrs = []        # indices of the R-peaks for each subject
atr_label = []  # labels of each rhythm annotation for each subject
atr_locs = []   # locations corresponding to the rhythm annotation labels


###### Step 2: Extract information
for x in tqdm(rlist):  # iterate through the records that we found above
    record_path = os.path.normpath('mit-bih-raw/'+x)
    try:
        # Read everything into local variables first and only append to the shared
        # lists once every read has succeeded. Appending as we go would leave
        # `samples`/`qrs` one entry longer than `good_list` whenever a later read
        # fails, silently shifting every following subject by one position.

        # samp[0] - the signal data: the raw readings from the ECG, one row per sample
        # samp[1] - the header data, e.g. samples per second ('fs') and number of signals ('n_sig')
        samp = wfdb.rdsamp(record_path)

        qrs_tmp = wfdb.rdann(record_path, extension="qrs")  # the QRS annotations
        qrs_locs = np.array(qrs_tmp.sample, dtype='int')    # keep just the locations of the R-peaks

        # The atr annotations store the rhythm type(s) over the whole signal.
        # aux_note holds the rhythm label - the main two are '(N' for normal and '(AFIB' for AFIB.
        atr = wfdb.rdann(record_path, extension="atr")
        # Append the length of the whole signal so that every annotation has an end boundary.
        rhythm_locs = np.append(atr.sample, len(samp[0]))
    except Exception as exep:
        # Some subjects cannot be read (one has no signal data); report and keep going.
        print(exep)
        bad_list.append(x)
        continue

    samples.append(samp)
    qrs.append(qrs_locs)
    atr_label.append(atr.aux_note)
    atr_locs.append(rhythm_locs)
    good_list.append(x)

In [None]:
atr_dics = []  # one dictionary per subject: rhythm label -> list of [start, end] ranges

for subject_pos, labels in enumerate(atr_label):
    ranges_by_label = {}  # dictionary for the current subject
    for label_pos, label in enumerate(labels):
        # setdefault creates the list the first time a label is seen; the range runs
        # from this annotation's location up to the next location in atr_locs
        ranges_by_label.setdefault(label, []).append(
            [atr_locs[subject_pos][label_pos], atr_locs[subject_pos][label_pos+1]]
        )
    atr_dics.append(ranges_by_label)

In [None]:
full_dfs = {}  # subject name -> per-sample DataFrame

for s, name in enumerate(tqdm(good_list)):  # iterate through all subjects we have complete data for
    signal_data = np.asarray(samples[s][0])
    n_samples = len(signal_data)

    # Mark the rhythm ranges directly on boolean masks. This replaces building a
    # Python list with one entry per sample (repeated list concatenation) followed
    # by an isin() lookup, and yields the same True/False columns.
    normal_mask = np.zeros(n_samples, dtype=bool)
    for start, end in atr_dics[s].get('(N', []):
        normal_mask[start:end] = True
    afib_mask = np.zeros(n_samples, dtype=bool)
    for start, end in atr_dics[s].get('(AFIB', []):
        afib_mask[start:end] = True

    # The first two columns are the two recorded signals.
    subj = pd.DataFrame({'Signal 1': signal_data[:, 0], 'Signal 2': signal_data[:, 1]})
    # isin() on the index is True for every row whose index is one of the R-peak locations.
    subj['R-Peak'] = subj.index.isin(qrs[s])
    subj['Normal'] = normal_mask
    subj['AFIB'] = afib_mask
    # We classify AFIB specifically, so 'Other' is any sample that is neither normal nor AFIB.
    subj['Other'] = ~(normal_mask | afib_mask)

    full_dfs[name] = subj

In [None]:
# Save every subject DataFrame as CSV. Existing files are kept unless reload_flag is truthy.
# No cell visible in this notebook defines reload_flag, so default it to False when it is
# missing; a value set earlier (e.g. in a configuration cell) is respected.
reload_flag = globals().get('reload_flag', False)
os.makedirs(os.path.normpath('mit-bih-dataframes'), exist_ok=True)  # to_csv does not create folders

for x in tqdm(good_list):
    csv_path = os.path.normpath('mit-bih-dataframes/'+x+'.csv')
    if reload_flag or not os.path.exists(csv_path):
        full_dfs[x].to_csv(csv_path)  # pandas DataFrames have a built-in to_csv() that saves at the given path

# Also save the complete list of subjects so that the file names can easily be recreated.
np.savetxt(os.path.normpath("mit-bih-dataframes/subject_list.csv"), good_list, delimiter=",",  fmt='%s')

In [None]:
# Save the raw extraction results per subject. Existing files are kept unless reload_flag is truthy.
# No cell visible in this notebook defines reload_flag, so default it to False when it is
# missing; a value set earlier (e.g. in a configuration cell) is respected.
reload_flag = globals().get('reload_flag', False)
os.makedirs(os.path.normpath("mit-bih-extracted"), exist_ok=True)  # savetxt/open do not create folders

np.savetxt(os.path.normpath("mit-bih-extracted/subject_list.csv"), good_list, delimiter=",",  fmt='%s')  # save the names in the folder
for idx, x in enumerate(tqdm(good_list)):  # iterate through our subjects
    signals_path = os.path.normpath("mit-bih-extracted/"+x+"_signals.csv")
    rpeaks_path = os.path.normpath("mit-bih-extracted/"+x+"_rpeaks.csv")
    headers_path = os.path.normpath("mit-bih-extracted/"+x+"_headers.pkl")
    rhythms_path = os.path.normpath("mit-bih-extracted/"+x+"_rhythms.pkl")

    # numpy's savetxt() with ',' as the delimiter acts like a to_csv() function
    if reload_flag or not os.path.exists(signals_path):
        np.savetxt(signals_path, np.array(samples[idx][0]), delimiter=",")
    if reload_flag or not os.path.exists(rpeaks_path):
        np.savetxt(rpeaks_path, np.array(qrs[idx]), delimiter=",")
    # numpy has no way to save a dictionary as CSV, so the header and rhythm
    # dictionaries are written with the pickle package instead
    if reload_flag or not os.path.exists(headers_path):
        with open(headers_path, 'wb') as picklefile:
            pickle.dump(samples[idx][1], picklefile)
    if reload_flag or not os.path.exists(rhythms_path):
        with open(rhythms_path, 'wb') as picklefile:
            pickle.dump(atr_dics[idx], picklefile)

In [None]:
def classify_rr_ints(df):
    """Label every row of `df` as 'short', 'long' or 'regular'.

    A row is 'short' when its `rrInt` is below 85% of its `rmean`, 'long' when it
    is above 115% of its `rmean`, and 'regular' otherwise.

    Parameters:
        df: DataFrame with the columns `rrInt` and `rmean`.

    Returns:
        A list with one label string per row, in row order.
    """
    def label_for(rr_int, running_mean):
        if rr_int < 0.85*running_mean:
            return 'short'
        if rr_int > 1.15*running_mean:
            return 'long'
        return 'regular'

    return [label_for(row.rrInt, row.rmean) for row in df.itertuples()]

In [None]:
def find_proportions(int_types):
    """Return the proportion of each transition between consecutive interval types.

    Parameters:
        int_types: sequence of the labels 'short', 'regular' and 'long'.

    Returns:
        A list of nine proportions in the order
        [StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL], each being the number
        of such consecutive pairs divided by the total number of consecutive pairs
        (len(int_types) - 1). Pairs containing any other label are counted in the
        denominator only. With fewer than two labels there are no transitions, so
        all proportions are 0.0 (instead of dividing by zero).
    """
    labels = list(int_types)
    count = len(labels) - 1
    if count <= 0:
        return [0.0] * 9

    # Tally every consecutive (current, next) pair in one pass.
    tally = {}
    for pair in zip(labels, labels[1:]):
        tally[pair] = tally.get(pair, 0) + 1

    order = ('short', 'regular', 'long')
    return [tally.get((current, following), 0) / count for current in order for following in order]

In [None]:
rpeak_dfs = {}  # subject name -> DataFrame with a single 'rpeak' column
# Iterate good_list rather than rlist: records whose extraction failed (bad_list)
# never had an _rpeaks.csv written, so reading them would raise FileNotFoundError.
for record in tqdm(good_list):
    rpeak_dfs[record] = pd.read_csv(os.path.normpath('mit-bih-extracted/'+record+'_rpeaks.csv'), names=['rpeak'])

In [None]:
def extract_rmean(rrInts):
    """Return the running mean of `rrInts` as a list of the same length.

    The first entry is the first interval itself; every later entry is
    0.75 * (previous running mean) + 0.25 * (current interval).
    """
    running = []
    for value in rrInts:
        if running:
            # blend the current interval into the previous running mean
            value = 0.75*running[-1] + 0.25*value
        running.append(value)

    return running

In [None]:
def extract_rmssd(subset):
    """Return the root mean square of successive differences of `subset['rrInt']`.

    Each interval is compared with the one that follows it, giving
    len(rrInt) - 1 differences. (Comparing with index idx-1 would wrap around to
    the last element at idx 0 and leave out the final successive difference.)

    Parameters:
        subset: DataFrame (or mapping) with an `rrInt` column.

    Returns:
        The RMSSD as a float; 0.0 when there are fewer than two intervals, because
        there is then no successive difference to measure.
    """
    rr_ints = np.asarray(subset['rrInt'], dtype=float)
    if rr_ints.size < 2:
        return 0.0
    successive_diffs = np.diff(rr_ints)
    return np.sqrt(np.mean(successive_diffs**2))

In [None]:
def _build_subset_frame(full_df, peaks):
    """Build one subset DataFrame (rhythmLabel, rrInt, rmean) from a list of peak locations."""
    # One RR interval per pair of consecutive collected peaks.
    rr_int_column = [peaks[x]-peaks[x-1] for x in range(1, len(peaks))]
    # Each interval takes the rhythm label found at the peak that ends it.
    rhythm_column = []
    for peak in peaks[1:]:
        if full_df['Normal'][peak]:
            rhythm_column.append('N')
        elif full_df['AFIB'][peak]:
            rhythm_column.append('A')
        elif full_df['Other'][peak]:
            rhythm_column.append('O')

    rmean_column = extract_rmean(rr_int_column)

    return pd.DataFrame({'rhythmLabel': rhythm_column, 'rrInt': rr_int_column, 'rmean': rmean_column}, columns=['rhythmLabel', 'rrInt', 'rmean'])


def subset_subject(full_df, rpeak_df, interval_length = 4, calib_length = 100, max_gap = 500):
    """Split one subject's R-peaks into a calibration subset followed by short subsets.

    Parameters:
        full_df: per-sample DataFrame with the boolean columns 'Normal', 'AFIB' and
            'Other', looked up by peak location.
        rpeak_df: DataFrame with an 'rpeak' column of peak locations.
        interval_length: a subset is closed once the number of collected peaks minus
            the number of outliers seen exceeds this value.
        calib_length: number of leading peaks that form the calibration window.
        max_gap: a peak whose distance to the previous peak is not below this value is
            treated as an outlier and not collected (same units as the 'rpeak'
            values - presumably sample indices; verify against the data).

    Returns:
        A list of DataFrames with the columns 'rhythmLabel', 'rrInt' and 'rmean'.
        The first one is the calibration subset. Peaks left over at the end that do
        not complete a subset are dropped, as is the very last peak.
    """
    rpeaks = rpeak_df['rpeak'].tolist()
    if not rpeaks:
        return []

    subsets = []

    # Calibration window: collect the non-outlier peaks of the first calib_length
    # peaks (the last peak of the window only closes it) and always emit the subset.
    # Tying the emission to "last peak of the whole record" meant that records
    # longer than calib_length never produced a calibration subset at all.
    calib_peaks = rpeaks[:calib_length]
    current_subset = []
    prev_peak = rpeaks[0]
    for peak in calib_peaks[:-1]:
        if peak-prev_peak<max_gap:
            current_subset.append(peak)
        prev_peak = peak
    if calib_peaks:
        subsets.append(_build_subset_frame(full_df, current_subset))

    # Nothing is left after the calibration window (this also avoids indexing
    # rpeaks[calib_length-1] on a record shorter than the window).
    if len(rpeaks) <= calib_length:
        return subsets

    current_subset = []
    outlier_comp = 0  # outliers seen while filling the current subset
    counter = 0       # peaks collected into the current subset
    prev_peak = rpeaks[calib_length-1]

    # The last peak of the record is never processed.
    for peak in rpeaks[calib_length:-1]:
        if counter-outlier_comp<=interval_length:
            # Still filling the current subset.
            if peak-prev_peak<max_gap:
                current_subset.append(peak)
                counter+=1
            else:
                outlier_comp+=1
            prev_peak = peak
        elif peak-prev_peak>max_gap:
            # Subset is full but this peak is an outlier: count it and keep waiting
            # (prev_peak is intentionally left unchanged here).
            outlier_comp+=1
        else:
            # Subset is full: emit it and start over. The current peak only becomes
            # the reference for the next gap; it is not part of any subset.
            subsets.append(_build_subset_frame(full_df, current_subset))
            current_subset = []
            outlier_comp = 0
            counter = 0
            prev_peak = peak

    return subsets

In [None]:
subset_dfs = {}  # subject name -> list of subset DataFrames
# Iterate good_list rather than rlist: full_dfs only holds the subjects that were
# extracted successfully, so a record from bad_list would raise a KeyError.
for record in tqdm(good_list):
    subset_dfs[record] = subset_subject(full_dfs[record], rpeak_dfs[record])

In [None]:
# Write, per subject, a subset index CSV plus one CSV per subset.
# good_list is used because only successfully extracted subjects have subsets.
for record in tqdm(good_list):
    subsets = subset_dfs[record]

    # Create the subject folder first; this also creates 'mit-bih-time-subsets',
    # which must exist before the subset list can be written into it.
    os.makedirs(os.path.normpath('mit-bih-time-subsets/'+str(record)), exist_ok=True)

    subset_list = pd.DataFrame({
        "subjectID": [record]*len(subsets),
        "subsetID": list(range(len(subsets))),
        # the most frequent rhythm label inside the subset labels the whole subset
        "rhythmLabel": [subset['rhythmLabel'].mode()[0] for subset in subsets]
    })
    subset_list['mappedLabel'] = subset_list['rhythmLabel'].map({'N': 'Non-Afib', 'A': 'Afib', 'O': 'Non-Afib'})
    subset_list.to_csv(os.path.normpath('mit-bih-time-subsets/'+record+"_subset_list.csv"))

    for x, subset in enumerate(subsets):
        subset.to_csv(os.path.normpath('mit-bih-time-subsets/'+str(record)+'/'+str(record)+"-"+str(x)+".csv"))

In [None]:
_TRANSITION_FEATURES = ('StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL')


def _raw_subset_features(subset_df):
    """Compute the un-smoothed feature values of one subset, keyed by feature name."""
    rr_ints = subset_df['rrInt']
    raw = dict(zip(_TRANSITION_FEATURES, find_proportions(classify_rr_ints(subset_df))))
    raw['std'] = np.std(rr_ints)
    raw['cov'] = raw['std']/np.mean(rr_ints)
    raw['range'] = np.max(rr_ints)-np.min(rr_ints)
    raw['rrInt_var'] = rr_ints.var()
    raw['rmean_var'] = subset_df['rmean'].var()
    raw['rmssd'] = extract_rmssd(subset_df)
    # median_abs_deviation and iqr live in scipy.stats; the stdlib `statistics`
    # module (imported as `stats` in this notebook) provides neither.
    raw['mad'] = scipy_stats.median_abs_deviation(rr_ints)
    raw['iqr'] = scipy_stats.iqr(rr_ints)
    return raw


def subset_features(subset_list, current_weight = 0.25, prev_weight = 0.75):
    """Compute exponentially smoothed RR-interval features for every subset of a subject.

    The subset CSVs are read from
    'mit-bih-time-subsets/<subjectID>/<subjectID>-<position>.csv', where <position>
    is the row position in `subset_list`.

    Parameters:
        subset_list: DataFrame with a `subjectID` column and one row per subset.
        current_weight: weight of the current subset's raw feature value.
        prev_weight: weight of the previous row's (already smoothed) feature value.

    Returns:
        `subset_list` with one added column per feature. Row 0 (the calibration
        subset) holds the raw values; every later row holds
        raw * current_weight + previous_row * prev_weight.
    """
    frames = [
        pd.read_csv(os.path.normpath('mit-bih-time-subsets/'+str(subset.subjectID)+'/'+str(subset.subjectID)+"-"+str(x)+".csv"), index_col=0)
        for x, subset in enumerate(subset_list.itertuples())
    ]

    feature_dict = {}
    for key, frame in enumerate(frames):
        raw = _raw_subset_features(frame)
        for name, value in raw.items():
            if key == 0:
                feature_dict[name] = [value]
            else:
                # Every feature is smoothed the same way. In particular the whole
                # (max - min) range is weighted, and 'cov' is built from this
                # subset's own std rather than from the already smoothed 'std'.
                feature_dict[name].append(value*current_weight + feature_dict[name][key-1]*prev_weight)

    feature_df = pd.DataFrame(data=feature_dict)
    return pd.concat([subset_list, feature_df], axis=1)

In [None]:
features_df = {}  # subject name -> feature DataFrame
os.makedirs(os.path.normpath('mit-bih-time-features'), exist_ok=True)  # to_csv does not create folders
# good_list is used because only successfully extracted subjects have subset files.
for record in tqdm(good_list):
    # subjectID is read as a string so that leading zeros in the record names survive
    subset_list = pd.read_csv(os.path.normpath('mit-bih-time-subsets/'+record+'_subset_list.csv'), index_col=0, dtype={'subjectID': str})
    features = subset_features(subset_list)
    features_df[record] = features

    features.to_csv(os.path.normpath('mit-bih-time-features/'+record+".csv"))