In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
import scipy.stats as stats
from sklearn import preprocessing
from tqdm import tqdm
import os
import re
import pandas as pd
import pickle
import csv
import statistics

In [2]:
# Collect the record names listed in the subject list (first field of each row).
rlist = []
records = os.path.normpath('mit-bih-dataframes/subject_list.csv')
# newline='' leaves line-ending handling to the csv module, as its docs recommend.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        # A blank line parses to an empty row; skip it rather than fail on row[0].
        if row:
            rlist.append(row[0])

In [78]:
# Read each record's CSV from mit-bih-dataframes (first column as the index),
# keyed by record name.
full_dfs = {
    record: pd.read_csv(os.path.normpath(f'mit-bih-dataframes/{record}.csv'), index_col=[0])
    for record in tqdm(rlist)
}

100%|███████████████████████████████████████████| 23/23 [01:29<00:00,  3.89s/it]


In [5]:
def classify_rr_ints(df):
    """Label every row of ``df`` as 'short', 'long' or 'regular'.

    A row is 'short' when its ``rrInt`` is below 85% of its ``rmean``,
    'long' when it is above 115% of its ``rmean``, and 'regular' otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``rrInt`` and ``rmean`` columns.

    Returns
    -------
    list of str
        One label per row, in row order.
    """
    def label_for(rr_int, running_mean):
        # The 'short' test is evaluated first, exactly like an if/elif chain.
        if rr_int < 0.85*running_mean:
            return 'short'
        if rr_int > 1.15*running_mean:
            return 'long'
        return 'regular'

    return [label_for(row.rrInt, row.rmean) for row in df.itertuples()]

In [6]:
def find_proportions(int_types):
    """Return the share of each label-to-label transition in ``int_types``.

    Consecutive pairs of labels are counted and each count is divided by the
    total number of consecutive pairs (``len(int_types) - 1``). Pairs that
    contain a label other than 'short', 'regular' or 'long' are not counted
    but still contribute to the denominator.

    Parameters
    ----------
    int_types : list of str
        Interval labels in order.

    Returns
    -------
    list of float
        Nine proportions ordered StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS,
        LtoR, LtoL (S = short, R = regular, L = long). All zeros when there
        are fewer than two labels, since no transition exists.
    """
    labels = ('short', 'regular', 'long')
    # Nested iteration over `labels` yields the nine pairs in output order.
    ordered_pairs = [(first, second) for first in labels for second in labels]
    counts = {pair: 0 for pair in ordered_pairs}

    for pair in zip(int_types, int_types[1:]):
        if pair in counts:
            counts[pair] += 1

    count = len(int_types)-1
    if count < 1:
        # Fewer than two labels: avoid dividing by zero (or by -1).
        return [0.0]*len(ordered_pairs)

    return [counts[pair]/count for pair in ordered_pairs]

In [7]:
# Read each record's headerless *_rpeaks.csv into a single 'rpeak' column,
# keyed by record name.
rpeak_dfs = {
    record: pd.read_csv(os.path.normpath(f'mit-bih-extracted/{record}_rpeaks.csv'), names=['rpeak'])
    for record in tqdm(rlist)
}

100%|███████████████████████████████████████████| 23/23 [00:00<00:00, 76.32it/s]


In [8]:
def extract_rmean(rrInts):
    """Return the running mean of ``rrInts`` as a list of the same length.

    The first entry is the first value unchanged. Every later entry is
    ``0.75 * previous running mean + 0.25 * current value``.
    """
    rmeans = []
    for value in rrInts:
        if rmeans:
            # Blend the new value into the most recent running mean.
            value = 0.75*rmeans[-1] + 0.25*value
        rmeans.append(value)

    return rmeans

In [9]:
def extract_rmssd(subset):
    """Root mean square of successive differences of ``subset['rrInt']``.

    Each difference is taken between an interval and the one that follows it,
    so ``n`` intervals contribute ``n - 1`` differences. (The earlier
    implementation indexed ``idx-1``, which wrapped around to the last element
    for the first interval and omitted the final difference.)

    Parameters
    ----------
    subset : pandas.DataFrame
        Must provide an ``rrInt`` column.

    Returns
    -------
    float
        The RMSSD, or 0.0 when there are fewer than two intervals and no
        successive difference exists.
    """
    rrInts = np.asarray(subset['rrInt'].tolist(), dtype=float)
    if rrInts.size < 2:
        return 0.0
    successive_diffs = np.diff(rrInts)
    return np.sqrt(np.mean(successive_diffs**2))

In [187]:
def _build_subset_frame(full_df, peaks):
    """Build one subset DataFrame from a run of accepted peak positions.

    For every peak after the first, the frame holds the difference to the
    preceding accepted peak (``rrInt``), a rhythm label read from ``full_df``
    at that peak's value ('N' for ``Normal``, 'A' for ``AFIB``, 'O' for
    ``Other``, tested in that order) and the running mean returned by
    ``extract_rmean`` (``rmean``).
    """
    rr_int_column = [peaks[x]-peaks[x-1] for x in range(1, len(peaks))]
    rhythm_column = []
    for peak in peaks[1:]:
        # NOTE(review): if none of the three flags is truthy no label is added,
        # the columns end up with different lengths and the DataFrame
        # constructor raises - confirm the flags cover every row of full_df.
        if full_df['Normal'][peak]:
            rhythm_column.append('N')
        elif full_df['AFIB'][peak]:
            rhythm_column.append('A')
        elif full_df['Other'][peak]:
            rhythm_column.append('O')

    rmean_column = extract_rmean(rr_int_column)

    return pd.DataFrame({'rhythmLabel': rhythm_column, 'rrInt': rr_int_column, 'rmean': rmean_column}, columns=['rhythmLabel', 'rrInt', 'rmean'])


def subset_subject(full_df, rpeak_df, interval_length = 4, calib_length = 100, max_gap = 500):
    """Split one record's peaks into a calibration subset plus short subsets.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Provides the ``Normal``, ``AFIB`` and ``Other`` columns, looked up by
        peak value.
    rpeak_df : pandas.DataFrame
        Provides the ``rpeak`` column of peak positions (presumably sample
        indices - TODO confirm).
    interval_length : int
        Collection of a subset continues while (accepted peaks - rejected
        peaks) is at most this value.
    calib_length : int
        Number of leading peaks that form the calibration subset.
    max_gap : int
        A peak whose distance to the previous peak is not below this value is
        rejected as an outlier (same units as ``rpeak``).

    Returns
    -------
    list of pandas.DataFrame
        The calibration subset first (when ``calib_length`` selects at least
        one peak), followed by the later subsets in order. Each frame has the
        columns ``rhythmLabel``, ``rrInt`` and ``rmean``. An empty list is
        returned when there are no peaks.

    Notes
    -----
    The final peak of the recording is never processed, and peaks collected
    after the last completed subset are discarded.
    """
    rpeaks = rpeak_df['rpeak'].tolist()
    if not rpeaks:
        return []

    subsets = []
    last_idx = len(rpeaks)-1

    # --- Calibration subset -------------------------------------------------
    # The flush used to live in an `else` of `idx<len(rpeaks)-1` inside this
    # loop, which is unreachable for any recording longer than calib_length,
    # so the calibration peaks were collected and then thrown away. The flush
    # now happens once, after the loop.
    calib_peaks = rpeaks[:calib_length]
    current_subset = []
    prev_peak = rpeaks[0]
    for idx, peak in enumerate(calib_peaks):
        if idx<last_idx:
            if peak-prev_peak<max_gap:
                current_subset.append(peak)
            prev_peak = peak
    if calib_peaks:
        subsets.append(_build_subset_frame(full_df, current_subset))

    # --- Remaining subsets --------------------------------------------------
    current_subset = []
    outlier_comp = 0
    counter = 0
    # Start from the last calibration peak; the min() keeps the index valid
    # when the recording is shorter than calib_length.
    prev_peak = rpeaks[min(calib_length, len(rpeaks))-1]

    for idx, peak in enumerate(rpeaks[calib_length:], calib_length):
        if idx<last_idx:
            if counter-outlier_comp<=interval_length:
                # Still collecting: accept the peak or count it as an outlier.
                if peak-prev_peak<max_gap:
                    current_subset.append(peak)
                    counter+=1
                else:
                    outlier_comp+=1
                prev_peak = peak
            elif peak-prev_peak>max_gap:
                # Subset is full but this peak is an outlier: count it and keep
                # waiting (prev_peak is deliberately left unchanged here).
                outlier_comp+=1
            else:
                # Subset is full: emit it and restart. The peak that triggers
                # the flush is not added to any subset; it only becomes the
                # reference for the next gap check.
                subsets.append(_build_subset_frame(full_df, current_subset))
                current_subset = []
                outlier_comp = 0
                counter = 0
                prev_peak = rpeaks[idx]

    return subsets

In [188]:
# Split every record into its list of subset DataFrames, keyed by record name.
subset_dfs = {
    record: subset_subject(full_dfs[record], rpeak_dfs[record])
    for record in tqdm(rlist)
}

100%|███████████████████████████████████████████| 23/23 [02:40<00:00,  6.97s/it]


In [189]:
# For every record: write a summary CSV (one row per subset with its most
# frequent rhythm label) and one CSV per subset inside a per-record folder.
for record in tqdm(rlist):
    subsets = subset_dfs[record]
    idx_list = list(range(len(subsets)))

    # Create the per-record folder first. This also creates the shared
    # 'mit-bih-time-subsets' parent, which the summary CSV below is written
    # into; previously the summary was written before the folder existed.
    os.makedirs('mit-bih-time-subsets/'+str(record), exist_ok=True)

    data = {
        "subjectID": [record]*len(subsets),
        "subsetID": idx_list,
        # mode()[0] picks the most frequent label of the subset.
        "rhythmLabel": [subsets[x]['rhythmLabel'].mode()[0] for x in idx_list]
    }

    subset_list = pd.DataFrame(data)
    # 'N' and 'O' both collapse to 'Non-Afib'; only 'A' maps to 'Afib'.
    subset_list['mappedLabel'] = subset_list['rhythmLabel'].map({'N': 'Non-Afib', 'A': 'Afib', 'O': 'Non-Afib'})
    subset_list.to_csv(os.path.normpath('mit-bih-time-subsets/'+record+"_subset_list.csv"))

    for x, subset in enumerate(subsets):
        subset.to_csv(os.path.normpath('mit-bih-time-subsets/'+str(record)+'/'+str(record)+"-"+str(idx_list[x])+".csv"))

100%|███████████████████████████████████████████| 23/23 [02:47<00:00,  7.29s/it]


In [190]:
def _raw_subset_features(subset_df):
    """Compute the un-smoothed feature values of one subset DataFrame.

    Returns a dict whose insertion order fixes the feature column order:
    the nine transition proportions, then std, cov, range, rrInt_var,
    rmean_var, rmssd, mad and iqr.
    """
    rr_ints = subset_df['rrInt']
    props = find_proportions(classify_rr_ints(subset_df))
    features = dict(zip(['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL'], props))

    std = np.std(rr_ints)
    features['std'] = std
    features['cov'] = std/np.mean(rr_ints)
    features['range'] = np.max(rr_ints)-np.min(rr_ints)
    features['rrInt_var'] = rr_ints.var()
    features['rmean_var'] = subset_df['rmean'].var()
    features['rmssd'] = extract_rmssd(subset_df)
    features['mad'] = stats.median_abs_deviation(rr_ints)
    features['iqr'] = stats.iqr(rr_ints)
    return features


def subset_features(subset_list, current_weight = 0.25, prev_weight = 0.75):
    """Compute exponentially smoothed features for every subset of a record.

    Each row of ``subset_list`` names a subset CSV under
    ``mit-bih-time-subsets/<subjectID>/<subjectID>-<position>.csv``. The first
    subset (treated as the calibration subset) contributes its raw feature
    values. Every later subset contributes
    ``raw * current_weight + previous smoothed value * prev_weight``.

    Two defects of the earlier version are corrected here:

    * ``range`` multiplied only the minimum by ``current_weight`` because of
      operator precedence; the whole ``max - min`` is now weighted.
    * ``cov`` divided the already-smoothed ``std`` by the current mean; it now
      uses the current subset's own std, like the first subset does.

    Parameters
    ----------
    subset_list : pandas.DataFrame
        One row per subset; must provide a ``subjectID`` column.
    current_weight : float
        Weight of the current subset's raw value.
    prev_weight : float
        Weight of the previous smoothed value.

    Returns
    -------
    pandas.DataFrame
        ``subset_list`` with the feature columns concatenated column-wise.
    """
    feature_dict = {}
    for x, subset in enumerate(subset_list.itertuples()):
        subset_df = pd.read_csv(os.path.normpath('mit-bih-time-subsets/'+str(subset.subjectID)+'/'+str(subset.subjectID)+"-"+str(x)+".csv"), index_col=0)
        raw = _raw_subset_features(subset_df)

        for name, value in raw.items():
            if x == 0:
                # First subset seeds every series with its raw value.
                feature_dict[name] = [value]
            else:
                feature_dict[name].append(value*current_weight + feature_dict[name][-1]*prev_weight)

    feature_df = pd.DataFrame(data=feature_dict)
    return pd.concat([subset_list, feature_df], axis=1)

In [191]:
features_df = {}
# Make sure the output folder exists; to_csv does not create it.
os.makedirs('mit-bih-time-features', exist_ok=True)
for record in tqdm(rlist):
    # subjectID is read as str so it is not parsed as a number.
    subset_list = pd.read_csv('mit-bih-time-subsets/'+record+'_subset_list.csv', index_col=0, dtype={'subjectID': str})
    features = subset_features(subset_list)

    features.to_csv(os.path.normpath('mit-bih-time-features/'+record+".csv"))

100%|███████████████████████████████████████████| 23/23 [08:48<00:00, 22.99s/it]
