In [74]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import wfdb
import copy as cp
import scipy.signal as signal
import scipy.stats as stats
from sklearn import preprocessing
from tqdm import tqdm
import os
import re
import pandas as pd
import pickle
import csv

In [2]:
# Gather the record identifiers listed in the subject index file.
records = os.path.normpath('mit-bih-dataframes/subject_list.csv')
rlist = []
with open(records) as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # Only the first space-delimited field of each row is kept.
    rlist.extend(fields[0] for fields in recordreader)

In [3]:
# Read every record's extracted table into a dict keyed by record id.
subject_dfs = {}
for record in tqdm(rlist):
    csv_path = os.path.normpath('mit-bih-extracted/' + record + '_full.csv')
    # The first two CSV columns become the row MultiIndex.
    subject_dfs[record] = pd.read_csv(csv_path, index_col=[0, 1])

100%|███████████████████████████████████████████| 23/23 [00:01<00:00, 12.36it/s]


In [10]:
# Preview the first five rows of record 04015.
subject_dfs['04015'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rr_int,rr_int_seconds,rhythmLabel,rmean,rmean_seconds,drmean,rr_variance
Unnamed: 0_level_1,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,139,0.556,N,139.0,0.556,0.0,0.0
1,0.632,158,0.632,N,143.75,0.575,4.75,0.132174
2,1.536,226,0.904,N,164.3125,0.65725,20.5625,0.413846
3,2.116,145,0.58,N,159.484375,0.637938,-4.828125,0.507887
4,2.692,144,0.576,N,155.613281,0.622453,-3.871094,0.006426


In [8]:
def classify_rr_ints(df, short_ratio=0.85, long_ratio=1.15):
    """Label each RR interval as 'short', 'long' or 'regular'.

    Every row's ``rr_int`` is compared against that same row's running
    mean ``rmean``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``rr_int`` and ``rmean`` columns.
    short_ratio : float, optional
        An interval strictly below ``short_ratio * rmean`` is 'short'.
        Defaults to 0.85 (85% of the running mean).
    long_ratio : float, optional
        An interval strictly above ``long_ratio * rmean`` is 'long'.
        Defaults to 1.15 (115% of the running mean).

    Returns
    -------
    list of str
        One label per row, in row order. Rows for which neither
        comparison holds (including NaN values, whose comparisons are
        always false) are labelled 'regular'.
    """
    def label(rr_int, rmean):
        # 'short' is tested first, matching the original if/elif order.
        if rr_int < short_ratio * rmean:
            return 'short'
        if rr_int > long_ratio * rmean:
            return 'long'
        return 'regular'

    return [label(row.rr_int, row.rmean) for row in df.itertuples()]

In [12]:
# Label the first 100 RR intervals of record 04015 and print the sequence.
first_hundred = subject_dfs['04015'].iloc[:100]
interval_types = classify_rr_ints(first_hundred)
print(interval_types)

['regular', 'regular', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'short', 'short', 'short', 'long', 'long', 'short', 'regular', 'long', 'short', 'regular', 'long', 'long', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'short', 'short', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'long', 'long', 'regular', 'short', 'long', 'short', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'short', 'regular', 'long', 'long', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'short', 'short', 'regular', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'short', 'regular', 'long', 'short', 'short', 'long', 'short', 'short', 'short', 'long', 'long', 'short']


In [30]:
def find_proportions(int_types):
    """Count transitions between consecutive interval labels.

    Despite the name, the result holds raw counts, not normalised
    proportions.

    Parameters
    ----------
    int_types : sequence of str
        Labels drawn from 'short', 'regular' and 'long'.

    Returns
    -------
    list of int
        Nine counts ordered as
        [StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL],
        where e.g. StoR is the number of times a 'short' label is
        immediately followed by a 'regular' one. Pairs containing any
        other label are not counted.
    """
    labels = ('short', 'regular', 'long')
    # Row-major position in the 3x3 from/to grid: 3 * from_index + to_index.
    slot = {
        (src, dst): 3 * i + j
        for i, src in enumerate(labels)
        for j, dst in enumerate(labels)
    }

    tallies = [0] * 9
    for pair in zip(int_types, int_types[1:]):
        position = slot.get(pair)
        if position is not None:
            tallies[position] += 1

    return tallies

In [35]:
def find_rrv(df):
    """Compute the beat-to-beat RR change relative to the running mean.

    For every row after the first the value is
    ``abs(rr_int[i] - rr_int[i-1]) / rmean[i]``; the first row has no
    predecessor and is assigned 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``rr_int`` and ``rmean`` columns.

    Returns
    -------
    list
        One value per row of ``df``, in row order. An empty frame yields
        an empty list.
    """
    if len(df) == 0:
        return []

    rr = df['rr_int'].to_numpy(dtype=float)
    rmean = df['rmean'].to_numpy(dtype=float)

    # np.diff pairs each interval with its immediate predecessor. The
    # earlier loop never advanced the "previous" value, so every row was
    # compared against the first interval instead.
    rrvs = [0]
    rrvs.extend((np.abs(np.diff(rr)) / rmean[1:]).tolist())
    return rrvs

44.1587635696472

In [97]:
def subset_features(df):
    """Summarise a slice of RR-interval rows as a dict of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``rr_int``, ``rmean``, ``rr_variance`` and
        ``rhythmLabel`` columns.

    Returns
    -------
    dict
        Keys, in insertion order: 'length', 'props', 'std', 'cov',
        'range', 'rmean', 'rrv', 'rmean_var', 'mad', 'iqr', 'rhythm'.
    """
    rr = df['rr_int']
    running_mean = df['rmean']
    # NOTE(review): 'std' goes through np.std while 'rmean_var' uses
    # Series.var(); their default ddof differs (0 vs 1) -- confirm intended.
    spread = np.std(rr)

    return {
        'length': len(df),
        # Transition counts between consecutive short/regular/long labels.
        'props': find_proportions(classify_rr_ints(df)),
        'std': spread,
        'cov': spread / np.mean(rr),
        'range': np.max(rr) - np.min(rr),
        'rmean': running_mean.tolist(),
        # Copied straight from the precomputed 'rr_variance' column.
        'rrv': df['rr_variance'].tolist(),
        'rmean_var': running_mean.var(),
        'mad': stats.median_abs_deviation(rr),
        'iqr': stats.iqr(rr),
        # Most frequent rhythm label; the first one is taken on ties.
        'rhythm': df['rhythmLabel'].mode().iloc[0],
    }

In [79]:
# Smoke-test the feature extractor on a three-row slice of record 04015.
three_row_slice = subject_dfs['04015'].iloc[1000:1003]
print(subset_features(three_row_slice))

{'props': [0, 0, 0, 0, 2, 0, 0, 0, 0], 'std': 1.247219128924647, 'cov': 0.00637420338462341, 'range': 3, 'rmean': [197.0, 196.25, 196.1875], 'rrv': [0.1319796954314721, 0.0152866242038216, 0.0101943294042688], 'rmean_var': 0.20442708333333334, 'mad': 1.0, 'iqr': 1.5}


In [95]:
def feature_gen(df, calibsize, subsetsize, cumulative_range, verbose=True):
    """Extract features for a calibration window and for sliding windows.

    The first ``calibsize`` rows form the calibration window. The rest is
    cut into consecutive subsets of ``subsetsize`` rows; each feature
    window covers the current subset plus the ``cumulative_range``
    subsets before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows accepted by ``subset_features``.
    calibsize : int
        Number of leading rows used for calibration.
    subsetsize : int
        Number of rows per subset.
    cumulative_range : int
        Number of preceding subsets included in each window.
    verbose : bool, optional
        When True (default) the calibration features and every window's
        features are printed, as before.

    Returns
    -------
    list of dict
        One ``subset_features`` result per window, in order. The
        calibration features are printed (when ``verbose``) but are not
        part of the returned list.
    """
    # Use the caller's calibration size rather than a fixed 100 rows.
    calib_features = subset_features(df.head(calibsize))

    if verbose:
        print('Calibration features: ')
        print(calib_features)

    # Only whole subsets after the calibration rows are considered.
    subsetcount = (len(df) - calibsize) // subsetsize

    cumulative_features = []
    for i in range(cumulative_range, subsetcount):
        start = (i - cumulative_range) * subsetsize + calibsize
        stop = (i + 1) * subsetsize + calibsize
        features = subset_features(df.iloc[start:stop])
        if verbose:
            print(features)
        cumulative_features.append(features)

    # Previously the collected list was built but never handed back.
    return cumulative_features

In [98]:
# First 1000 intervals of record 04015: 100 calibration rows, 5-row subsets,
# each window spanning the current subset plus the 3 before it.
feature_gen(subject_dfs['04015'].head(1000), calibsize=100, subsetsize=5, cumulative_range=3)

Calibration features: 
{'length': 100, 'props': [14, 6, 5, 4, 36, 12, 8, 9, 5], 'std': 44.1587635696472, 'cov': 0.2467797226424902, 'range': 172, 'rmean': [139.0, 143.75, 164.3125, 159.484375, 155.61328125, 176.4599609375, 169.344970703125, 164.25872802734375, 181.6940460205078, 189.2705345153809, 196.20290088653564, 197.65217566490173, 199.9891317486763, 201.49184881150725, 185.3688866086304, 172.7766649564728, 162.3324987173546, 152.99937403801596, 143.99953052851197, 162.99964789638398, 178.24973592228798, 167.187301941716, 161.890476456287, 183.66785734221529, 172.50089300666144, 153.0, 173.25, 184.1875, 193.390625, 198.79296875, 200.3447265625, 202.508544921875, 187.13140869140625, 175.0985565185547, 162.07391738891602, 151.555438041687, 142.41657853126526, 148.31243389844894, 141.9843254238367, 137.23824406787753, 133.42868305090815, 137.3215122881811, 138.24113421613583, 165.43085066210188, 179.3231379965764, 187.7423534974323, 173.80676512307423, 202.10507384230567, 188.0788053