In [17]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
import math
from collections import Counter

In [18]:
# Locate the subject list in the 'mit-bih-extracted' directory that sits next to
# the current working directory (i.e. inside the parent of the cwd).
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'

# The csv module expects files to be opened with newline='' so that it can do its
# own newline handling.
with open(records, newline='') as rfile: # reads in all of the subject IDs
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # Keep the first space-delimited field of every row. Blank lines come back
    # from csv.reader as empty lists and are skipped instead of raising IndexError.
    rlist = [row[0] for row in recordreader if row]

In [19]:
def classify_rr_ints(df):
    """Label every RR interval in ``df`` relative to its running mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``rrInt`` and ``rmean``; the two are read
        row by row, in order.

    Returns
    -------
    list of str
        One label per row of ``df``:
        ``'short'`` when ``rrInt`` is below 85% of ``rmean``,
        ``'long'`` when ``rrInt`` is above 115% of ``rmean``,
        ``'regular'`` otherwise (including when either comparison is false
        because a value is NaN).
    """
    def _label(interval, running_mean):
        # The 'short' test is evaluated first, then 'long'; anything left is 'regular'.
        if interval < 0.85*running_mean:
            return 'short'
        if interval > 1.15*running_mean:
            return 'long'
        return 'regular'

    return [_label(interval, running_mean)
            for interval, running_mean in zip(df['rrInt'], df['rmean'])]

def find_proportions(int_types): # take the interval types and count the transitions/return the proportions
    """Return the proportion of each type-to-type transition between adjacent intervals.

    Parameters
    ----------
    int_types : sequence of str
        Interval labels in temporal order; the labels that are counted are
        ``'short'``, ``'regular'`` and ``'long'``.

    Returns
    -------
    list of float
        Nine proportions in the fixed order
        ``[StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL]``
        (S = short, R = regular, L = long). Each one is the number of adjacent
        pairs of that kind divided by the total number of adjacent pairs,
        ``len(int_types) - 1``. Pairs containing any other label still count
        toward the denominator but toward none of the nine numerators.
        When there are fewer than two intervals there are no transitions, and
        nine zeros are returned instead of dividing by zero.
    """
    labels = ('short', 'regular', 'long')
    count = len(int_types) - 1
    if count < 1:
        # No adjacent pair exists; avoid ZeroDivisionError (one interval) and a
        # negative denominator (no intervals).
        return [0.0] * (len(labels) ** 2)

    # Tally every (current, next) pair in a single pass.
    transitions = Counter((int_types[idx], int_types[idx+1]) for idx in range(count))

    # The nested iteration order reproduces StoS, StoR, StoL, RtoS, ..., LtoL.
    return [transitions[(current, following)] / count
            for current in labels
            for following in labels]

def extract_rmssd(subset): # calculate the RMSSD of a subset
    """Root mean square of successive differences of the ``rrInt`` column.

    Parameters
    ----------
    subset : pandas.DataFrame
        Must provide an ``rrInt`` column; values are used in row order.

    Returns
    -------
    numpy scalar
        ``sqrt(sum(d_i ** 2) / n)`` where ``d_i`` are the differences between
        consecutive ``rrInt`` values and ``n`` is the number of differences.
    """
    successive = np.diff(subset['rrInt'].to_numpy())
    squared = np.square(successive)
    return np.sqrt(np.sum(squared) / squared.size)

def shannon_entropy(subset):
    """Shannon entropy, in bits, of the distinct items found by iterating ``subset``.

    Parameters
    ----------
    subset : sized iterable of hashable items
        Items are tallied with ``collections.Counter``; ``len(subset)`` is the
        denominator used to turn tallies into probabilities.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the probability ``p`` of each distinct item
        (``0`` when there are no items).
    """
    def _weighted_log(occurrences):
        # Contribution p * log2(p) of one distinct item.
        share = float(occurrences) / len(subset)
        return share * math.log(share, 2)

    tally = Counter(subset)
    return -sum(map(_weighted_log, tally.values()))


In [20]:
def _raw_window_features(window):
    """Compute the unsmoothed features of a single window, keyed by output column name.

    ``window`` must provide the ``rrInt`` and ``rmean`` columns. The insertion
    order of the returned dict defines the order of the feature columns.
    """
    rr = window['rrInt']
    int_types = classify_rr_ints(window)
    props = find_proportions(int_types)

    # Transition proportions, in the order find_proportions returns them.
    features = dict(zip(
        ('StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL'),
        props,
    ))

    std = np.std(rr)
    features['std'] = std
    # Coefficient of variation uses this window's own std, not a smoothed value.
    features['cov'] = std / np.mean(rr)
    features['range'] = np.max(rr) - np.min(rr)
    features['rrInt_var'] = rr.var()
    features['rmean_var'] = window['rmean'].var()
    features['rmssd'] = extract_rmssd(window)
    features['mad'] = stats.median_abs_deviation(rr)
    features['iqr'] = stats.iqr(rr)
    # Entropy is taken over the interval labels. Passing the DataFrame itself would
    # make Counter iterate the column labels and yield a constant.
    features['entropy'] = shannon_entropy(int_types)
    return features


def subset_features(record, subsetpath, current_weight = 0.25, prev_weight = 0.75):
    """Compute exponentially smoothed features for every window of one record.

    Parameters
    ----------
    record : str
        Record identifier; used to build the parquet file names.
    subsetpath : path-like supporting ``/``
        Directory containing ``<record>_subset_list.parquet`` and a ``<record>/``
        sub-directory with one ``<record>-<idx>.parquet`` file per window.
    current_weight : float, default 0.25
        Weight applied to the feature value computed from the current window.
    prev_weight : float, default 0.75
        Weight applied to the previous (already smoothed) feature value.

    Returns
    -------
    pandas.DataFrame
        The subset list with the feature columns concatenated column-wise.
        Row 0 holds the raw features of window 0 (the calibration window); each
        later row ``idx`` holds
        ``raw[idx] * current_weight + smoothed[idx-1] * prev_weight``.

    Notes
    -----
    The column-wise concat aligns on index labels; this assumes the subset list
    has a default 0..n-1 index -- TODO confirm against the file that writes it.
    """
    subset_list = pd.read_parquet(subsetpath / (record+'_subset_list.parquet')) # read the subset list for that subject

    # The first subset is the calibration window: its raw features seed every column.
    calib_df = pd.read_parquet(subsetpath / record / (record+"-"+str(0)+".parquet"))
    feature_dict = {name: [value] for name, value in _raw_window_features(calib_df).items()}

    # looping over all of the remaining windows
    for idx in tqdm(range(1, len(subset_list)), desc="Calculating features for each window"):
        window = pd.read_parquet(subsetpath / record / (record+"-"+str(idx)+".parquet")) # read the window

        # Apply the same weighting to every feature: the whole raw value is scaled
        # by current_weight and blended with the previous smoothed value.
        for name, value in _raw_window_features(window).items():
            history = feature_dict[name]
            history.append(value*current_weight + history[-1]*prev_weight)

    feature_df = pd.DataFrame(data=feature_dict) # make a DataFrame out of the feature dictionary
    return pd.concat([subset_list, feature_df], axis=1) # return the features DataFrame combined with the subset list DataFrame

In [21]:
# Input (per-window subsets) and output (per-record features) directories, both
# siblings of the current working directory.
subsetpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-subsets/')
featurespath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-features/')
# Create the output directory if needed; exist_ok avoids the separate
# check-then-create step and makes the cell safe to re-run.
os.makedirs(featurespath, exist_ok=True)

for record in tqdm(rlist): # calculate the features for all of the subjects
    features = subset_features(record, subsetpath)
    features.to_parquet(featurespath / (record+".parquet")) # and then write them to disk

  0%|          | 0/23 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/10975 [00:00<?, ?it/s]

KeyboardInterrupt: 