In [5]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
import math
from collections import Counter, defaultdict

In [3]:
# Collect the subject/record IDs: the first space-delimited field of every
# non-empty row in <parent of cwd>/mit-bih-extracted/subject_list.csv.
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'
with open(records, newline='') as rfile: # newline='' lets the csv module handle line endings itself
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # csv.reader yields [] for a blank line; skip those so row[0] cannot raise IndexError
    rlist = [row[0] for row in recordreader if row]

In [4]:
def classify_rr_ints(df):
    """Label every row of `df` as 'short', 'long' or 'regular'.

    Each row must expose `rrInt` and `rmean` attributes (columns). A row is
    'short' when rrInt is below 85% of rmean, 'long' when it is above 115%
    of rmean, and 'regular' otherwise.

    Returns a list of labels, one per row, in row order.
    """
    def label_for(interval, running_mean):
        # below 85% of the running mean
        if interval < 0.85*running_mean:
            return 'short'
        # above 115% of the running mean
        if interval > 1.15*running_mean:
            return 'long'
        # everything in between (bounds included)
        return 'regular'

    return [label_for(row.rrInt, row.rmean) for row in df.itertuples()]

def find_proportions(int_types): # take the interval types and count the transitions/return the proportions
    """Return the proportion of each label-to-label transition in `int_types`.

    Parameters
    ----------
    int_types : sequence of str
        Interval labels ('short', 'regular', 'long') in time order.

    Returns
    -------
    list of float
        Nine proportions in the order
        [StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL],
        each being the number of consecutive pairs of that kind divided by
        the total number of consecutive pairs (len(int_types) - 1).
        Pairs involving any other label count towards the total but not
        towards any of the nine proportions.
        With fewer than two labels there are no transitions, and nine
        zeros are returned instead of dividing by zero.
    """
    labels = ('short', 'regular', 'long')
    count = len(int_types) - 1
    if count < 1:
        # no consecutive pairs exist, so every proportion is zero
        return [0.0] * (len(labels) ** 2)

    # tally every (current, next) pair in a single pass
    transitions = Counter(zip(int_types[:-1], int_types[1:]))
    return [transitions[(src, dst)] / count for src in labels for dst in labels]

def extract_rmssd(rrInts): # calculate the RMSSD of a subset
    """Root mean square of successive differences of `rrInts`.

    Takes the first-order differences of the sequence, squares and sums
    them, divides by the number of differences and returns the square root.
    """
    successive = np.diff(rrInts)
    mean_square = np.sum(np.square(successive)) / len(successive)
    return np.sqrt(mean_square)

def shannon_entropy(subset):
    """Shannon entropy (base 2) of the distinct values found in `subset`.

    Each distinct value's probability is its occurrence count divided by
    len(subset); the result is -sum(p * log2(p)) over those probabilities.
    An empty `subset` yields 0.
    """
    total = len(subset)
    weighted_logs = 0
    # one term per distinct value, in first-seen order
    for occurrences in Counter(subset).values():
        p = float(occurrences) / total
        weighted_logs += p * math.log(p, 2)
    return -weighted_logs

def approx_entropy(subset, m=2, r=None):
    """Approximate entropy of a one-dimensional numeric sequence.

    Parameters
    ----------
    subset : sequence of numbers (list, ndarray, pandas Series, ...)
        The samples, in order. They are always read positionally, so a
        pandas Series whose index does not start at 0 (e.g. an `.iloc`
        slice) is handled correctly.
    m : int
        Template length; templates of length m and m + 1 are compared.
    r : float or None
        Tolerance for two templates to count as matching. Defaults to
        0.2 times the (population) standard deviation of `subset`.

    Returns
    -------
    float
        abs(phi(m + 1) - phi(m)).

    Raises
    ------
    ValueError
        If `subset` has m or fewer samples, so that no template of
        length m + 1 exists.

    Notes
    -----
    All template pairs are compared at once, so temporary memory grows
    with the square of len(subset).
    """
    # A plain array guarantees positional indexing regardless of any index labels.
    values = np.asarray(subset, dtype=float)
    N = len(values)
    if N <= m:
        raise ValueError(
            "approx_entropy needs more than m=%d samples, got %d" % (m, N)
        )

    if r is None:
        r = 0.2 * np.std(values)

    def _phi(m):
        n_templates = N - m + 1
        # every run of m consecutive samples, one per row
        templates = np.array([values[i:i + m] for i in range(n_templates)])
        # largest element-wise gap between every pair of templates
        dists = np.abs(templates[:, None, :] - templates[None, :, :]).max(axis=2)
        # fraction of templates within tolerance r of each template (self included)
        C = (dists <= r).sum(axis=1) / float(n_templates)
        return np.log(C).sum() / n_templates

    return abs(_phi(m + 1) - _phi(m))

def calc_range(arr):
    """Return the spread of `arr`: its maximum minus its minimum."""
    highest = np.max(arr)
    lowest = np.min(arr)
    return highest - lowest

def calc_cov(arr):
    """Return the coefficient of variation of `arr`: np.std(arr) / np.mean(arr)."""
    spread = np.std(arr)
    centre = np.mean(arr)
    return spread / centre

# usage
# subset is a list of RR intervals (e.g. the 'rrInt' column of one window)


In [None]:
def subset_features(record, subsetpath, N = 200, current_weight = 0.25, prev_weight = 0.75):
    """Compute weighted window features for one record.

    Reads `<subsetpath>/<record>_subset_list.parquet` and the per-subset files
    `<subsetpath>/<record>/<record>-<idx>.parquet` for idx in
    1 .. len(subset_list) - 1, concatenates the latter into one frame (which
    must provide 'rrInt' and 'rmean' columns), and slides over its rows.
    At every position each feature is evaluated on the current window
    (`window_size` rows) and on the previous window (the `N * window_size` rows
    before it) and stored as
    `current * current_weight + previous * prev_weight`.

    Parameters
    ----------
    record : str
        Record / subject identifier used to build the file names.
    subsetpath : path-like supporting the `/` operator
        Directory that holds the subset list and the per-record subset folder.
    N : int
        Length of the previous window, in units of `window_size` rows.
    current_weight, prev_weight : float
        Weights applied to the current-window and previous-window values.

    Returns
    -------
    pandas.DataFrame
        The subset list and the feature columns concatenated column-wise.
    """
    subset_list = pd.read_parquet(subsetpath / (record+'_subset_list.parquet')) # read the subset list for that subject
    feature_dict = defaultdict(list) # create a dictionary to store all of the features
    all_intervals = pd.concat([pd.read_parquet(subsetpath / record / (record+"-"+str(idx)+".parquet")) for idx in range(1, len(subset_list))], ignore_index=True)

    window_size = 4
    # same order as the list returned by find_proportions
    transition_names = ['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL']

    def weighted(current_value, previous_value):
        # blend the current-window value with the previous-window value
        return current_value*current_weight + previous_value*prev_weight

    # looping over all of the windows
    # NOTE(review): idx advances one row at a time, not window_size rows - confirm this is intended
    for idx in tqdm(range(N*window_size, (len(subset_list)*window_size) - window_size), desc="Calculating features for each window"):
        previous_window = all_intervals.iloc[idx - N*window_size:idx]
        current_window = all_intervals.iloc[idx:idx + window_size]

        # classify each window once; the labels feed both the transition
        # proportions and the Shannon entropy
        current_types = classify_rr_ints(current_window)
        previous_types = classify_rr_ints(previous_window)

        # calculate the features while including the weights
        current_props = find_proportions(current_types)
        previous_props = find_proportions(previous_types)
        for name, current_prop, previous_prop in zip(transition_names, current_props, previous_props):
            feature_dict[name].append(weighted(current_prop, previous_prop))

        current_rr = current_window['rrInt']
        previous_rr = previous_window['rrInt']

        feature_dict['std'].append(weighted(np.std(current_rr), np.std(previous_rr)))
        feature_dict['cov'].append(weighted(calc_cov(current_rr), calc_cov(previous_rr)))
        feature_dict['range'].append(weighted(calc_range(current_rr), calc_range(previous_rr)))
        feature_dict['rrInt_var'].append(weighted(current_rr.var(), previous_rr.var()))
        feature_dict['rmean_var'].append(weighted(current_window['rmean'].var(), previous_window['rmean'].var()))
        feature_dict['rmssd'].append(weighted(extract_rmssd(current_rr), extract_rmssd(previous_rr)))
        feature_dict['mad'].append(weighted(stats.median_abs_deviation(current_rr), stats.median_abs_deviation(previous_rr)))
        feature_dict['iqr'].append(weighted(stats.iqr(current_rr), stats.iqr(previous_rr)))
        # entropy is taken over the interval labels; passing the DataFrame itself
        # would make Counter iterate over its column names
        feature_dict['entropy'].append(weighted(shannon_entropy(current_types), shannon_entropy(previous_types)))
        # positional arrays: the windows are .iloc slices whose index labels do not
        # start at 0, and the second term must come from the previous window
        feature_dict['approx_entropy'].append(weighted(approx_entropy(current_rr.to_numpy()), approx_entropy(previous_rr.to_numpy())))

    feature_df = pd.DataFrame(data=feature_dict) # make a DataFrame out of the feature dictionary
    # NOTE(review): concat aligns on index; feature_df has one row per loop position
    # while subset_list has one row per subset - confirm the rows are meant to line up
    return pd.concat([subset_list, feature_df], axis=1) # return the features DataFrame combined with the subset list DataFrame


In [None]:
# Input (subsets) and output (features) directories sit next to the working directory.
parent_dir = Path(os.getcwd()).parents[0]
subsetpath = PurePath(parent_dir, 'mit-bih-time-subsets-stepping/')
featurespath = PurePath(parent_dir, 'mit-bih-time-features-stepping/')
if not os.path.exists(featurespath):
    os.mkdir(featurespath)

# For every subject: compute its feature table, then write it to
# <featurespath>/<record>.parquet.
for record in tqdm(rlist):
    features = subset_features(record, subsetpath)
    output_file = featurespath / (record+".parquet")
    features.to_parquet(output_file)