In [44]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
import math
from collections import Counter

In [45]:
rlist = []
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted-nocalibration/')
records = extractedpath / 'subject_list.csv'
with open(records) as rfile: # reads in all of the subject IDs
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        rlist.append(row[0])

In [46]:
def classify_rr_ints(df):
    #list of types of rr_ints for each subject
    subject_types = []
    for row in df.itertuples():
        if row.rrInt < 0.85*row.rmean: 
            #if rr_ints is less than 85% of runningmean
            #label subject type as short
            subject_types.append('short')
        elif row.rrInt > 1.15*row.rmean: 
            #if rr_ints is greater than 115% of runningmean
            #label subject type as long
            subject_types.append('long')
        else:
            #label subject type as regular
            subject_types.append('regular')
    
    return subject_types

def find_proportions(int_types): # take the interval types and count the transitions/return the proportions
    StoS = 0
    StoR = 0
    StoL = 0
    RtoS = 0
    RtoR = 0
    RtoL = 0
    LtoS = 0
    LtoR = 0
    LtoL = 0
    for idx in range(len(int_types)-1):
        if int_types[idx]=='short' and int_types[idx+1]=='short':
            StoS+=1
        elif int_types[idx]=='short' and int_types[idx+1]=='regular':
            StoR+=1
        elif int_types[idx]=='short' and int_types[idx+1]=='long':
            StoL+=1
        elif int_types[idx]=='regular' and int_types[idx+1]=='short':
            RtoS+=1
        elif int_types[idx]=='regular' and int_types[idx+1]=='regular':
            RtoR+=1
        elif int_types[idx]=='regular' and int_types[idx+1]=='long':
            RtoL+=1
        elif int_types[idx]=='long' and int_types[idx+1]=='short':
            LtoS+=1
        elif int_types[idx]=='long' and int_types[idx+1]=='regular':
            LtoR+=1
        elif int_types[idx]=='long' and int_types[idx+1]=='long':
            LtoL+=1
    
    count = len(int_types)-1
    print(int_types)
    return [StoS/count, StoR/count, StoL/count, RtoS/count, RtoR/count, RtoL/count, LtoS/count, LtoR/count, LtoL/count]

def extract_rmssd(subset): # calculate the RMSSD of a subset
    rrInts = subset['rrInt'].to_numpy()
    diffs = np.diff(rrInts)
    sum_of_squares = np.sum(diffs**2)
    return np.sqrt(sum_of_squares/len(diffs))

def shannon_entropy(subset):
    # Get the frequency of each rrint classification in the data
    frequencies = Counter(subset)
    
    # Calculate the probability of each classification
    probabilities = [float(frequency) / len(subset) for frequency in frequencies.values()]
    
    # Calculate the Shannon entropy
    entropy = -sum(probability * math.log(probability, 2) for probability in probabilities)
    
    return entropy
def approx_entropy(subset, m=2, r=None):
    def _maxdist(x_i, x_j):
        return max([abs(ua - va) for ua, va in zip(x_i, x_j)])

    def _phi(m):
        x = [[subset[j] for j in range(i, i + m - 1 + 1)] for i in range(N - m + 1)]
        C = [len([1 for x_j in x if _maxdist(x_i, x_j) <= r]) / (N - m + 1.0) for x_i in x]
        return (N - m + 1.0)**(-1) * sum(np.log(C))

    # Ensure subset has a default integer index
    subset = subset.reset_index(drop=True)

    N = len(subset)

    if r is None:
        r = 0.2 * np.std(subset)

    return abs(_phi(m+1) - _phi(m))

# usage
# subset is a list of RR intervals fr


In [47]:
N = 800

def subset_features(record, subsetpath, current_weight = 0.25, prev_weight = 0.75):
    bigDataframe = pd.concat([pd.read_parquet(subsetpath / record / (record+"-"+str(idx)+".parquet")) for idx in range(N+1)])

    feature_dict = {} 
    init_data = bigDataframe.iloc[:N]
    props = find_proportions(classify_rr_ints(init_data))
    feature_dict['StoS'] = [props[0]]
    feature_dict['StoR'] = [props[1]]
    feature_dict['StoL'] = [props[2]]
    feature_dict['RtoS'] = [props[3]]
    feature_dict['RtoR'] = [props[4]]
    feature_dict['RtoL'] = [props[5]]
    feature_dict['LtoS'] = [props[6]]
    feature_dict['LtoR'] = [props[7]]
    feature_dict['LtoL'] = [props[8]]

    feature_dict['std'] = [np.std(init_data['rrInt'])]
    feature_dict['cov'] = [feature_dict['std'][0]/np.mean(init_data['rrInt'])]
    feature_dict['range'] = [np.max(init_data['rrInt'])-np.min(init_data['rrInt'])]
    feature_dict['rrInt_var'] = [init_data['rrInt'].var()]
    feature_dict['rmean_var'] = [init_data['rmean'].var()]
    feature_dict['rmssd'] = [extract_rmssd(init_data)]
    feature_dict['mad'] = [stats.median_abs_deviation(init_data['rrInt'])]
    feature_dict['iqr'] = [stats.iqr(init_data['rrInt'])]
    feature_dict['entropy'] = [shannon_entropy(init_data)]
    feature_dict['approx_entropy'] = [approx_entropy(init_data['rrInt'])]

    for idx in tqdm(range(N+4, len(bigDataframe)), desc="Calculating features for each window"):
        current_window = bigDataframe.iloc[[idx]]
        previous_window = bigDataframe.iloc[idx - N: idx]


        prev_props = find_proportions(classify_rr_ints(previous_window))
        prev_std = np.std(previous_window['rrInt'])
        prev_cov = prev_std/np.mean(previous_window['rrInt'])
        prev_range = np.max(previous_window['rrInt'])-np.min(previous_window['rrInt'])
        prev_rrInt_var = previous_window['rrInt'].var()
        prev_rmean_var = previous_window['rmean'].var()
        prev_rmssd = extract_rmssd(previous_window)
        prev_mad = stats.median_abs_deviation(previous_window['rrInt'])
        prev_iqr = stats.iqr(previous_window['rrInt'])
        prev_entropy = shannon_entropy(previous_window)
        prev_approx_entropy = approx_entropy(previous_window['rrInt'])


        curr_props = find_proportions(classify_rr_ints(current_window))
        curr_std = np.std(current_window['rrInt'])
        curr_cov = curr_std/np.mean(current_window['rrInt'])
        curr_range = np.max(current_window['rrInt'])-np.min(current_window['rrInt'])
        curr_rrInt_var = current_window['rrInt'].var()
        curr_rmean_var = current_window['rmean'].var()
        curr_rmssd = extract_rmssd(current_window)
        curr_mad = stats.median_abs_deviation(current_window['rrInt'])
        curr_iqr = stats.iqr(current_window['rrInt'])
        curr_entropy = shannon_entropy(current_window)
        curr_approx_entropy = approx_entropy(current_window['rrInt'])

        feature_dict['StoS'].append(curr_props[0]*current_weight + prev_props[0]*prev_weight)
        feature_dict['StoR'].append(curr_props[1]*current_weight + prev_props[1]*prev_weight)
        feature_dict['StoL'].append(curr_props[2]*current_weight + prev_props[2]*prev_weight)
        feature_dict['RtoS'].append(curr_props[3]*current_weight + prev_props[3]*prev_weight)
        feature_dict['RtoR'].append(curr_props[4]*current_weight + prev_props[4]*prev_weight)
        feature_dict['RtoL'].append(curr_props[5]*current_weight + prev_props[5]*prev_weight)
        feature_dict['LtoS'].append(curr_props[6]*current_weight + prev_props[6]*prev_weight)
        feature_dict['LtoR'].append(curr_props[7]*current_weight + prev_props[7]*prev_weight)
        feature_dict['LtoL'].append(curr_props[8]*current_weight + prev_props[8]*prev_weight)

        feature_dict['std'].append(curr_std*current_weight + prev_std*prev_weight)
        feature_dict['cov'].append(curr_cov*current_weight + prev_cov*prev_weight)
        feature_dict['range'].append(curr_range*current_weight + prev_range*prev_weight)
        feature_dict['rrInt_var'].append(curr_rrInt_var*current_weight + prev_rrInt_var*prev_weight)
        feature_dict['rmean_var'].append(curr_rmean_var*current_weight + prev_rmean_var*prev_weight)
        feature_dict['rmssd'].append(curr_rmssd*current_weight + prev_rmssd*prev_weight)
        feature_dict['mad'].append(curr_mad*current_weight + prev_mad*prev_weight)
        feature_dict['iqr'].append(curr_iqr*current_weight + prev_iqr*prev_weight)
        feature_dict['entropy'].append(curr_entropy*current_weight + prev_entropy*prev_weight)
        feature_dict['approx_entropy'].append(curr_approx_entropy*current_weight + prev_approx_entropy*prev_weight)

    return feature_dict


In [48]:
subsetpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-subsets-nocalibration/')
featurespath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-features-nocalibration/')
if not os.path.exists(featurespath):
    os.mkdir(featurespath)

for record in tqdm(rlist): # calculate the features for all of the subjects
    features = subset_features(record, subsetpath)
    features.to_parquet(featurespath / (record+".parquet")) # and then write them to disk

  0%|          | 0/23 [00:00<?, ?it/s]

['regular', 'regular', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'regular', 'short', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'regular', 'short', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'short', 'regular', 'long', 'regular', 'regular', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'regular', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'regular', 'regular', 

Calculating features for each window:   0%|          | 0/2400 [00:00<?, ?it/s]

['regular', 'long', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'regular', 'short', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'regular', 'short', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'short', 'short', 'regular', 'long', 'regular', 'regular', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'long', 'regular', 'regular', 'long', 'regular', 'long', 'regular', 'regular', 'long', 'regular', 'regular', 'long', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'regular', 'regular', 'regular', 'regular', 'regular', 'regular', 'short', 'regular', 'regular', 'short', 'regular', 'regular', 'regular',

ZeroDivisionError: division by zero