In [None]:
import numpy as np
import pandas as pd
import antropy as ant
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
from collections import Counter, defaultdict
import time

In [None]:
rlist = []
# The extracted data lives in 'mit-bih-extracted/' next to the current working directory
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'
# Collect the subject IDs: the first space-delimited field of every row in subject_list.csv.
# newline='' lets the csv module do its own line-ending handling, as its documentation asks.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    for row in recordreader:
        if not row: # a blank line parses to an empty row; row[0] would raise IndexError
            continue
        rlist.append(row[0])

In [None]:
def classify_rr_ints(df):
    """Label every row of ``df`` by comparing its 'rrInt' value with its 'rmean' value.

    Returns an array with one code per row:
    0 when rrInt is below 85% of rmean, 2 when rrInt is above 115% of rmean, 1 otherwise.
    """
    rr_interval = df['rrInt']
    reference = df['rmean']
    too_short = rr_interval < (0.85 * reference)
    too_long = rr_interval > (1.15 * reference)
    # the first matching condition wins; rows matching neither fall back to the default code 1
    return np.select([too_short, too_long], [0, 2], default=1)

def find_proportions(int_types): # take the interval types and count the transitions/return the proportions
    """Return the proportion of each transition between consecutive interval codes.

    ``int_types`` is a sequence of codes 0, 1 and 2 (named S, R and L in the result order).
    Every adjacent pair (code[i], code[i+1]) counts as one transition.

    Returns a list of nine floats ordered
    [StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL], each being the number of
    occurrences of that transition divided by the total number of transitions
    (``len(int_types) - 1``). If there are fewer than two codes there are no transitions
    and nine zeros are returned instead of dividing by zero.
    """
    # asarray makes the element-wise comparisons below work for plain lists as well as arrays
    codes = np.asarray(int_types)
    count = len(codes) - 1
    if count < 1:
        return [0.0] * 9

    first = codes[:-1]
    second = codes[1:]
    # source code varies slowest, destination code fastest, which gives the documented order
    return [
        np.count_nonzero(np.logical_and(first == source, second == destination)) / count
        for source in (0, 1, 2)
        for destination in (0, 1, 2)
    ]

def extract_rmssd(rrInts): # calculate the RMSSD of a subset
    """Return the root mean square of the successive differences of ``rrInts``."""
    successive = np.diff(rrInts)
    mean_square = np.sum(successive ** 2) / len(successive)
    return np.sqrt(mean_square)

def shannon_entropy(subset):
    """Return the Shannon entropy (base 2) of the distribution of values in ``subset``."""
    # Only the occurrence counts matter; the distinct values themselves are discarded
    _, occurrences = np.unique(subset, return_counts=True)

    # Relative frequency of each distinct value
    share = occurrences / len(subset)

    return -np.sum(share * np.log2(share))

def approx_entropy(subset, m=2, r=None):
    """Return the approximate entropy of ``subset``.

    ``m`` is the template length; the result is ``abs(phi(m + 1) - phi(m))``.
    ``r`` is the tolerance under which two templates count as matching; when it is None,
    0.2 times ``np.std(subset)`` is used.
    """
    n_points = len(subset)
    tolerance = 0.2 * np.std(subset) if r is None else r

    def largest_gap(left, right):
        # biggest element-wise absolute difference between two templates
        return max(abs(a - b) for a, b in zip(left, right))

    def phi(dim):
        n_templates = n_points - dim + 1.0
        # every run of `dim` consecutive values is one template
        templates = [
            [subset[k] for k in range(start, start + dim)]
            for start in range(n_points - dim + 1)
        ]
        # for each template: fraction of all templates (itself included) within the tolerance
        match_fraction = [
            sum(1 for other in templates if largest_gap(template, other) <= tolerance) / n_templates
            for template in templates
        ]
        return n_templates**(-1) * np.sum(np.log(match_fraction))

    return abs(phi(m + 1) - phi(m))

def calc_range(arr):
    """Return the spread of ``arr``: its largest value minus its smallest value."""
    largest = np.max(arr)
    smallest = np.min(arr)
    return largest - smallest

def calc_cov(arr):
    """Return the coefficient of variation of ``arr``: ``np.std(arr)`` over ``np.mean(arr)``."""
    spread = np.std(arr)
    centre = np.mean(arr)
    return spread / centre

In [None]:
def subset_features(record, subsetpath, N = 50, current_weight = 0.25, prev_weight = 0.75, window_size = 4):
    """Build the weighted feature table for one record.

    Reads ``<record>_subset_list.parquet`` from ``subsetpath`` and the per-window files
    ``<record>/<record>-<idx>.parquet``, concatenating the latter into one interval table
    (columns 'rrInt' and 'rmean' are used). For every start position ``idx`` the
    ``window_size`` rows starting at ``idx`` form the current window and the
    ``N*window_size`` rows before ``idx`` form the previous window. Each feature is computed
    on both windows and stored as ``current*current_weight + previous*prev_weight``.

    Parameters:
        record: record/subject identifier used to build the file names.
        subsetpath: directory holding the subset list and the per-record window files.
        N: number of ``window_size``-row chunks that make up the previous window.
        current_weight: weight applied to the feature value of the current window.
        prev_weight: weight applied to the feature value of the previous window.
        window_size: number of rows in the current window (previously hard-coded to 4).

    Returns:
        The subset list DataFrame concatenated column-wise (axis=1) with the feature DataFrame.
    """
    subset_list = pd.read_parquet(subsetpath / (record+'_subset_list.parquet')) # read the subset list for that subject
    feature_dict = defaultdict(list) # feature name -> one value per processed start position

    # NOTE(review): files numbered 1 .. len(subset_list)-1 are read; confirm that no file
    # numbered len(subset_list) exists that should be included as well.
    all_intervals = pd.concat(
        [pd.read_parquet(subsetpath / record / (record+"-"+str(idx)+".parquet"))
         for idx in tqdm(range(1, len(subset_list)), desc="Reading all windows")],
        ignore_index=True)

    def weighted(current_value, previous_value):
        # blend the value of the current window with the value of the previous window
        return current_value*current_weight + previous_value*prev_weight

    # same order as the list returned by find_proportions
    transition_names = ('StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL')
    # (feature name, source column, statistic) in the order the columns appear in the output
    column_statistics = (
        ('std', 'rrInt', np.std),
        ('cov', 'rrInt', calc_cov),
        ('range', 'rrInt', calc_range),
        ('rrInt_var', 'rrInt', lambda column: column.var()),
        ('rmean_var', 'rmean', lambda column: column.var()),
        ('rmssd', 'rrInt', extract_rmssd),
        ('mad', 'rrInt', stats.median_abs_deviation),
        ('iqr', 'rrInt', stats.iqr),
    )

    # looping over all of the windows
    # NOTE(review): idx advances one row at a time (not by window_size), so consecutive
    # windows overlap and the number of feature rows is not tied to len(subset_list);
    # confirm this stride is intended.
    for idx in tqdm(range(N*window_size, (len(subset_list)*window_size) - window_size), desc="Calculating features for each window"):
        previous_window = all_intervals.iloc[idx - N*window_size:idx]
        current_window = all_intervals.iloc[idx:idx + window_size]

        # transition proportions of the classified intervals
        current_classifications = classify_rr_ints(current_window)
        previous_classifications = classify_rr_ints(previous_window)
        current_props = find_proportions(current_classifications)
        previous_props = find_proportions(previous_classifications)
        for name, current_prop, previous_prop in zip(transition_names, current_props, previous_props):
            feature_dict[name].append(weighted(current_prop, previous_prop))

        # summary statistics of a single column
        for name, column, statistic in column_statistics:
            feature_dict[name].append(weighted(statistic(current_window[column]), statistic(previous_window[column])))

        # entropy of the classifications and approximate entropy of the raw intervals
        feature_dict['entropy'].append(weighted(shannon_entropy(current_classifications), shannon_entropy(previous_classifications)))
        feature_dict['approx_entropy'].append(weighted(ant.app_entropy(current_window['rrInt']), ant.app_entropy(previous_window['rrInt'])))

    feature_df = pd.DataFrame(data=feature_dict) # make a DataFrame out of the feature dictionary
    # NOTE(review): axis=1 concat aligns on the index, so feature row k ends up beside
    # subset_list row k; confirm that this is the intended pairing.
    return pd.concat([subset_list, feature_df], axis=1) # return the features DataFrame combined with the subset list DataFrame

In [None]:
# Input and output directories sit next to the current working directory
subsetpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-subsets/')
featurespath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-features/')
# exist_ok=True replaces the exists()+mkdir() pair, which could fail if the directory
# appeared between the check and the creation
os.makedirs(featurespath, exist_ok=True)

for record in tqdm(rlist): # calculate the features for all of the subjects
    features = subset_features(record, subsetpath)
    features.to_parquet(featurespath / (record+".parquet")) # and then write them to disk