In [1]:
import numpy as np
import pandas as pd
import antropy as ant
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
from joblib import Parallel, delayed, dump, load

In [2]:
# Collect the subject IDs (first field of every row) from subject_list.csv,
# which lives in 'mit-bih-extracted/' next to the current working directory.
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'
# newline='' hands line-ending handling to the csv module, as its documentation requires.
with open(records, newline='') as rfile: # reads in all of the subject IDs
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # A blank line parses to an empty row; skip it rather than fail on row[0].
    rlist = [row[0] for row in recordreader if row]

In [3]:
def parallel_extract(data, idx, N, window_size, current_weight, prev_weight):
    """Return the weighted feature vector for the window starting at row ``idx``.

    The "previous" window is the ``N*window_size`` rows immediately before
    ``idx``; the "current" window is the ``window_size`` rows starting at
    ``idx``. Columns 1 and 2 of ``data`` are passed to ``extract_all`` as its
    ``rrInts`` and ``rmeans`` arguments.

    Parameters:
        data: 2-D array-like supporting row slicing and ``[:, k]`` indexing.
        idx: row offset where the current window starts.
        N: number of ``window_size``-sized windows forming the previous window.
        window_size: number of rows in the current window.
        current_weight: multiplier applied to the current window's features.
        prev_weight: multiplier applied to the previous window's features.

    Returns:
        The sum of both weighted feature vectors when the current window is
        complete; otherwise only the weighted previous-window features.
    """
    history = data[idx - N*window_size:idx]
    upcoming = data[idx:idx + window_size]

    # A truncated current window (slice ran off the end of data) contributes nothing.
    if len(upcoming) != window_size:
        return extract_all(history[:, 1], history[:, 2])*prev_weight

    current_part = extract_all(upcoming[:, 1], upcoming[:, 2])*current_weight
    previous_part = extract_all(history[:, 1], history[:, 2])*prev_weight
    return current_part + previous_part

def extract_all(rrInts, rmeans):
    """Build the 19-element feature vector for one window.

    Parameters:
        rrInts: numpy array of interval values.
        rmeans: numpy array compared element-wise against ``rrInts``
            (NOTE(review): presumably a running mean of the intervals - confirm).

    Returns:
        numpy array laid out as: the nine transition fractions (StoS, StoR,
        StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL), then std, coefficient of
        variation, range, variance of ``rrInts``, variance of ``rmeans``,
        RMSSD, median absolute deviation, IQR, Shannon entropy of the
        interval labels, and approximate entropy of ``rrInts``.
    """
    # Label each interval: 0 = below 85% of rmeans, 2 = above 115%, 1 = anything else.
    int_types = np.select(
        [rrInts < (0.85*rmeans), rrInts > (1.15*rmeans)],
        [0, 2],
        default=1,
    )

    # Pair every label with its successor and measure how often each of the
    # nine (from, to) combinations occurs, as a fraction of all pairs.
    first, second = int_types[:-1], int_types[1:]
    count = len(int_types)-1
    transitions = [
        np.count_nonzero(np.logical_and(first == src, second == dst)) / count
        for src in (0, 1, 2)
        for dst in (0, 1, 2)
    ]

    # Spread / variability statistics.
    stdev = np.std(rrInts)
    dispersion = [
        stdev,
        stdev / np.mean(rrInts),
        np.max(rrInts) - np.min(rrInts),
        rrInts.var(),
        rmeans.var(),
        extract_rmssd(rrInts),
        stats.median_abs_deviation(rrInts),
        stats.iqr(rrInts),
    ]

    # Entropy of the label sequence and approximate entropy of the raw intervals.
    entropies = [shannon_entropy(int_types), ant.app_entropy(rrInts)]

    return np.array(transitions + dispersion + entropies)

def extract_rmssd(rrInts):
    """Return the root mean square of successive differences (RMSSD) of ``rrInts``."""
    successive_diffs = np.diff(rrInts)
    n_diffs = len(successive_diffs)
    mean_square = np.sum(np.square(successive_diffs)) / n_diffs
    return np.sqrt(mean_square)

def shannon_entropy(subset):
    """Return the Shannon entropy, in bits, of the distribution of values in ``subset``."""
    # Only the occurrence counts matter; the distinct values themselves are discarded.
    _, counts = np.unique(subset, return_counts=True)

    # Relative frequency of each distinct value.
    p = counts / len(subset)

    return -np.sum(p*np.log2(p))

In [4]:
def subset_features(record, subsetpath, N = 50, current_weight = 0.4, prev_weight = 0.6, window_size = 4, n_jobs = 6):
    """Compute the feature table for one record.

    Reads ``<record>_subset_list.parquet`` and the per-window files
    ``<record>/<record>-<i>.parquet`` from ``subsetpath``, stacks the window
    files into one array, and evaluates ``parallel_extract`` at every row
    offset in ``range(N*window_size, len(subset_list)*window_size - window_size)``
    using a joblib worker pool.

    Side effect: the stacked array is dumped to
    ``joblib_memmap/all_intervals.memmap`` (relative to the current working
    directory) and re-opened memory-mapped; the file is overwritten on every call.

    Parameters:
        record: record identifier used to build the input file names.
        subsetpath: path object (supports ``/``) of the directory holding the inputs.
        N: number of ``window_size``-sized windows in the "previous" window.
        current_weight: weight given to the current window's features.
        prev_weight: weight given to the previous window's features.
        window_size: number of rows in the current window (previously hard-coded to 4).
        n_jobs: number of joblib workers (previously hard-coded to 6).

    Returns:
        DataFrame: ``subset_list`` concatenated column-wise with the feature columns.
    """
    subset_list = pd.read_parquet(subsetpath / (record+'_subset_list.parquet')) # read the subset list for that subject

    # NOTE(review): files 1 .. len(subset_list)-1 are read - confirm the file
    # numbering is 1-based and that no trailing window is skipped unintentionally.
    all_intervals = pd.concat(
        [pd.read_parquet(subsetpath / record / (record+"-"+str(idx)+".parquet"))
         for idx in tqdm(range(1, len(subset_list)), desc="Reading all windows")],
        ignore_index=True)

    # Hand the data to the workers through a memory-mapped file.
    memmap_dir = Path("joblib_memmap")
    memmap_dir.mkdir(exist_ok=True) # no check-then-create race; safe on re-runs
    memmap_file = memmap_dir / "all_intervals.memmap"
    dump(all_intervals.to_numpy(), memmap_file)
    all_memmap = load(memmap_file, mmap_mode="r")

    # Computed once so the loop bounds and the progress-bar total cannot drift apart.
    first_idx = N*window_size
    stop_idx = (len(subset_list)*window_size) - window_size

    rows = list(
        tqdm(Parallel(n_jobs=n_jobs, return_as="generator", max_nbytes=None, batch_size=100)(
            delayed(parallel_extract)(all_memmap, idx, N, window_size, current_weight, prev_weight)
            for idx in range(first_idx, stop_idx)),
            desc="Calculating features for each window",
            total=stop_idx - first_idx)
    )

    feature_arr = np.vstack(rows)
    # Column order must match the order of the array returned by extract_all.
    columns = ['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'std', 'cov', 'range', 'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr','entropy','approx_entropy']
    feature_df = pd.DataFrame(data=feature_arr, columns=columns) # make a DataFrame out of the feature array
    # NOTE(review): feature_df has one row per idx in range(first_idx, stop_idx)
    # while subset_list has len(subset_list) rows; this concat pairs them purely
    # by index label - confirm that pairing is the intended alignment.
    return pd.concat([subset_list, feature_df], axis=1) # return the features DataFrame combined with the subset list DataFrame

In [5]:
# Input windows are read from, and per-subject feature tables are written to,
# directories located next to the current working directory.
subsetpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-subsets/')
featurespath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-time-features/')
# exist_ok replaces the separate exists() check, removing the check-then-create
# race and keeping the cell safe to re-run.
os.makedirs(featurespath, exist_ok=True)

for record in tqdm(rlist): # calculate the features for all of the subjects
    features = subset_features(record, subsetpath)
    features.to_parquet(featurespath / (record+".parquet")) # and then write them to disk

  0%|          | 0/23 [00:00<?, ?it/s]

Reading all windows:   0%|          | 0/7315 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/29060 [00:00<?, ?it/s]

Reading all windows:   0%|          | 0/10301 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/41004 [00:00<?, ?it/s]

Reading all windows:   0%|          | 0/6630 [00:00<?, ?it/s]