In [1]:
import numpy as np
import pandas as pd
import antropy as ant
from tqdm.auto import tqdm
import os
from pathlib import Path, PurePath
import csv
from scipy import stats
from joblib import Parallel, delayed, dump, load

In [2]:
# Load the subject IDs: the first space-delimited field of every row of
# subject_list.csv, which lives in the 'mit-bih-extracted' directory next to
# the current working directory.
extractedpath = PurePath(Path(os.getcwd()).parents[0], 'mit-bih-extracted/')
records = extractedpath / 'subject_list.csv'
# newline='' leaves newline handling to the csv module, as its documentation asks.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # csv.reader returns an empty list for a blank line; skip those rather
    # than failing on row[0].
    rlist = [row[0] for row in recordreader if row]

In [3]:
def parallel_extract(data, idx, N, window_size, current_weight, prev_weight):
    """Blend the features of the window starting at ``idx`` with those of the rows before it.

    The current window is ``data[idx:idx + window_size]``; the previous window is
    the ``N*window_size`` rows that end just before ``idx``. Columns 0 and 1 of
    each window are handed to ``extract_all`` as its first and second argument,
    and the two resulting feature vectors are combined as a weighted sum.

    Returns ``current_features*current_weight + previous_features*prev_weight``.
    """
    lookback_start = idx - N*window_size
    history = data[lookback_start:idx]
    recent = data[idx:idx + window_size]

    recent_features = extract_all(recent[:, 0], recent[:, 1])
    history_features = extract_all(history[:, 0], history[:, 1])
    return recent_features*current_weight + history_features*prev_weight

def extract_all(rrInts, rmeans):
    """Build the 19-element feature vector for one window.

    Each interval is first classified against its paired ``rmeans`` value:
    0 when below 0.85x, 2 when above 1.15x, otherwise 1 (the feature names call
    these S, R and L -- presumably short / regular / long).

    The returned array holds, in order:
      * the nine class-transition rates between consecutive intervals
        (StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR, LtoL), each divided
        by the number of consecutive pairs;
      * std, coefficient of variation, range and variance of ``rrInts``, the
        variance of ``rmeans``, RMSSD, median absolute deviation and IQR of
        ``rrInts``;
      * the Shannon entropy of the class labels and the approximate entropy
        (order 2) of ``rrInts``.
    """
    too_short = rrInts < (0.85*rmeans)
    too_long = rrInts > (1.15*rmeans)
    int_types = np.select([too_short, too_long], [0, 2], default=1)

    # Pair every classification with the one that follows it.
    leading, trailing = int_types[:-1], int_types[1:]
    n_pairs = len(int_types) - 1
    # Row-major over (from, to) so the order is StoS, StoR, StoL, RtoS, ... LtoL.
    transition_rates = [
        np.count_nonzero((leading == src) & (trailing == dst)) / n_pairs
        for src in (0, 1, 2)
        for dst in (0, 1, 2)
    ]

    stdev = np.std(rrInts)
    spread_stats = [
        stdev,
        stdev / np.mean(rrInts),          # coefficient of variation
        np.max(rrInts) - np.min(rrInts),  # range
        rrInts.var(),
        rmeans.var(),
        extract_rmssd(rrInts),
        stats.median_abs_deviation(rrInts),
        stats.iqr(rrInts),
    ]

    entropy_stats = [
        shannon_entropy(int_types),
        ant.app_entropy(rrInts, order=2),
    ]

    return np.array(transition_rates + spread_stats + entropy_stats)

def extract_rmssd(rrInts):
    """Return the RMSSD of ``rrInts``: the root of the mean squared successive difference."""
    successive = np.diff(rrInts)
    mean_square = np.sum(np.square(successive)) / len(successive)
    return np.sqrt(mean_square)

def shannon_entropy(subset):
    """Return the Shannon entropy, in bits, of the values in ``subset``.

    Each distinct value's probability is its count divided by ``len(subset)``.
    """
    # Only the per-value counts matter; the distinct values themselves are not used.
    _, counts = np.unique(subset, return_counts=True)
    p = counts / len(subset)
    return -np.sum(p*np.log2(p))

In [4]:
def subset_features(all_memmap, N = 50, current_weight = 0.25, prev_weight = 0.75, window_size = 4, n_jobs = 12):
    """Compute one weighted feature row per sliding window of ``all_memmap``.

    Parameters:
        all_memmap: 2-D array of interval data; rows are sliced into windows and
            passed to ``parallel_extract``.
        N: the previous window spans ``N*window_size`` rows before each start.
        current_weight, prev_weight: weights forwarded to ``parallel_extract``.
        window_size: number of rows in the current window (default 4, the value
            that used to be hard-coded).
        n_jobs: number of joblib workers (default 12, the value that used to be
            hard-coded).

    Returns:
        A DataFrame with one row per window start and the 19 feature columns,
        indexed 0..M-1. Row ``i`` describes the window starting at row
        ``N*window_size + i`` of ``all_memmap``. When the input is too short to
        hold a single window, an empty DataFrame with the same columns is
        returned instead of failing inside ``np.vstack``.
    """
    columns = ['StoS', 'StoR', 'StoL', 'RtoS', 'RtoR', 'RtoL', 'LtoS', 'LtoR', 'LtoL', 'std', 'cov', 'range', 'rrInt_var', 'rmean_var', 'rmssd', 'mad', 'iqr','entropy','approx_entropy']

    # Every start needs N*window_size rows of history before it.
    # NOTE(review): the range end excludes len(all_memmap) - window_size, the last
    # start at which a full window would still fit; kept as-is so row counts do
    # not change -- confirm whether that is intended.
    window_starts = range(N*window_size, len(all_memmap) - window_size)
    if len(window_starts) == 0:
        # Nothing to compute: np.vstack would raise on an empty list.
        return pd.DataFrame(np.empty((0, len(columns))), columns=columns)

    rows = list(
        tqdm(Parallel(n_jobs=n_jobs, return_as="generator", max_nbytes=None, batch_size="auto")(
            delayed(parallel_extract)(all_memmap, idx, N, window_size, current_weight, prev_weight)
            for idx in window_starts),
            desc="Calculating features for each window",
            # Derived from the same range that is iterated, so the two cannot drift apart.
            total=len(window_starts))
    )

    feature_arr = np.vstack(rows)
    return pd.DataFrame(data=feature_arr, columns=columns) # one row per window, one column per feature

In [5]:
# Input directory (per-subject interval tables) and output directory (feature
# tables, one sub-directory per N); both sit next to the current working directory.
subsetpath = Path(os.getcwd()).parents[0] / 'mit-bih-time-subsets-stepping'
featurespath = Path(os.getcwd()).parents[0] / 'mit-bih-time-features-stepping'
featurespath.mkdir(exist_ok=True)

recent_steps = [50, 100, 200, 400]

# Rows per current window. Must equal the window size subset_features uses (4);
# the row-count check inside the loop fails loudly if the two ever diverge.
WINDOW_SIZE = 4
label_columns = ["rhythmLabel", "mappedLabel"]

# Scratch directory for the memory-mapped copy of each subject's interval data.
filename = Path("joblib_memmap")
filename.mkdir(exist_ok=True)

for record in tqdm(rlist): # calculate the features for all of the subjects
    # NOTE(review): subset_list is not used anywhere in this cell; kept in case a
    # later cell reads it -- drop this read if nothing does.
    subset_list = pd.read_parquet(subsetpath / (record+'_subset_list.parquet')) # read the subset list for that subject

    all_intervals = pd.read_parquet(subsetpath / (record+'-all.parquet'))

    # Dump the non-label columns once and reopen them memory-mapped read-only
    # before handing them to subset_features.
    dump(all_intervals.drop(label_columns, axis=1).to_numpy(), filename / "all_intervals.memmap")
    all_memmap = load(filename / "all_intervals.memmap", mmap_mode="r")

    for recent in tqdm(recent_steps, desc="Calculating features for each recent value"): # for each value of N
        features = subset_features(all_memmap, N=recent)

        # Feature row i describes the window that starts at row first_start + i of
        # all_intervals. Assigning the label Series directly would align on index
        # and give row i the label of interval row i, i.e. first_start rows too
        # early. Slice the labels positionally instead.
        # NOTE(review): this labels each window by its first row -- confirm that
        # is the intended labelling convention.
        first_start = recent*WINDOW_SIZE
        expected_rows = max(len(all_intervals) - WINDOW_SIZE - first_start, 0)
        assert len(features) == expected_rows, (
            f"expected {expected_rows} feature rows for N={recent}, got {len(features)}; "
            "WINDOW_SIZE is out of sync with subset_features"
        )
        labels = (
            all_intervals[label_columns]
            .iloc[first_start:first_start + len(features)]
            .reset_index(drop=True) # line the labels up with the 0-based feature index
        )

        features = features.assign(
            subjectID=record,
            rhythmLabel=labels["rhythmLabel"],
            mappedLabel=labels["mappedLabel"]
        )
        recentpath = featurespath / str(recent)
        recentpath.mkdir(exist_ok=True)
        features.to_parquet(recentpath / (record+".parquet")) # and then write them to disk

  0%|          | 0/23 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43796 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43596 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43196 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42396 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/61708 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/61508 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/61108 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/60308 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39712 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39512 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39112 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/38312 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42652 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42452 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42052 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/41252 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/47668 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/47468 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/47068 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/46268 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/61552 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/61352 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/60952 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/60152 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/53436 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/53236 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/52836 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/52036 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/36576 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/36376 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/35976 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/35176 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/49672 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/49472 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/49072 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/48272 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/45324 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/45124 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/44724 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43924 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54940 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54740 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54340 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/53540 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/34628 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/34428 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/34028 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/33228 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54984 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54784 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54384 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/53584 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39076 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/38876 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/38476 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/37676 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/60056 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/59856 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/59456 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58656 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/56280 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/56080 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/55680 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/54880 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/36284 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/36084 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/35684 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/34884 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43144 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42944 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/42544 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/41744 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/59084 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58884 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58484 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/57684 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/45308 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/45108 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/44708 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/43908 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58648 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58448 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58048 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/57248 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39640 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39440 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/39040 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/38240 [00:00<?, ?it/s]

Calculating features for each recent value:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/59344 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/59144 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/58744 [00:00<?, ?it/s]

Calculating features for each window:   0%|          | 0/57944 [00:00<?, ?it/s]