In [11]:
import numpy as np
import pandas as pd
import cudf as cd
from tqdm.auto import tqdm
import os
import csv

In [12]:
# Read the record identifiers: the first space-delimited field of every row
# in the subject list.
records = os.path.normpath('mit-bih-dataframes/subject_list.csv')
# newline='' leaves newline handling to the csv module, as its documentation
# recommends for files passed to csv.reader.
with open(records, newline='') as rfile:
    recordreader = csv.reader(rfile, delimiter=' ', quotechar='|')
    # A blank line is parsed as an empty row; skip it instead of failing on
    # row[0] with an IndexError.
    rlist = [row[0] for row in recordreader if row]

In [13]:
# Load every record's RR-interval parquet file into a cuDF DataFrame,
# keyed by record name.
rr_int_dfs = {
    record: cd.read_parquet(
        os.path.normpath('mit-bih-rr-intervals/'+record+'.parquet')
    )
    for record in tqdm(rlist)
}

  0%|          | 0/23 [00:00<?, ?it/s]

In [14]:
def find_proportions(groups, window_size):
    """Compute, per window, the proportion of each interval-type transition.

    Every interval is labelled by comparing ``rr_int`` with ``rmean``
    (presumably a running mean of the RR intervals -- confirm against the code
    that builds that column):

    * ``'short'``   -- ``rr_int <= 0.85 * rmean``
    * ``'regular'`` -- ``0.85 * rmean < rr_int < 1.15 * rmean``
    * ``'long'``    -- ``rr_int >= 1.15 * rmean``

    Rows for which none of the comparisons holds (e.g. a NaN in either column)
    get the label ``'undefined'`` and therefore never count as a transition.

    For each pair of consecutive positions inside a window the transition
    (e.g. short -> regular, column ``'StoR'``) is counted, and the counts are
    divided by the number of consecutive pairs, ``window_size - 1``.

    Parameters
    ----------
    groups : groupby object
        Grouped frame with ``rr_int`` and ``rmean`` columns. It must support
        ``groups[['rr_int', 'rmean']].nth(i).to_pandas()``.
        NOTE(review): every ``nth(i)`` for ``i < window_size`` is assumed to
        return the same number of rows in the same group order, i.e. each
        group holds at least ``window_size`` rows -- confirm for new callers.
    window_size : int
        Number of leading rows of each group to examine. Should be at least 2;
        with a single position there are no transitions to count.

    Returns
    -------
    cudf.DataFrame
        One row per row returned by ``nth(0)``, on a default integer index,
        with the columns ``StoS, StoR, StoL, RtoS, RtoR, RtoL, LtoS, LtoR,
        LtoL`` holding the transition proportions.
    """
    labels = ['short', 'regular', 'long']
    initials = {'short': 'S', 'regular': 'R', 'long': 'L'}

    # One array of labels per within-window position. Plain NumPy arrays are
    # kept (rather than index-aligned Series) so that consecutive positions
    # are compared row-by-row regardless of the index `nth` returns.
    int_types = []
    for position in range(window_size):
        beats = groups[['rr_int', 'rmean']].nth(position).to_pandas()
        lower = 0.85 * beats['rmean']
        upper = 1.15 * beats['rmean']
        conditions = [
            beats['rr_int'] <= lower,
            (beats['rr_int'] > lower) & (beats['rr_int'] < upper),
            beats['rr_int'] >= upper,
        ]
        # An explicit string default keeps the result dtype consistent; the
        # implicit integer default of np.select does not mix with a string
        # choicelist on recent NumPy releases.
        int_types.append(np.select(conditions, labels, default='undefined'))

    # Column order matches the source-major order: S*, R*, L*.
    n_windows = len(int_types[0])
    counts = {}
    for src in labels:
        for dst in labels:
            counts[initials[src] + 'to' + initials[dst]] = np.zeros(n_windows)

    # Tally the transition between each pair of consecutive positions.
    for current, following in zip(int_types, int_types[1:]):
        for src in labels:
            leaves_src = current == src
            for dst in labels:
                column = initials[src] + 'to' + initials[dst]
                counts[column] += leaves_src & (following == dst)

    n_transitions = len(int_types) - 1

    return cd.from_pandas(pd.DataFrame(counts) / n_transitions)

In [15]:
def feature_calc(rr_int_df, window_size=4):
    """Build one row of features per non-overlapping window of RR intervals.

    Trailing rows that do not fill a complete window are ignored. The input
    frame is left unmodified, so the same frame can be passed again (for
    instance with a different ``window_size``) without having lost rows.

    Parameters
    ----------
    rr_int_df : cudf.DataFrame
        Must provide the columns ``rhythm``, ``rr_int``, ``rmean``,
        ``sqr_diff`` and ``diff``.
        NOTE(review): windows are formed from ``index // window_size`` and the
        transition proportions are joined on a default integer index, so the
        frame is assumed to carry a 0-based consecutive integer index --
        confirm against the code that writes the parquet files.
    window_size : int, default 4
        Number of consecutive rows per window.

    Returns
    -------
    cudf.DataFrame
        One row per window with the columns ``rhythm`` (taken from the first
        row of the window), ``std``, ``cov`` (std divided by mean), ``range``,
        ``rrInt_var``, ``rmean_var``, ``rmssd``, ``mad``, ``iqr`` and the nine
        transition proportions produced by ``find_proportions``.
    """
    # Keep only the rows that fill whole windows. Slicing by position avoids
    # mutating the caller's frame and does not depend on index labels.
    usable_rows = len(rr_int_df) - len(rr_int_df) % window_size
    windowed = rr_int_df.iloc[:usable_rows]
    subsets = windowed.groupby(windowed.index // window_size, sort=True)

    feature_df = cd.DataFrame(data={'rhythm': subsets['rhythm'].nth(0)})

    feature_df['std'] = subsets['rr_int'].agg('std')
    feature_df['cov'] = feature_df['std'] / subsets['rr_int'].agg('mean')
    feature_df['range'] = subsets['rr_int'].agg('max') - subsets['rr_int'].agg('min')
    feature_df['rrInt_var'] = subsets['rr_int'].agg('var')
    feature_df['rmean_var'] = subsets['rmean'].agg('var')
    # Square root of the mean of `sqr_diff` within each window.
    feature_df['rmssd'] = np.sqrt(subsets['sqr_diff'].agg('sum') / subsets['sqr_diff'].agg('count'))
    # Median of the `diff` column; presumably `diff` already holds absolute
    # differences -- TODO confirm where the column is built.
    feature_df['mad'] = subsets['diff'].agg('median')
    feature_df['iqr'] = subsets['rr_int'].quantile(0.75) - subsets['rr_int'].quantile(0.25)

    # Append the transition-proportion columns side by side.
    feature_df = cd.concat([feature_df, find_proportions(subsets, window_size)], axis='columns')

    return feature_df

In [16]:
# Compute the windowed feature table for every loaded record, using the
# default window size of feature_calc.
feature_dfs = {
    record: feature_calc(rr_int_dfs[record])
    for record in tqdm(rlist)
}

  0%|          | 0/23 [00:00<?, ?it/s]

In [21]:
# Spot check: show the first rows of the feature table for record '04015'.
print(feature_dfs['04015'].head())

  rhythm        std       cov  range    rrInt_var   rmean_var      rmssd  \
0      N  40.124805  0.240268   87.0  1610.000000  148.049377  53.726158   
1      N  46.050697  0.270886   95.0  2120.666667   76.937511  65.779936   
2      N  13.375973  0.061854   32.0   178.916667   53.581830  44.606614   
3      N  40.713429  0.237743   72.0  1657.583333  183.300318  34.608525   
4      N  16.214705  0.138884   37.0   262.916667  145.014391  24.545875   

    mad    iqr      StoS      StoR  StoL      RtoS      RtoR      RtoL  LtoS  \
0  43.5  31.50  0.000000  0.000000   0.0  0.000000  0.333333  0.333333   0.0   
1  46.0  24.50  0.000000  0.000000   0.0  0.000000  0.333333  0.333333   0.0   
2  18.5  11.75  0.000000  0.000000   0.0  0.000000  0.666667  0.000000   0.0   
3   3.5  69.75  0.333333  0.000000   0.0  0.333333  0.333333  0.000000   0.0   
4  19.5  15.25  0.333333  0.333333   0.0  0.000000  0.333333  0.000000   0.0   

       LtoR  LtoL  
0  0.333333   0.0  
1  0.333333   0.0  
2 