# Merge HRV Metrics

The purpose of this notebook is to:
1. Read all the HRV Excel sheets from the output of the Neurokit2 pipeline
2. Flag all HRV values (here: HRV_RMSSD) that fall below a lower bound or above an upper bound as non-plausible.
3. Perform outlier detection separately for each subject's condition on the HRV_RMSSD values using z-scores. For example, if a condition such as *baseline start* is a 300s recording with 30s HRV analysis windows, then outliers will be flagged across the 10 segments only relying on information from those 10 segments.
4. Impute outliers with the mean HRV value across the remaining non-outlier segments.
5. Calculate the Coefficient of Variation (CoV) across the HRV values within a subject's condition (also using the imputed values). An upper bound on the CoV is used to flag entire conditions (i.e., all 10 segments) within a subject as outliers.
6. After the outlier flagging / imputation, concatenate all the Excel files into a single table and export it as an Excel sheet that can be used for further analysis.


In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from typing import Union, List
import sys
sys.path.append(str(Path().cwd().parent/'src'))
import utils.clean_impute_hrv as clean_impute_hrv

## Parameters

I/O

In [2]:
# Every project location is resolved relative to the notebook's working directory.
WORKING_DIR = Path.cwd()
ROOT_DIR = WORKING_DIR.parent
REPORTS_DIR = ROOT_DIR.joinpath('reports')
DATA_DIR = ROOT_DIR.joinpath('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')
# Output directory for the concatenated HRV data; created up front so the export cannot fail on a missing folder.
HRV_DATA_DIR = DATA_DIR.joinpath('hrv')
HRV_DATA_DIR.mkdir(parents=True, exist_ok=True)

Cleaning parameters

In [3]:
# [lower, upper] plausibility bounds applied to HRV_RMSSD; values outside this
# window are flagged as non-plausible before any outlier detection
# (units presumably milliseconds — TODO confirm).
plausible_hrv_lower_upper_limit = [3, 120]
# z-score threshold handed to the per-condition outlier detection on HRV_RMSSD.
threshold_z_score = 2.36 # was 1.96
# Upper bound on the coefficient of variation within a subject's condition;
# used to flag an entire condition as an outlier.
coeff_variation_upper_bound = 0.7

## Support functions

## Identify data

In [4]:
# Collect the sub-directories of the processed-data folder that contain at least one entry;
# those are the ones expected to hold the HRV metric workbooks.
all_items = PROCESSED_DATA_DIR.glob("*/")
all_dirs = list(filter(Path.is_dir, all_items))
non_empty_dirs = [
    candidate
    for candidate in all_dirs
    if next(candidate.iterdir(), None) is not None  # has at least one child entry
]

## Load, clean/impute, concatenate, and export the HRV data

In [9]:
# In each directory, look for xlsx files, read and clean them, and concatenate them into a single df.
# The cleaned frames are collected in a list and concatenated once after the loop: calling
# pd.concat inside the loop re-copies all previous rows on every iteration and, when seeded
# with an empty DataFrame, triggers pandas' FutureWarning about empty entries in concat.
n_files = 0  # to keep track of the number of excel files we read and process
cleaned_frames = []  # one cleaned DataFrame per workbook
for directory in non_empty_dirs:
    # Skip names starting with '~$' (presumably Excel's temporary lock files for open workbooks).
    # Sorting makes the row order of the exported file independent of filesystem listing order.
    xlsx_files = sorted(
        file for file in directory.glob('*.xlsx') if not file.name.startswith('~$')
    )
    n_files += len(xlsx_files)
    for file in xlsx_files:
        df = pd.read_excel(file)
        # Step 1: apply the plausibility window to HRV_RMSSD; the result is consumed
        # below under the column name 'HRV_RMSSD_plausible'.
        df_cleaned = clean_impute_hrv.plausible_to_nan(
            df,
            column="HRV_RMSSD",
            lower_bound=plausible_hrv_lower_upper_limit[0],
            upper_bound=plausible_hrv_lower_upper_limit[1],
        )
        # Step 2: z-score based outlier detection on the plausible values, with mean imputation.
        df_cleaned = clean_impute_hrv.identify_clean_outliers(
            df_cleaned,
            threshold_z_score=threshold_z_score,
            hrv_variable_name='HRV_RMSSD_plausible',
            method="mean",
        )
        # Step 3: coefficient-of-variation check on the imputed values.
        df_cleaned = clean_impute_hrv.detect_segment_level_outliers(
            df_cleaned,
            cv_threshold=coeff_variation_upper_bound,
            hrv_variable_name='HRV_RMSSD_plausible_imputed',
        )
        cleaned_frames.append(df_cleaned)

# pd.concat raises on an empty list, so fall back to an empty frame when no workbook was found
# (same result the loop-based concatenation produced in that case).
hrv_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()

# Export
hrv_df.to_excel(HRV_DATA_DIR / 'cleaned_hrv_data.xlsx', index=False)

  mean = valid_data.mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  z_scores = (valid_data - mean) / std
  mean = valid_data.mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  z_scores = (valid_data - mean) / std
  z_scores = (valid_data - mean) / std
  z_scores = (valid_data - mean) / std
  mean = valid_data.mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  mean = 

In [7]:
# Preview the first rows of df_cleaned, i.e. the frame left over from the last file
# processed by the cleaning loop, including the added HRV_RMSSD_plausible,
# *_z_score_outlier and *_imputed columns.
# NOTE(review): this cell's execution count is lower than the loop cell's, so the
# stored output may predate the latest run — verify with Restart Kernel -> Run All.
df_cleaned.head()

Unnamed: 0,HRV_MeanNN,HRV_SDNN,HRV_SDANN1,HRV_SDNNI1,HRV_SDANN2,HRV_SDNNI2,HRV_SDANN5,HRV_SDNNI5,HRV_RMSSD,HRV_SDSD,...,analysis_window,heart_rate_bpm,segment_name,subject_type,condition,wave,subject_id,HRV_RMSSD_plausible,HRV_RMSSD_plausible_z_score_outlier,HRV_RMSSD_plausible_imputed
0,698.47619,62.681245,,,,,,,71.331586,72.20631,...,0,86.0,baseline resting start,child,B,W3,9,71.331586,False,71.331586
1,667.318182,44.727481,,,,,,,57.906099,58.590192,...,1,90.0,baseline resting start,child,B,W3,9,57.906099,False,57.906099
2,657.688889,48.85406,,,,,,,55.26383,55.8944,...,2,92.0,baseline resting start,child,B,W3,9,55.26383,False,55.26383
3,693.857143,53.6136,,,,,,,66.265541,67.087856,...,3,86.0,baseline resting start,child,B,W3,9,66.265541,False,66.265541
4,646.666667,36.814029,,,,,,,40.320308,40.758221,...,4,92.0,baseline resting start,child,B,W3,9,40.320308,False,40.320308
