# ECG Analysis

### Purpose
Purpose of this notebook is to preprocess and analyse the Mindware ECG data

### Approach
1. HRV data will be calculated per 30s analysis-window per segment (baseline, story 1, story 2,...)
2. RSA values will be calculate once per segment because longer segments are needed to calculate PB-RSA.
3. Aggregation for HRV values happens at the analysis-window to arrive at average 30s HRV per segment
4. No aggregation for RSA will be performed since RSA already is at the segment level

#### Notes
- Outlier detection will be performed for HRV values at the level of analysis-windows usign z-score approach. Moreover, only analysis windows with > 20 peaks will be used for aggregation and outlier detection 

### Input / Output

- Input `~/data/interim/signals`
- Processed data output (including metrics): `~/data/processed/ecg`
- QA figures Output: `~reports/QA/ecg`
- Aggregated metrics `~data/final/ecg`

## TO-DO's

### Imports

In [1]:
# fmt: off
from pathlib import Path
import sys
from typing import Union, List, Dict, Tuple
import warnings
import pandas as pd 
import importlib
import traceback
sys.path.append(str(Path().cwd().parent/"src"))
sys.path.append(str(Path().cwd().parent/"app"))
import ecg_utils.data_utils as data_utils
import ecg_utils.parameters as parameters
import ecg_utils.clean_impute as clean_impute
import ecg_utils.nk_pipeline as nk_pipeline
import ecg_utils.common as common
import app.ecg_high_level_fnc as app
import numpy as np
importlib.reload(data_utils)
importlib.reload(parameters)
importlib.reload(common)
importlib.reload(nk_pipeline)
importlib.reload(clean_impute)
importlib.reload(app)
# fmt:on

<module 'app.ecg_high_level_fnc' from '/Users/lukasspiess/Library/CloudStorage/OneDrive-SpiessSolution/Neurophysiological profiles/General/Mindware data analysis/src/app/ecg_high_level_fnc.py'>

## Parameters

In [2]:
WORKING_DIR = Path().cwd()
ROOT_DIR = WORKING_DIR.parent

DATA_DIR = ROOT_DIR / 'data'
INTERIM_SIGNAL_DATA_DIR = DATA_DIR / 'interim' / 'signals'

PROCESSED_ECG_DATA_DIR = DATA_DIR / 'processed' / 'ecg'
PROCESSED_ECG_DATA_DIR.mkdir(exist_ok=True, parents=True)

FINAL_ECG_METRICS_DIR = DATA_DIR / 'final' / 'ecg_metrics'
FINAL_ECG_METRICS_DIR.mkdir(exist_ok=True, parents=True)

REPORTS_DIR = ROOT_DIR / 'reports'
QA_REPORTS_DIR = REPORTS_DIR / 'QA' / 'ecg'
QA_REPORTS_DIR.mkdir(exist_ok=True, parents=True)

## Support functions


In [3]:
def get_folders_in_directory(directory_path: Union[str,Path]) -> list[Path]:
    """
    Get all folders in the specified directory.

    Args:
        directory_path (str): The path of the directory to scan for folders.

    Returns:
        list[Path]: A list of Path objects representing folders in the directory.
    """
    path = Path(directory_path)
    return [item for item in path.iterdir() if item.is_dir()]


# Main

In [4]:
# data files
data_files_generator = INTERIM_SIGNAL_DATA_DIR.glob('*.csv')
print(f"Identified {data_files_generator.__sizeof__()} data files")

Identified 88 data files


# Analyse ECG

**For each subject:**
1. Read data
2. preprocess
3. segment
4. calculate windowed HRV + generate peak QA plots + output hrv metrics + output cleaned signal data. 
5. calculate and export RSA metrics per segment

In [None]:
ecg_pipeline_params = parameters.base_params
not_successful = list()
for index, signal_filepath in enumerate(data_files_generator):
    
    try:
        # prepare data
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            signal_df = pd.read_csv(signal_filepath)
        subject_id = str(signal_df['subject_id'].unique()[0])
        # if subject_id != "2094":
        #     continue
        ecg_series = signal_df['MWMOBILEJ_Bio']
        print(f"Processing subject {subject_id}")

        subject_figure_output_dir = QA_REPORTS_DIR / subject_id 
        subject_figure_output_dir.mkdir(exist_ok=True, parents=True)
        subject_data_output_dir = PROCESSED_ECG_DATA_DIR / subject_id
        subject_data_output_dir.mkdir(exist_ok=True, parents=True)
        
        # preprocess and join with event data
        tmp_signals_preproc_df = nk_pipeline.ecg_preprocess(ecg_series, ecg_pipeline_params)
        preproc_df = signal_df.merge(tmp_signals_preproc_df, left_index=True, right_index=True)
        
        # segment
        segments_df_list = data_utils.segment_df(preproc_df, ecg_pipeline_params)
        data_utils.check_segment_list(segments_df_list)
        [df.set_index('time_seconds_original_file', inplace=True) for df in segments_df_list] # to have time in ms as index
        
        # calculate and export HRV 
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hrv_df, ecg_df = app.compute_windowed_hrv_across_segments(
                segments_df_list=segments_df_list,
                parameters=ecg_pipeline_params,
                figure_output_dir=subject_figure_output_dir,
                data_output_dir=subject_data_output_dir,
                subject_id = subject_id,
                create_qa_plots=True
            )
        

        # Calculate and export RSA metrics
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            rsa_agg_df = nk_pipeline.calculate_rsa_per_segment(segments_df_list, 
                                                            ecg_pipeline_params, 
                                                            subject_id=subject_id, 
                                                            data_output_dir=subject_data_output_dir)
    except Exception as e:
        print(f"Error processing {signal_filepath}: {e}")
        traceback.print_exc()
        not_successful.append(signal_filepath)
        break
    
    

print(f"Processing done. Not successful for {len(not_successful)} files: {not_successful}")

# Read the HRV and RSA data, aggregate, and export as group level file

1. Read HRV an RSA data of a subject
2. Perform outlier detection/correction on HRV an RSA values
3. Aggregate HRV data (to have a single value per segment)
4. Concatenate the data of each subject into a single file and export for statistical analysis

In [9]:
processed_subject_folders = get_folders_in_directory(PROCESSED_ECG_DATA_DIR)
group_level_metrics_df = pd.DataFrame()

# Loop through each subject
for subject_folder in processed_subject_folders:
    subject_id = subject_folder.parts[-1]
    # print(f"Processing subject {subject_id}")
    
    # Read data
    hrv_df = pd.read_excel(subject_folder / 'hrv_metrics.xlsx').drop(columns='Unnamed: 0')
    rsa_df = pd.read_excel(subject_folder / 'rsa_metrics.xlsx').drop(columns='Unnamed: 0')
    
    # Merge the two metrics
    metrics_df = hrv_df.merge(rsa_df, on='segment_name', how = 'left', suffixes=('_hrv', '_rsa'))
    metrics_df = clean_impute.flag_windows_insufficient_n_peaks(metrics_df, min_peaks_required=20)
    
    # HRV outlier detection
    metrics_df = clean_impute.flag_outliers_based_on_zscore(metrics_df, 'HRV_SDNN', z_threshold=2.56)
    metrics_df = clean_impute.flag_usable_aggregation_windows(metrics_df)
    
    # provide information about the number of usable analysis windows for each segment and add that as column to the dataframe
    usable_windows_per_segment = metrics_df.groupby('segment_name')['usable_window'].sum()
    metrics_df = metrics_df.assign(usable_analysis_windows_in_segment = metrics_df['segment_name'].map(usable_windows_per_segment))
    
    # Aggregate HRV values 
    single_subject_metrics_agg_df = metrics_df.groupby('segment_name').mean()
    
    # clean up redundant columns etc.
    single_subject_metrics_agg_df = single_subject_metrics_agg_df.rename(columns = {
        "subject_id_hrv": "subject_id",
        "start_time_hrv": "start_time",
        "end_time_hrv": "end_time",
    })
    single_subject_metrics_agg_df = single_subject_metrics_agg_df.drop(columns = 
                                                                       ["subject_id_rsa", "start_time_rsa", "end_time_rsa",
                                                                        "usable_window", "window_has_enough_peaks", "HRV_SDNN_outlier",
                                                                        "start_time", "end_time", "n_peaks_detected",
                                                                        "analysis_window", "MWMOBILEJ_GSC"
                                                                        ], errors="ignore")
    
    # concatenate with the data from other subjects
    group_level_metrics_df = pd.concat([group_level_metrics_df, single_subject_metrics_agg_df])
    
    

#### Perform baseline correction

In [10]:
# Prepare
group_level_metrics_df = group_level_metrics_df.reset_index()
baseline_df = group_level_metrics_df[group_level_metrics_df["segment_name"] == "Baseline"].set_index("subject_id")

# Merge baseline values with the original DataFrame
df = group_level_metrics_df.merge(
    baseline_df[["RSA_PorgesBohrer", "HRV_SDNN", "heart_rate_bpm"]],
    on="subject_id",
    suffixes=("", "_baseline")
)


In [11]:

# Apply baseline correction
df["RSA_PorgesBohrer_corrected"] = df["RSA_PorgesBohrer"] - df["RSA_PorgesBohrer_baseline"]
df["HRV_SDNN_corrected"] = df["HRV_SDNN"] - df["HRV_SDNN_baseline"]
df["heart_rate_bpm_corrected"] = df["heart_rate_bpm"] - df["heart_rate_bpm_baseline"]

# Save the aggregated, baseline-corrected metrics
baseline_corrected_metrics_df = df[df["segment_name"] != "Baseline"]
baseline_corrected_metrics_df.to_excel(FINAL_ECG_METRICS_DIR / 'group_level_blc_ecg_metrics.xlsx')
