# 003 EDA Analysis

### Purpose

The purpose of this notebook is to analyse the Mindware EDA data.

### Approach
- Preprocess the EDA signal data
- Segment the EDA signal
- Extract features
- Baseline correct

### Input <> Output

- Input `~/data/interim/signals`
- Processed data output (including metrics): `~/data/processed/eda`
- QA figures output: `~/reports/QA/eda`
- Aggregated metrics output: `~/data/final/eda_features`

## Imports

In [1]:
# fmt: off
from pathlib import Path
import sys
from typing import List, Dict, Any, Optional, Union
import pandas as pd 
import importlib
import matplotlib.pyplot as plt
import matplotlib.collections
import numpy as np
import warnings
import neurokit2 as nk
sys.path.append(str(Path().cwd().parent/"src"))
sys.path.append(str(Path().cwd().parent/"app"))
import ecg_utils.data_utils as data_utils
import ecg_utils.parameters as parameters
importlib.reload(data_utils)
importlib.reload(parameters)
# fmt: on

<module 'ecg_utils.parameters' from '/Users/lukasspiess/Library/CloudStorage/OneDrive-SpiessSolution/Neurophysiological profiles/General/Mindware data analysis/src/ecg_utils/parameters.py'>

## Parameters

In [2]:
# All locations are resolved relative to the notebook's working directory,
# whose parent is treated as the project root.
WORKING_DIR = Path.cwd()
ROOT_DIR = WORKING_DIR.parent

# Input location: interim signal files.
DATA_DIR = ROOT_DIR / 'data'
INTERIM_SIGNAL_DATA_DIR = DATA_DIR / 'interim' / 'signals'

# Output locations: per-subject processed data, group-level features, QA figures.
PROCESSED_DATA_DIR = DATA_DIR / 'processed' / 'eda'
FINAL_DATA_DIR = DATA_DIR / 'final' / 'eda_features'
QA_REPORTS_DIR = ROOT_DIR / 'reports' / 'QA' / 'eda'

# Create every output directory up front so later exports cannot fail on a
# missing folder.
for _output_dir in (PROCESSED_DATA_DIR, FINAL_DATA_DIR, QA_REPORTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Sampling rate (Hz) passed to the NeuroKit2 preprocessing and to the feature extraction below.
sampling_rate = 500
# Parameter set from ecg_utils.parameters; handed to data_utils.segment_df when segmenting.
base_parameters = parameters.base_params

## Support Functions

In [4]:
def detect_outliers_zscore(
    df: pd.DataFrame,
    column: str,
    z_threshold: float,
    window_size: int
) -> pd.DataFrame:
    """
    Detects outliers in a specified column using the z-score method
    for non-overlapping windows.

    Rows are assigned to windows by integer-dividing the index by
    ``window_size``, so the index must be numeric. The mean and the sample
    standard deviation are computed per window. The input frame is not
    modified; a copy carrying the new columns is returned with the original
    index preserved.

    Args:
        df (pd.DataFrame): The input data frame (numeric index).
        column (str): The column name for which to compute outliers.
        z_threshold (float): The z-score threshold for outlier detection.
        window_size (int): The size of the non-overlapping windows (> 0).

    Returns:
        pd.DataFrame: A copy of the data frame with two new columns:
                      - 'z_score': The computed z-score for each value
                        (0 for windows with zero standard deviation, NaN
                        for windows whose standard deviation is undefined,
                        e.g. a single-row window).
                      - 'is_outlier': True where the absolute z-score
                        exceeds ``z_threshold``.

    Raises:
        ValueError: If ``column`` is not in ``df`` or ``window_size`` is
            not positive.
    """
    # Ensure the column exists
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the DataFrame.")
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer.")

    # Work on a copy so the caller's frame does not gain helper columns.
    result = df.copy()

    # Window label per row; kept as a plain array instead of a temporary
    # column so the result keeps the caller's columns and flat index.
    window_ids = np.asarray(result.index) // window_size
    windows = result[column].groupby(window_ids)
    window_mean = windows.transform('mean')
    window_std = windows.transform('std')

    # Constant windows (std == 0) would divide by zero; report z = 0 there.
    z_scores = (result[column] - window_mean) / window_std
    result['z_score'] = z_scores.where(window_std != 0, 0.0)
    # Comparisons against NaN are False, so undefined z-scores are never flagged.
    result['is_outlier'] = result['z_score'].abs() > z_threshold
    return result



def plot_thresholded_z_scores(df, threshold, filename, sampling_rate:int=500):
    """
    Draws the signal and its absolute z-scores in two stacked panels and
    writes the figure to disk.

    Parameters:
    df (pandas.DataFrame): Data to plot; must provide the columns
                            'MWMOBILEJ_GSC' and 'z_score'. The index is
                            divided by `sampling_rate` to form the x axis.
    threshold (float): Height of the dashed horizontal line drawn on the z-score panel.
    filename (str): Destination path of the saved figure.
    sampling_rate (int): Divisor used to convert the index into seconds. Defaults to 500.

    Returns:
    None
    """
    # Shared x axis for both panels.
    time_seconds = df.index / sampling_rate
    fig, (signal_ax, zscore_ax) = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))

    # Upper panel: the conductance signal.
    signal_ax.plot(time_seconds, df['MWMOBILEJ_GSC'], color='black')
    signal_ax.set_ylabel('Microsiemens')

    # Lower panel: absolute z-scores with the detection threshold.
    zscore_ax.plot(time_seconds, df['z_score'].abs(), color='black')
    zscore_ax.axhline(y=threshold, color='darkred', linestyle='--', label='Threshold')
    zscore_ax.set_ylabel('z-score')
    zscore_ax.set_xlabel('Time (s)')
    zscore_ax.legend()

    plt.savefig(filename)

In [5]:
def extract_features_from_segments(segment_df_list: List[pd.DataFrame], sampling_rate: int = 500) -> pd.DataFrame:
    """
    Extract features from a list of physiological data segments.

    Each segment is passed to NeuroKit2's interval-related EDA analysis and
    the resulting feature row(s) are complemented with metadata taken from
    the segment (segment name, subject ID) and derived metrics.

    Args:
        segment_df_list (List[pd.DataFrame]):
            A list of dataframes where each dataframe represents a segment of
            physiological data. Each dataframe must include columns `event_name`
            and `subject_id`; the first unique value of each is used to label
            the segment.
        sampling_rate (int, optional):
            Sampling rate of the physiological data in Hz. Defaults to 500.

    Returns:
        pd.DataFrame:
            A dataframe containing the extracted features for all segments,
            enriched with `segment_name`, `subject_id`,
            `segment_length_seconds` (rows / sampling rate) and
            `SCR_Peaks_N_per_seconds` (`SCR_Peaks_N` / segment length).
            An empty dataframe is returned when `segment_df_list` is empty.

    Raises:
        KeyError:
            If `event_name` or `subject_id` is missing in a segment dataframe.
        IndexError:
            If a segment dataframe has no rows (no first unique value exists).
    """
    # Collect per-segment results and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on every iteration.
    per_segment_features = []
    for segment_df in segment_df_list:
        # assign some variables
        segment_name = segment_df["event_name"].unique()[0]
        subject_id = segment_df["subject_id"].unique()[0]
        segment_length_seconds = len(segment_df) / sampling_rate
        # extract features
        features_df = nk.eda_analyze(segment_df, sampling_rate=sampling_rate, method="interval-related")
        # complement df with additional information
        per_segment_features.append(
            features_df.assign(
                segment_name=segment_name,
                subject_id=subject_id,
                segment_length_seconds=segment_length_seconds,
                SCR_Peaks_N_per_seconds=features_df["SCR_Peaks_N"] / segment_length_seconds,
            )
        )

    # pd.concat rejects an empty list, so keep the "no segments" result explicit.
    if not per_segment_features:
        return pd.DataFrame()
    return pd.concat(per_segment_features)

In [6]:

def eda_plot(
    eda_signals: pd.DataFrame, info_dict: dict, sampling_rate: int = 500
) -> plt.Figure:
    """
    Simplified visualization of electrodermal activity (EDA) data with SCR peaks.

    Three stacked panels share one time axis: raw vs. cleaned signal, the
    phasic component with SCR peaks marked as dashed vertical lines, and the
    tonic component.

    Parameters
    ----------
    eda_signals : pd.DataFrame
        DataFrame containing columns 'EDA_Raw', 'EDA_Clean', 'EDA_Phasic', and 'EDA_Tonic'.
    info_dict : dict
        Dictionary containing SCR information with key 'SCR_Peaks'
        (peak positions as sample indices) and optionally others.
    sampling_rate : int, optional
        Sampling rate of the EDA signal, in Hz. Defaults to 500.

    Returns
    -------
    plt.Figure
        Matplotlib figure object. The figure is left open; the caller is
        responsible for saving and/or closing it.

    Raises
    ------
    ValueError
        If a required column is missing from `eda_signals` or `info_dict`
        has no 'SCR_Peaks' key.
    """
    required_columns = ['EDA_Raw', 'EDA_Clean', 'EDA_Phasic', 'EDA_Tonic']
    if not all(col in eda_signals.columns for col in required_columns):
        raise ValueError(f"Input DataFrame must contain the following columns: {required_columns}")

    if "SCR_Peaks" not in info_dict:
        raise ValueError("info_dict must contain the key 'SCR_Peaks'.")

    # Create time axis: sample i is recorded at i / sampling_rate seconds.
    # (A linspace over [0, n / sampling_rate] would stretch the axis by
    # n / (n - 1) and drift away from the peak positions computed below.)
    x_axis = np.arange(len(eda_signals)) / sampling_rate
    x_label = "Time (seconds)"
    peaks_seconds = np.array(info_dict["SCR_Peaks"]) / sampling_rate

    # Create subplots
    fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(10, 8))

    # Plot Raw and Cleaned Signal
    ax0.set_title("Raw and Cleaned Signal")
    ax0.plot(x_axis, eda_signals["EDA_Raw"], color="#B0BEC5", label="Raw", zorder=1)
    ax0.plot(x_axis, eda_signals["EDA_Clean"], color="#9C27B0", label="Cleaned", linewidth=1.5, zorder=2)
    ax0.legend(loc="upper right")
    ax0.set_ylabel("EDA (µS)")

    # Plot Phasic Component
    ax1.set_title("Phasic Component")
    ax1.plot(x_axis, eda_signals["EDA_Phasic"], color="#E91E63", label="Phasic", linewidth=1.5)
    # Mark SCR peaks with vertical lines; only the first one carries a label
    # so the legend shows a single "SCR Peak" entry.
    for peak in peaks_seconds:
        ax1.axvline(x=peak, color='blue', linestyle='--', alpha=0.7, label="SCR Peak" if peak == peaks_seconds[0] else None)
    ax1.legend(loc="upper right")
    ax1.set_ylabel("EDA (µS)")

    # Plot Tonic Component
    ax2.set_title("Tonic Component")
    ax2.plot(x_axis, eda_signals["EDA_Tonic"], color="#673AB7", label="Tonic", linewidth=1.5)
    ax2.legend(loc="upper right")
    ax2.set_xlabel(x_label)
    ax2.set_ylabel("EDA (µS)")

    fig.suptitle("Electrodermal Activity (EDA)", fontweight="bold")

    # Adjust layout, leaving room at the top for the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.96])

    return fig

# Analyse EDA

For each subject:

1. Read data
2. Preprocess using neurokit2 (and export the preprocessed data)
3. Create QA plots with tonic and phasic signal (export)
4. Segment
5. Extract features (and export)

In [None]:
# data files
# Materialise the glob into a sorted list: a generator has no length
# (``__sizeof__`` reports the object's size in bytes, not the file count) and
# can only be iterated once, which would leave the processing loop empty on a re-run.
data_files_generator = sorted(INTERIM_SIGNAL_DATA_DIR.glob('*.csv'))
print(f"Identified {len(data_files_generator)} data files")

In [None]:
# Per-subject pipeline: read -> preprocess -> QA plot -> segment -> extract features -> export.
for signal_filepath in data_files_generator:
    # Silence warnings raised while reading the CSV only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        signal_df = pd.read_csv(signal_filepath)
    if 'MWMOBILEJ_GSC' not in signal_df.columns:
        print(f"Skipping {signal_filepath} as it does not contain the required column 'MWMOBILEJ_GSC'")
        continue
    # This channel is not used by the EDA analysis; tolerate files that do not contain it.
    signal_df = signal_df.drop(columns="MWMOBILEJ_Bio", errors="ignore")
    subject_id = str(signal_df['subject_id'].unique()[0])
    print(f"Processing data for subject {subject_id}")

    # Preprocess and join with original data
    signal_preproc_df, info_dict = nk.eda_process(signal_df["MWMOBILEJ_GSC"], sampling_rate=sampling_rate, report=None)
    signal_df = (signal_df
                 .drop(columns=['MWMOBILEJ_GSC'])
                 .merge(signal_preproc_df, left_index=True, right_index=True))

    # Create QA plot of the entire recording and export it.
    # Use the same sampling rate as the preprocessing, and close the figure
    # after saving so open figures do not accumulate across subjects.
    qa_fig = eda_plot(signal_preproc_df, info_dict, sampling_rate)
    qa_fig.savefig(QA_REPORTS_DIR / f'{subject_id}_eda.png')
    plt.close(qa_fig)

    # Segment
    segments_df_list = data_utils.segment_df(signal_df, base_parameters)

    # Extract features
    features_df = extract_features_from_segments(segments_df_list, sampling_rate=sampling_rate)

    # save the pre-processed signal data and the extracted features
    output_dir = PROCESSED_DATA_DIR / subject_id
    output_dir.mkdir(parents=True, exist_ok=True)
    signal_df.to_csv(output_dir / 'preprocessed_eda.csv', index=False)
    features_df.to_excel(output_dir / 'eda_features.xlsx', index=False)



## Read the EDA features, baseline correct and export as group-level file

1. Read the EDA features of a subject
2. Perform baseline correction
3. Concatenate the data from each subject

In [4]:
def get_folders_in_directory(directory_path: Union[str,Path]) -> list[Path]:
    """
    List the immediate sub-directories of a directory.

    Args:
        directory_path (Union[str, Path]): Directory whose entries are scanned
            (not recursive).

    Returns:
        list[Path]: One Path per entry that is a directory, in the order
        yielded by ``Path.iterdir``.
    """
    folders = []
    for entry in Path(directory_path).iterdir():
        # Keep directories only; files and other entries are ignored.
        if entry.is_dir():
            folders.append(entry)
    return folders


In [5]:

def apply_baseline_correction(features_df: pd.DataFrame, ) -> pd.DataFrame:
    """
    Subtracts each subject's "Baseline" segment values from the values of
    that subject's other segments for a fixed set of EDA features.

    Args:
        features_df (pd.DataFrame):
            Extracted features with a `segment_name` column (the baseline
            rows are those equal to "Baseline"), a `subject_id` column and
            the feature columns listed in `feature_cols` below.

    Returns:
        pd.DataFrame:
            The non-baseline rows of the input (index reset into an `index`
            column) with, for every corrected feature, the original column,
            a `<feature>_baseline` column holding the subject's baseline
            value and a `<feature>_blc` column holding their difference.
            The merge uses pandas' default inner join, so subjects without
            a "Baseline" row do not appear in the output.
    """
    # Features that receive a baseline-corrected counterpart.
    feature_cols = [
        "SCR_Peaks_Amplitude_Mean",
        "EDA_Tonic_SD",
        "EDA_Sympathetic",
        "EDA_SympatheticN",
        "EDA_Autocorrelation",
        "SCR_Peaks_N_per_seconds",
        "SCR_Peaks_N",
    ]

    # Flatten the index, then isolate the per-subject baseline values.
    all_rows = features_df.reset_index()
    is_baseline = all_rows["segment_name"] == "Baseline"
    baseline_values = all_rows[is_baseline].set_index("subject_id")[feature_cols]

    # Attach the baseline values to every row of the same subject.
    merged = all_rows.merge(
        baseline_values,
        on="subject_id",
        suffixes=("", "_baseline"),
    )

    # Corrected value = segment value - baseline value.
    for feature in feature_cols:
        merged[f"{feature}_blc"] = merged[feature] - merged[f"{feature}_baseline"]

    # The baseline rows themselves are not part of the result.
    return merged[merged["segment_name"] != "Baseline"]

In [7]:
# Gather the per-subject feature files written by the processing loop.
processed_subject_folders = get_folders_in_directory(PROCESSED_DATA_DIR)

# Collect the frames first and concatenate once; concatenating inside the
# loop re-copies all previously read rows on every iteration.
subject_feature_frames = []
for subject_folder in processed_subject_folders:
    subject_id = subject_folder.name
    features_path = subject_folder / 'eda_features.xlsx'
    # A subject folder without a features file is reported and skipped
    # rather than aborting the whole aggregation.
    if not features_path.is_file():
        print(f"Skipping subject {subject_id}: no 'eda_features.xlsx' found")
        continue
    subject_feature_frames.append(pd.read_excel(features_path))

# Fail with a clear message instead of an obscure KeyError further down.
if not subject_feature_frames:
    raise FileNotFoundError(f"No 'eda_features.xlsx' files found under {PROCESSED_DATA_DIR}")
group_level_features_df = pd.concat(subject_feature_frames)

# Apply baseline correction
baseline_corrected_df = apply_baseline_correction(group_level_features_df)
baseline_corrected_df.to_excel(FINAL_DATA_DIR / 'group_level_blc_eda_features.xlsx')