# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots
sns.set()

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [12]:
from pathlib import Path
import pandas as pd

def load_data(data_dir='~/Documents/4-it-s-about-time-kanting6/data'):
    """
    Load and organize the physiological data from the dataset.

    The directory is walked as ``<data_dir>/<subject>/<session>/`` where
    subject folders have names starting with ``S``. Inside each session
    folder the files ``HR.csv``, ``EDA.csv`` and ``TEMP.csv`` are read without
    a header row; their two columns are named ``timestamp`` and the signal
    name. Sessions missing any of the three files are skipped.

    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files. A leading ``~``
        is expanded to the user's home directory.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session'],
        sorted by subject_id, session and timestamp. The three signals are
        outer-joined on ``timestamp``, so a row holds NaN for every signal
        that has no sample at that timestamp.

    Raises
    ------
    ValueError
        If no session containing all three signal files is found.
    """
    data_dir = Path(data_dir).expanduser()

    # Output column name -> file name; dict order fixes the column order.
    signal_files = {
        'heart_rate': 'HR.csv',
        'eda': 'EDA.csv',
        'temperature': 'TEMP.csv',
    }
    all_records = []

    # Participant folders (S1, S2, ...); stray files matching 'S*' are ignored.
    for subj_folder in sorted(p for p in data_dir.glob('S*') if p.is_dir()):
        # Every sub-directory of a participant folder is treated as a session.
        for session_folder in sorted(p for p in subj_folder.glob('*') if p.is_dir()):
            paths = {
                signal: session_folder / file_name
                for signal, file_name in signal_files.items()
            }

            # Skip incomplete sessions rather than producing partial rows.
            if not all(path.exists() for path in paths.values()):
                continue

            merged = None
            for signal, path in paths.items():
                # NOTE(review): 'timestamp' keeps whatever dtype read_csv
                # infers; it is not converted to datetime here, while
                # preprocess_data only resamples datetime64 timestamps.
                # Confirm where the conversion is meant to happen.
                frame = pd.read_csv(path, header=None, names=['timestamp', signal])
                if merged is None:
                    merged = frame
                else:
                    merged = merged.merge(frame, on='timestamp', how='outer')

            merged['subject_id'] = subj_folder.name
            merged['session'] = session_folder.name
            all_records.append(merged)

    if not all_records:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No sessions containing HR.csv, EDA.csv and TEMP.csv were found under {data_dir}"
        )

    df = pd.concat(all_records, ignore_index=True)
    return df.sort_values(by=['subject_id', 'session', 'timestamp']).reset_index(drop=True)

# Example usage:
# df = load_data()
# print(df.head())


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [13]:
def preprocess_data(data, output_dir='~/Documents/4-it-s-about-time-kanting6/data/processed'):
    """Clean and prepare the physiological data for analysis.

    Steps, in order:

    1. Reject the input if any column has more than 1% missing values.
    2. Per (subject_id, session): sort by timestamp and forward/backward
       fill the remaining gaps *within that series only*, so values never
       leak from one subject or session into another.
    3. Per (subject_id, session): if ``timestamp`` is a datetime column,
       resample the numeric columns to 1-minute means and interpolate gaps
       in time. Non-datetime timestamps are left at their original sampling.
    4. Drop rows where any of heart_rate / eda / temperature has an absolute
       z-score of 3.5 or more (computed over the whole processed frame).
       An undefined z-score (e.g. a constant column) is not an outlier.
    5. Write one CSV, Parquet and Feather file per subject to ``output_dir``.

    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data with columns 'timestamp', 'heart_rate',
        'eda', 'temperature', 'subject_id' and 'session'. Not modified.
    output_dir : str
        Directory to save processed data files; created if missing.

    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data.

    Raises
    ------
    ValueError
        If any column has more than 1% missing values, or if the input
        contains no (subject_id, session) groups.
    """
    # Create output directory
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Handle missing values: refuse heavily incomplete input up front.
    missing_ratio = data.isna().mean()
    if (missing_ratio > 0.01).any():
        raise ValueError("More than 1% missing values detected. Please check your data.")

    # 2./3. Each (subject_id, session) is an independent time series.
    processed_list = []
    for (subject, session), group in data.groupby(['subject_id', 'session']):
        # Fill after sorting so gaps are filled in time order, and inside the
        # loop so one series never borrows values from a neighbouring one.
        group = group.sort_values('timestamp').ffill().bfill()

        if pd.api.types.is_datetime64_any_dtype(group['timestamp']):
            # Only numeric columns can be averaged; the string id columns
            # are re-attached after resampling.
            numeric = group.set_index('timestamp').select_dtypes(include=[np.number])

            # Resample to 1-minute intervals ('min' replaces the deprecated 'T').
            group = numeric.resample('1min').mean()

            # Empty bins become NaN after resampling; fill them in time.
            group = group.interpolate(method='time').ffill().bfill()

            group['subject_id'] = subject
            group['session'] = session
            group = group.reset_index()
        # else: non-datetime timestamps are kept at their original sampling.

        processed_list.append(group)

    if not processed_list:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("No data to preprocess: input contains no (subject_id, session) groups.")

    processed_data = pd.concat(processed_list, ignore_index=True)

    # 4. Remove outliers using z-score method (threshold=3.5)
    physiological_cols = ['heart_rate', 'eda', 'temperature']
    values = processed_data[physiological_cols].to_numpy(dtype=float)
    with np.errstate(invalid='ignore', divide='ignore'):
        z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
        # NaN z-scores (zero variance, single row) compare False here, so
        # such rows are kept instead of wiping out the whole frame.
        is_outlier = z_scores >= 3.5

    processed_data = processed_data[~is_outlier.any(axis=1)].reset_index(drop=True)

    # 5. Save processed data
    for subject_id, subject_df in processed_data.groupby('subject_id'):
        # Ids may already carry the 'S' prefix (e.g. 'S1'); avoid 'SS1'.
        label = str(subject_id)
        if not label.startswith('S'):
            label = f"S{label}"
        filename_base = output_dir / f"{label}_processed"

        # Feather requires a default RangeIndex on some pandas versions.
        subject_df = subject_df.reset_index(drop=True)

        # Save in all three formats
        subject_df.to_csv(f"{filename_base}.csv", index=False)
        subject_df.to_parquet(f"{filename_base}.parquet", index=False)
        subject_df.to_feather(f"{filename_base}.feather")

    return processed_data


## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [15]:
def plot_physiological_signals(data, subject_id, session, output_dir='~/Documents/4-it-s-about-time-kanting6/plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws heart rate, EDA and temperature against ``timestamp`` in three
    stacked subplots sharing the x-axis, saves the figure as
    ``<subject_id>_<session>_signals.png`` (spaces and slashes in the session
    name replaced by underscores) and returns it.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data with columns 'timestamp',
        'heart_rate', 'eda', 'temperature', 'subject_id' and 'session'.
    subject_id : str
        Subject identifier (e.g., 'S1'). Matching ignores a leading 'S' on
        both the argument and the 'subject_id' column, so 'S1' matches rows
        labelled 'S1', '1' or 1.
    session : str
        Session identifier (e.g., 'Midterm 1').
    output_dir : str
        Directory to save plot files; created if missing.

    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots. The caller owns the figure and
        is responsible for closing it.

    Raises
    ------
    ValueError
        If no rows match the requested subject and session.
    """
    # Create output directory if it doesn't exist
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Filter data for the specific subject and session.
    # Normalise both sides: the column may hold 'S1', '1' or 1.
    wanted_subject = str(subject_id).lstrip('S')
    subject_keys = data['subject_id'].astype(str).str.lstrip('S')
    session_data = data[(subject_keys == wanted_subject) & (data['session'] == session)]

    if session_data.empty:
        raise ValueError(f"No data found for subject {subject_id} and session {session}.")

    # Line plots connect points in row order; make that chronological.
    session_data = session_data.sort_values('timestamp')

    # 2. Create figure with subplots
    fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
    try:
        fig.suptitle(f'Physiological Signals for {subject_id} - {session}', fontsize=16)

        # 3. Plot each physiological signal
        signals = ['heart_rate', 'eda', 'temperature']
        titles = ['Heart Rate (bpm)', 'Electrodermal Activity (EDA)', 'Temperature (°C)']

        for ax, signal, title in zip(axs, signals, titles):
            ax.plot(session_data['timestamp'], session_data[signal], label=title)
            ax.set_ylabel(title)
            ax.legend(loc='upper right')
            ax.grid(True)

        axs[-1].set_xlabel('Time')

        # Leave room at the top for the suptitle; act on this figure
        # explicitly rather than on pyplot's "current" figure.
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        # 4. Save plot to file
        clean_session = session.replace(' ', '_').replace('/', '_')
        plot_filename = output_dir / f"{subject_id}_{clean_session}_signals.png"
        fig.savefig(plot_filename)
    except Exception:
        # Do not leak a half-built figure if plotting or saving fails.
        plt.close(fig)
        raise

    return fig
