# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [62]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
from pathlib import Path
import os
from scipy import stats, signal


# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure created below,
# and ask IPython to render figures inline in the notebook output.
plt.style.use('seaborn-v0_8')
%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [69]:
import os
import pandas as pd
import numpy as np
from scipy import stats, signal
import matplotlib.pyplot as plt

def load_data(data_dir='data/raw'):
    """Load and organize the physiological data from the dataset.
    
    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    """

    def load_sensor_csv(folder_path, sensor_file, column_name):
        
        raw = pd.read_csv(os.path.join(folder_path, sensor_file), header = None)
        
        start_time = float(raw.iloc[0, 0])
        sample_rate = float(raw.iloc[1, 0])
        data = raw.iloc[2:].reset_index(drop = True)
        
        # Timestamps
        timestamps = [start_time + i / sample_rate for i in range(len(data))]
        data['timestamp'] = timestamps
        data.columns = [column_name, 'timestamp']
        
        return data

    path = os.path.join(data_dir, "Data")
    if not os.path.exists(path):
        return pd.DataFrame({
            'timestamp': pd.Series(dtype='datetime64[ns]'),  # ← key fix
            'heart_rate': pd.Series(dtype='float64'),
            'eda': pd.Series(dtype='float64'),
            'temperature': pd.Series(dtype='float64'),
            'subject_id': pd.Series(dtype='str'),
            'session': pd.Series(dtype='str')
        })


    all_data = []

    for subject_id in os.listdir(path):
        subject_path = os.path.join(path, subject_id)
        
        if os.path.isdir(subject_path):
            for session in ['Final', 'Midterm 1', 'Midterm 2']:
                session_path = os.path.join(subject_path, session)

                hr_df = load_sensor_csv(session_path, "HR.csv", "heart_rate")
                eda_df = load_sensor_csv(session_path, "EDA.csv", "eda")
                temp_df = load_sensor_csv(session_path, "TEMP.csv", "temperature")

                merged = hr_df.merge(eda_df, on = "timestamp", how = "outer")
                merged = merged.merge(temp_df, on = "timestamp", how = "outer")
                merged['subject_id'] = subject_id
                merged['session'] = session

                all_data.append(merged)

    return pd.concat(all_data, ignore_index = True)

## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [67]:
import os
import pandas as pd
import numpy as np
from scipy import stats, signal
import matplotlib.pyplot as plt

def _clean_session(session_df, signal_cols, max_missing=0.01, z_thresh=3.0):
    """Interpolate, resample to 1 s and remove outliers for a single session.

    Returns a frame indexed by timestamp that holds only ``signal_cols``, or
    ``None`` when more than ``max_missing`` of any signal is still missing
    after interpolation.
    """
    ordered = session_df.sort_values('timestamp').set_index('timestamp')

    # Only the numeric signals are resampled; averaging the string id columns
    # raises a TypeError on pandas >= 2.0.
    signals = ordered[signal_cols].astype('float64').interpolate()

    # Linear interpolation cannot fill values before the first valid sample,
    # so this fraction measures what is still missing at the session start.
    if signals.isna().mean().max() > max_missing:
        return None

    # Rows that are still incomplete would turn every z-score of their column
    # into NaN (and thereby drop the whole session), so remove them first.
    signals = signals.resample('1s').mean().interpolate().dropna()
    if signals.empty:
        return signals

    # Population z-score (ddof=0). A constant column has zero spread; treat it
    # as having no outliers instead of dividing by zero.
    spread = signals.std(ddof=0).replace(0, np.nan)
    z = ((signals - signals.mean()) / spread).fillna(0.0)
    return signals[(z.abs() <= z_thresh).all(axis=1)]


def preprocess_data(data, output_dir='data/processed'):
    """Clean and prepare the physiological data for analysis.

    For every subject/session pair the signals are sorted by time, linearly
    interpolated, resampled to 1-second means, and rows where any signal lies
    more than 3 standard deviations from its session mean are removed.
    Sessions with more than 1% missing values in any signal after
    interpolation are dropped. One ``<subject_id>_processed.csv`` file per
    subject is written to ``output_dir``.

    The input frame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data with columns 'timestamp', 'heart_rate', 'eda',
        'temperature', 'subject_id' and 'session'. Non-datetime timestamps are
        interpreted as seconds since the Unix epoch.
    output_dir : str
        Directory to save processed data files

    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data with columns
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session'].
        Empty (but with these columns) when no session survives.
    """
    signal_cols = ['heart_rate', 'eda', 'temperature']
    output_cols = ['timestamp'] + signal_cols + ['subject_id', 'session']

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Convert timestamps on a new frame so the caller's data stays untouched.
    if pd.api.types.is_datetime64_any_dtype(data['timestamp']):
        timestamps = pd.to_datetime(data['timestamp'], utc=True)
    else:
        timestamps = pd.to_datetime(data['timestamp'], unit='s', utc=True)
    frame = data.assign(timestamp=timestamps)

    processed = []

    # Process data per subject
    for subject_id, subject_df in frame.groupby('subject_id'):
        subject_sessions = []

        for session, session_df in subject_df.groupby('session'):
            cleaned = _clean_session(session_df, signal_cols)
            if cleaned is None or cleaned.empty:
                continue

            # assign() builds a new frame, avoiding SettingWithCopyWarning.
            subject_sessions.append(
                cleaned.assign(subject_id=subject_id, session=session))

        if subject_sessions:
            # reset_index() turns the timestamp index back into a column so it
            # is present in both the CSV and the returned frame.
            subject_data = pd.concat(subject_sessions).reset_index()
            processed.append(subject_data)

            # Save to CSV
            csv_path = os.path.join(output_dir, f"{subject_id}_processed.csv")
            subject_data.to_csv(csv_path, index=False)

    if not processed:
        return pd.DataFrame(columns=output_cols)

    # Return all processed data
    return pd.concat(processed, ignore_index=True)

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [68]:
import os
import pandas as pd
import numpy as np
from scipy import stats, signal
import matplotlib.pyplot as plt

def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws heart rate, EDA and temperature against time in three stacked
    subplots that share the x-axis, saves the figure as
    ``<subject_id>_<session with spaces replaced by underscores>_signals.png``
    in ``output_dir`` and displays it.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data. The time axis may be provided either
        as a 'timestamp' column or as an index named 'timestamp'.
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files

    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Accept frames that carry the time axis as their index (local rebinding
    # only; the caller's frame is left untouched).
    if 'timestamp' not in data.columns and data.index.name == 'timestamp':
        data = data.reset_index()

    # Filter data for the given subject and session, ordered by time
    selected = (data['subject_id'] == subject_id) & (data['session'] == session)
    df = data.loc[selected].sort_values('timestamp')

    # Create figure and subplots
    fig, axes = plt.subplots(3, 1, figsize=(12, 8), sharex=True)
    fig.suptitle(f'Physiological Signals - {subject_id} - {session}', fontsize=14)

    # Plot each signal; the loop variable is deliberately not called `signal`
    # so that it does not shadow the imported scipy.signal module.
    for ax, column in zip(axes, ['heart_rate', 'eda', 'temperature']):
        ax.plot(df['timestamp'], df[column])
        ax.set_ylabel(column.title())
        ax.grid(True)

    axes[-1].set_xlabel('Timestamp')

    # Lay out and save the figure once; rect leaves room for the suptitle.
    filename = f"{subject_id}_{session.replace(' ', '_')}_signals.png"
    fig_path = os.path.join(output_dir, filename)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(fig_path)

    # Show the plot window
    plt.show()

    return fig
