# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [29]:
import os
import numpy as np
import pandas as pd

def load_data(data_dir="data/raw", student="S1"):
    """Load one student's EDA, HR and TEMP recordings for every exam session.

    For each of the sessions ``midterm_1``, ``midterm_2`` and ``Final`` the
    files ``EDA.csv``, ``HR.csv`` and ``TEMP.csv`` are read from
    ``<data_dir>/<student>/<session>/``.  The first line of each file is
    parsed as the recording start time (seconds since the Unix epoch), the
    second line as the sampling rate (samples per second), and every
    remaining line as one sample.  A timestamp is derived for each sample,
    the three signals of a session are outer-joined on that timestamp, and
    the sessions are stacked into a single frame.

    Args:
        data_dir: Directory that contains one sub-directory per student.
        student: Name of the student's sub-directory; also stored in the
            ``subject_id`` column.

    Returns:
        A DataFrame with the columns ``timestamp``, ``heart_rate``, ``eda``,
        ``temperature``, ``subject_id`` and ``session``.  If any expected
        file is missing, a message is printed and a one-row placeholder
        frame (``NaT`` / ``None`` values, same columns) is returned
        immediately instead.
    """
    output_columns = ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    student_dir = os.path.join(data_dir, student)
    per_session = []

    for session in ("midterm_1", "midterm_2", "Final"):
        signals = {}

        for signal in ("EDA", "HR", "TEMP"):
            key = signal.lower()
            csv_path = os.path.join(student_dir, session, f"{signal}.csv")

            try:
                with open(csv_path, 'r') as handle:
                    # Two header lines: start time, then sampling rate.
                    start_time = float(handle.readline().strip())
                    sample_rate = float(handle.readline().strip())
                    frame = pd.read_csv(csv_path, skiprows=2, header=None, names=[key])
            except FileNotFoundError:
                print(f"File not found: {csv_path}")
                # Placeholder frame so callers (and tests) still receive the
                # expected columns when the raw data is unavailable.
                return pd.DataFrame({
                    column: [pd.NaT if column == 'timestamp' else None]
                    for column in output_columns
                })

            # Sample i was taken i / sample_rate seconds after the start.
            offsets = np.arange(len(frame)) / sample_rate
            frame['timestamp'] = pd.to_datetime(start_time + offsets, unit='s')
            signals[key] = frame

        # Outer joins keep every timestamp even when the signals were
        # recorded at different sampling rates.
        merged = signals['hr'].merge(signals['eda'], on='timestamp', how='outer')
        merged = merged.merge(signals['temp'], on='timestamp', how='outer')
        merged = merged.sort_values('timestamp')

        merged['subject_id'] = student
        merged['session'] = session.replace('_', ' ').capitalize()
        per_session.append(merged)

    stacked = pd.concat(per_session, ignore_index=True)
    stacked = stacked.rename(columns={'hr': 'heart_rate', 'temp': 'temperature'})
    return stacked[output_columns]
# Run the loader with its default arguments (data_dir="data/raw", student="S1");
# the table that follows in this notebook is the output of this call.
load_data()

Unnamed: 0,timestamp,heart_rate,eda,temperature,subject_id,session
0,2018-10-13 12:56:06.000,,0.000000,22.51,S1,Midterm 1
1,2018-10-13 12:56:06.250,,0.002563,22.51,S1,Midterm 1
2,2018-10-13 12:56:06.500,,0.019221,22.51,S1,Midterm 1
3,2018-10-13 12:56:06.750,,0.021784,22.51,S1,Midterm 1
4,2018-10-13 12:56:07.000,,0.023065,22.51,S1,Midterm 1
...,...,...,...,...,...,...
182838,2018-12-05 22:58:52.000,121.95,0.025627,26.67,S1,Final
182839,2018-12-05 22:58:52.250,,0.026908,26.67,S1,Final
182840,2018-12-05 22:58:52.500,,,26.67,S1,Final
182841,2018-12-05 22:58:52.750,,,26.67,S1,Final


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [None]:
# Import required libraries
from scipy.stats import zscore
import numpy as np
import pandas as pd
# from scipy import stats
# from pathlib import Path
import os
def preprocess_data(data, output_dir='data/processed'):
    """Clean, resample and de-outlier the combined physiological data.

    Steps, applied in this order:

    1. Within each (subject_id, session) recording, fill missing signal
       values forward and then backward.  Filling is done per recording so
       that values never leak from one session or subject into another.
    2. Resample each recording to 1-second bins (mean of the samples in the
       bin) and linearly interpolate bins that received no samples.
    3. For each signal in turn, drop rows whose z-score magnitude exceeds 3.
       A signal that is constant or has no observed values has no defined
       z-score and is skipped rather than discarding every row.
    4. Write the result to ``<output_dir>/processed_data.csv``.

    Args:
        data: DataFrame with the columns ``timestamp``, ``heart_rate``,
            ``eda``, ``temperature``, ``subject_id`` and ``session``.
            ``timestamp`` may hold datetimes or anything
            ``pandas.to_datetime`` can parse.  The input is not modified.
        output_dir: Directory for the CSV file; created if it is missing.

    Returns:
        The processed DataFrame with the same six columns.  It is empty
        (columns only) when ``data`` contains no usable recording.
    """
    os.makedirs(output_dir, exist_ok=True)

    signal_cols = ['heart_rate', 'eda', 'temperature']
    output_cols = ['timestamp'] + signal_cols + ['subject_id', 'session']

    # Resampling needs real datetimes; this is a no-op for datetime input and
    # also accepts timestamps that were read back from a CSV as strings.
    data = data.assign(timestamp=pd.to_datetime(data['timestamp']))
    data = data.sort_values(['subject_id', 'session', 'timestamp'])

    # 1. + 2. Fill gaps and resample one recording at a time.
    processed = []
    for (subject, session), group in data.groupby(['subject_id', 'session']):
        signals = group.set_index('timestamp')[signal_cols]
        signals = signals.ffill().bfill()
        resampled = signals.resample('1s').mean().interpolate()
        # A single reset_index turns the timestamp index back into a column
        # without introducing an extra positional "index" column.
        resampled = resampled.reset_index()
        resampled['subject_id'] = subject
        resampled['session'] = session
        processed.append(resampled)

    if processed:
        data = pd.concat(processed, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return/save an empty frame.
        data = pd.DataFrame(columns=output_cols)

    # 3. Remove outliers (|z| > 3), one signal after the other.
    for col in signal_cols:
        values = data[col].to_numpy(dtype=float)
        observed = values[~np.isnan(values)]
        if observed.size == 0 or observed.std() == 0:
            continue
        z = zscore(values, nan_policy='omit')
        # NaN z-scores (missing samples) compare False and are kept.
        data = data[~(np.abs(z) > 3)]

    # 4. Save processed data to CSV.
    output_path = os.path.join(output_dir, 'processed_data.csv')
    data.to_csv(output_path, index=False)

    return data

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import os
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Plot heart rate, EDA and temperature for one subject and session.

    Three vertically stacked line plots that share the time axis are drawn
    and saved to ``<output_dir>/<subject_id>_<session>_signals.png``.

    Args:
        data: DataFrame with the columns ``timestamp``, ``heart_rate``,
            ``eda``, ``temperature``, ``subject_id`` and ``session``.
        subject_id: Value of ``subject_id`` to select (e.g. ``'S1'``).
        session: Value of ``session`` to select (e.g. ``'Midterm 1'``).
        output_dir: Directory for the PNG file; created if it is missing.

    Returns:
        The matplotlib Figure.  It has already been saved and closed, so it
        is no longer managed by pyplot.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Select the rows once with a single combined mask.  Chaining two boolean
    # selections makes pandas re-align the second mask against the already
    # filtered frame, which triggers a reindexing warning.
    selected = data[(data['subject_id'] == subject_id) & (data['session'] == session)]

    # (column, panel title, y-axis label, legend label) for each subplot.
    panels = [
        ('heart_rate', 'Heart Rate', 'Heart Rate (bpm)', 'Heart Rate'),
        ('eda', 'Electrodermal Activity', 'EDA (μS)', 'EDA'),
        ('temperature', 'Temperature', 'Temperature (°C)', 'Temperature'),
    ]

    # 1. Create figure with subplots
    fig, axs = plt.subplots(3, 1, figsize=(10, 15), sharex=True)
    fig.suptitle(f'Physiological Signals for {subject_id} - {session}', fontsize=16)

    # 2. + 3. Plot each signal and label its own panel
    for ax, (column, title, ylabel, legend_label) in zip(axs, panels):
        sns.lineplot(data=selected, x='timestamp', y=column, ax=ax, label=legend_label)
        ax.set_title(title)
        ax.set_xlabel('Timestamp')
        # Keep the unit-specific label instead of a generic one.
        ax.set_ylabel(ylabel)
        # Only draw a legend when something was plotted (empty selections
        # would otherwise produce a "no handles" warning).
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        ax.grid()

    # 4. Save plot to file, e.g. "S1_Midterm 1_signals.png"
    output_path = os.path.join(output_dir, f'{subject_id}_{session}_signals.png')
    # Save this figure explicitly rather than whatever pyplot considers current.
    fig.savefig(output_path)
    plt.close(fig)
    return fig