# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [None]:
# Import required libraries
from scipy.stats import zscore
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from scipy import stats
# from pathlib import Path
import os

# Notebook-wide plot configuration: apply matplotlib's 'seaborn-v0_8' style
# sheet and ask IPython to render figures inline in the cell output.
plt.style.use('seaborn-v0_8')
%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [None]:
def load_data(data_dir='data/raw'):
    """Load and organize the physiological data from the dataset.

    For each session directory under ``data_dir`` the files ``HR.csv``,
    ``EDA.csv`` and ``TEMP.csv`` are read. Every file is expected to start
    with two header lines (the UNIX start time in seconds, then the sampling
    rate in Hz) followed by one sample per line.

    The three signals are joined on their reconstructed timestamps, not on
    row position, because each file carries its own start time and sampling
    rate. A timestamp at which a signal has no sample holds NaN for that
    signal; filling and resampling are left to the preprocessing step.

    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files

    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']

    Raises
    ------
    FileNotFoundError
        If one of the expected signal files does not exist.
    ValueError
        If a header line is not numeric or the sampling rate is not positive.
    """
    sessions = ["midterm_1", "midterm_2", "Final"]
    # File stem on disk -> column name in the returned frame.
    signal_columns = {"HR": "heart_rate", "EDA": "eda", "TEMP": "temperature"}

    session_frames = []
    for session in sessions:
        session_path = os.path.join(data_dir, session)
        signals = []

        for stem, column in signal_columns.items():
            file_path = os.path.join(session_path, f"{stem}.csv")

            # The first two lines are metadata, not samples.
            with open(file_path, 'r') as f:
                start_time = float(f.readline().strip())  # UNIX timestamp (s)
                sample_rate = float(f.readline().strip())  # Hz
            if sample_rate <= 0:
                raise ValueError(
                    f"Sampling rate must be positive in {file_path}, got {sample_rate}"
                )

            values = pd.read_csv(
                file_path, skiprows=2, header=None, names=[column]
            )[column]

            # Rebuild this signal's own time axis from its header and use it
            # as the index so the join below aligns samples in time.
            seconds = start_time + np.arange(len(values)) / sample_rate
            values.index = pd.to_datetime(seconds, unit='s')
            signals.append(values)

        # Outer join on the timestamp index: keeps every sample of every signal.
        combined = (
            pd.concat(signals, axis=1)
            .sort_index()
            .rename_axis('timestamp')
            .reset_index()
        )
        # NOTE(review): the directory layout read here has no subject level,
        # so every row is attributed to a single subject - confirm.
        combined['subject_id'] = 1
        combined['session'] = session.lower()
        session_frames.append(combined)

    data = pd.concat(session_frames, ignore_index=True)
    return data[['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']]

## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [None]:
def preprocess_data(data, output_dir='data/processed'):
    """Clean and prepare the physiological data for analysis.

    Steps, applied in order:

    1. Within each (subject_id, session) recording, fill missing values by
       forward fill, then backward fill. Filling is done per recording so
       values never leak from one recording into another.
    2. Resample each recording to 1-second bins (bin mean) and linearly
       interpolate bins that received no samples.
    3. Drop rows whose z-score exceeds 3 in absolute value, one signal at a
       time. Z-scores are computed over all recordings together. A column
       whose z-score is undefined (constant or all-NaN) removes no rows.
    4. Write the result to ``<output_dir>/processed_data.csv``.

    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data with columns 'timestamp', 'heart_rate', 'eda',
        'temperature', 'subject_id' and 'session'
    output_dir : str
        Directory to save processed data files

    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data with columns
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    signal_cols = ['heart_rate', 'eda', 'temperature']
    data = data.sort_values(['subject_id', 'session', 'timestamp'])

    # 1 + 2. Fill gaps and resample, one recording at a time.
    processed = []
    for (subject, session), group in data.groupby(['subject_id', 'session']):
        group = group.ffill().bfill()
        resampled = (
            group.set_index('timestamp')[signal_cols]
            .resample('1s')
            .mean()
            .interpolate()
            .reset_index()  # single reset: 'timestamp' becomes a column again
        )
        resampled['subject_id'] = subject
        resampled['session'] = session
        processed.append(resampled)

    if processed:
        data = pd.concat(processed, ignore_index=True)
    else:
        # Empty input: keep the output schema instead of failing in concat.
        data = pd.DataFrame(columns=['timestamp', *signal_cols, 'subject_id', 'session'])

    # 3. Remove outliers (|z-score| > 3). Each column is evaluated on the rows
    #    that survived the previous column's filter.
    for col in signal_cols:
        if data.empty:
            break
        z = zscore(data[col].to_numpy(dtype=float), nan_policy='omit')
        # NaN z-scores compare False here, so undefined scores keep the row.
        data = data[~(np.abs(z) > 3)]

    # 4. Save processed data to CSV
    output_path = os.path.join(output_dir, 'processed_data.csv')
    data.to_csv(output_path, index=False)

    return data

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws heart rate, electrodermal activity and temperature against time in
    three vertically stacked panels sharing one x-axis, and saves the figure
    as ``<output_dir>/<subject_id>_<session>.png``.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data with columns 'timestamp',
        'heart_rate', 'eda', 'temperature', 'subject_id' and 'session'
    subject_id : str
        Subject identifier; compared for equality with ``data['subject_id']``,
        so it must use the same type and spelling as that column
    session : str
        Session identifier; compared for equality with ``data['session']``
    output_dir : str
        Directory to save plot files

    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots. The figure is closed in pyplot
        before it is returned, so it is not displayed a second time.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Select the recording once, with one combined mask.
    selection = data.loc[
        (data['subject_id'] == subject_id) & (data['session'] == session)
    ]

    # (column, panel title, y-axis label) for each panel, top to bottom.
    panels = [
        ('heart_rate', 'Heart Rate', 'Heart Rate (bpm)'),
        ('eda', 'Electrodermal Activity', 'EDA (μS)'),
        ('temperature', 'Temperature', 'Temperature (°C)'),
    ]

    fig, axs = plt.subplots(len(panels), 1, figsize=(10, 15), sharex=True)
    fig.suptitle(f'Physiological Signals for {subject_id} - {session}', fontsize=16)

    for ax, (column, title, ylabel) in zip(axs, panels):
        sns.lineplot(data=selection, x='timestamp', y=column, ax=ax)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('')  # the x-axis is shared; only the bottom panel is labelled
        ax.grid(True)  # explicit True: a bare grid() call toggles visibility
    axs[-1].set_xlabel('Timestamp')

    # Save through the figure object, not pyplot's implicit current figure.
    output_path = os.path.join(output_dir, f'{subject_id}_{session}.png')
    fig.savefig(output_path)
    plt.close(fig)
    return fig