# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots
plt.style.use('seaborn-v0_8')
%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [None]:
def load_data(data_dir='data/raw'):
    """Load and organize the physiological data from the dataset.
    
    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    """
    all_data = []

    for subject in os.listdir(data_dir):
        subject_path = os.path.join(data_dir, subject)
        if not os.path.isdir(subject_path):
            continue

        for session in ['Midterm 1', 'Midterm 2', 'Final']:
            session_path = os.path.join(subject_path, session)
            if not os.path.exists(session_path):
                continue

            try:
                # heart rate data
                with open(os.path.join(session_path, 'HR.csv'), 'r') as f:
                    hr_start = float(f.readline()) # starting timestamp
                    hr_freq = float(f.readline()) # sampling freq
                    hr_data = [float(line.strip()) for line in f]
                    hr_time = pd.to_datetime(np.arange(len(hr_data)) / hr_freq + hr_start, unit='s', utc=True)
                    df_hr = pd.DataFrame({'timestamp': hr_time, 'heart_rate': hr_data})

                # EDA data
                with open(os.path.join(session_path, 'EDA.csv'), 'r') as f:
                    eda_start = float(f.readline()) # starting timestamp
                    eda_freq = float(f.readline()) # sampling freq
                    eda_data = [float(line.strip()) for line in f]
                    eda_time = pd.to_datetime(np.arange(len(eda_data)) / eda_freq + eda_start, unit='s', utc=True)
                    df_eda = pd.DataFrame({'timestamp': eda_time, 'eda': eda_data})

                # temperature data
                with open(os.path.join(session_path, 'TEMP.csv'), 'r') as f:
                    temp_start = float(f.readline()) # starting timestamp
                    temp_freq = float(f.readline()) # sampling freq
                    temp_data = [float(line.strip()) for line in f]
                    temp_time = pd.to_datetime(np.arange(len(temp_data)) / temp_freq + temp_start, unit='s', utc=True)
                    df_temp = pd.DataFrame({'timestamp': temp_time, 'temperature': temp_data})

                # merge all dataframes on timestamp
                df = pd.concat([df_hr, df_eda, df_temp], axis=1)
                df['subject_id'] = subject  
                df['session'] = session
                df.reset_index(inplace=True)
                all_data.append(df)

            except FileNotFoundError:
                print(f"Missing file(s) in {session_path}")

    # combine dataframes
    df_all = pd.concat(all_data, ignore_index=True)
    return df_all

    pass

In [12]:
raw_data = load_data('data/raw')
raw_data

Unnamed: 0,timestamp,heart_rate,timestamp.1,eda,timestamp.2,temperature,subject_id,session
0,2018-10-13 12:55:31+00:00,50.00,2018-10-13 12:55:21+00:00,0.000000,2018-10-13 12:55:21+00:00,22.23,S5,Midterm 1
1,2018-10-13 12:55:32+00:00,53.50,2018-10-13 12:55:21.250000+00:00,0.002562,2018-10-13 12:55:21.250000+00:00,22.23,S5,Midterm 1
2,2018-10-13 12:55:33+00:00,61.00,2018-10-13 12:55:21.500000+00:00,0.021779,2018-10-13 12:55:21.500000+00:00,22.23,S5,Midterm 1
3,2018-10-13 12:55:34+00:00,91.25,2018-10-13 12:55:21.750000+00:00,0.021779,2018-10-13 12:55:21.750000+00:00,22.23,S5,Midterm 1
4,2018-10-13 12:55:35+00:00,89.60,2018-10-13 12:55:22+00:00,0.019217,2018-10-13 12:55:22+00:00,22.23,S5,Midterm 1
...,...,...,...,...,...,...,...,...
1773127,NaT,,2018-12-05 20:25:52.250000+00:00,0.011532,2018-12-05 20:25:52.250000+00:00,23.21,S9,Final
1773128,NaT,,2018-12-05 20:25:52.500000+00:00,0.012813,2018-12-05 20:25:52.500000+00:00,23.21,S9,Final
1773129,NaT,,2018-12-05 20:25:52.750000+00:00,0.011532,2018-12-05 20:25:52.750000+00:00,23.21,S9,Final
1773130,NaT,,2018-12-05 20:25:53+00:00,0.011532,NaT,,S9,Final


Unnamed: 0,timestamp,heart_rate,eda,temperature,subject_id,session
0,1.539435e+09,,0.000000,22.23,S5,Midterm 1
1,1.539435e+09,,0.002562,22.23,S5,Midterm 1
2,1.539435e+09,,0.021779,22.23,S5,Midterm 1
3,1.539435e+09,,0.021779,22.23,S5,Midterm 1
4,1.539435e+09,,0.019217,22.23,S5,Midterm 1
...,...,...,...,...,...,...
1773140,1.544042e+09,,0.011532,23.21,S9,Final
1773141,1.544042e+09,,0.012813,23.21,S9,Final
1773142,1.544042e+09,,0.011532,23.21,S9,Final
1773143,1.544042e+09,123.07,0.011532,,S9,Final


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [None]:
def preprocess_data(data, output_dir='data/processed'):
    """Clean and prepare the physiological data for analysis.
    
    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data
    output_dir : str
        Directory to save processed data files
        
    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Handle missing values
    # 2. Resample to regular intervals
    # 3. Remove outliers (z-score > 3)
    # 4. Save processed data to CSV files
    
    pass

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.
    
    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files
        
    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Create figure with subplots
    # 2. Plot each physiological signal
    # 3. Add labels and titles
    # 4. Save plot to file
    
    pass