# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots
plt.style.use('seaborn')
%matplotlib inline

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [None]:
def load_data(data_dir='data/raw/physionet.org/files/wearable-exam-stress/1.0.0/data'): 
    """Load and organize the physiological data from the dataset.
    
    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    """
    data = []

    for student in os.listdir(data_dir):
        if student.startswith("S"):

            for folder in ['Final', 'midterm_1', 'midterm_2']:

                    try:
                    
                        path = os.path.join(data_dir, student, folder)

                        #new function so I can open each file and filter out first 2 values (start time and rate)
                        #so I can then calcualte timepoints from them for the rest of the values
                        def load_file(file_name, col_name):
                            file_path = os.path.join(path, file_name)

                            with open(file_path, 'r') as f:
                                lines = f.readlines()

                            start_time = float(lines[0].strip())
                            sample_rate = float(lines[1].strip())

                            values = [float(line.strip()) for line in lines[2:]]
                            timestamps = [start_time + i / sample_rate for i in range(len(values))]

                            return pd.DataFrame({'timestamp': timestamps, col_name: values})

                        hr_df = load_file('HR.csv', 'heart_rate')
                        eda_df = load_file('EDA.csv', 'eda')
                        temp_df = load_file('TEMP.csv', 'temperature')

                        # Merge on timestamp
                        df = hr_df.merge(eda_df, on='timestamp').merge(temp_df, on='timestamp')
                        df['subject_id'] = student
                        df['session'] = folder

                        data.append(df)


                    except Exception as e:
                        print(f"Error loading {student} {folder}: {e}") #um don't crash plz

    return pd.concat(data).head()


load_data()

Unnamed: 0,timestamp,heart_rate,eda,temperature,subject_id,session
0,1544027000.0,118.0,0.0,21.65,S5,Final
1,1544027000.0,84.5,0.0,21.65,S5,Final
2,1544027000.0,97.67,0.0,21.67,S5,Final
3,1544027000.0,85.25,0.0,21.65,S5,Final
4,1544027000.0,97.8,0.0,21.65,S5,Final


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [None]:
def preprocess_data(data, output_dir='data/processed'):
    """Clean and prepare the physiological data for analysis.
    
    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data
    output_dir : str
        Directory to save processed data files
        
    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Handle missing values
    # 2. Resample to regular intervals
    # 3. Remove outliers (z-score > 3)
    # 4. Save processed data to CSV files
    
    pass

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.
    
    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files
        
    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Create figure with subplots
    # 2. Plot each physiological signal
    # 3. Add labels and titles
    # 4. Save plot to file
    
    pass