# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [5]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots: apply seaborn's 'darkgrid' theme once, up front,
# so the setting is in place before any figure is created
sns.set_theme(style='darkgrid')  
%matplotlib inline


## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [27]:
def load_data(data_dir='a-wearable-exam-stress-dataset-for-predicting-cognitive-performance-in-real-world-settings-1.0.0/Data'):
    """Load heart-rate, EDA and temperature recordings for every subject and session.

    Expects the layout ``data_dir/<subject>/<session>/{HR,EDA,TEMP}.csv``, where
    subject folders start with ``'S'``. Each signal is given a synthetic time
    axis starting at 2023-01-01, averaged into 1-second bins, and the three
    signals of a session are joined side by side on that time axis.

    Parameters
    ----------
    data_dir : str
        Directory that contains one sub-directory per subject.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamp``, ``heart_rate``, ``eda``, ``temperature``,
        ``subject_id`` and ``session``. Sessions that lack any of the three
        signal files are skipped. If no complete session is found, an empty
        DataFrame with these columns is returned.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` does not exist (raised by ``os.listdir``).
    """
    # (file name, spacing between consecutive samples, output column name)
    signal_specs = [
        ('HR.csv', '1s', 'heart_rate'),
        ('EDA.csv', '250ms', 'eda'),
        ('TEMP.csv', '250ms', 'temperature'),
    ]
    output_columns = (
        ['timestamp']
        + [colname for _, _, colname in signal_specs]
        + ['subject_id', 'session']
    )

    all_data = []

    # os.listdir order is arbitrary; sorting (lexicographically) makes the row
    # order of the result reproducible between runs and machines.
    for subject_id in sorted(os.listdir(data_dir)):
        subject_path = os.path.join(data_dir, subject_id)
        if not os.path.isdir(subject_path) or not subject_id.startswith('S'):
            continue

        for session_name in sorted(os.listdir(subject_path)):
            session_path = os.path.join(subject_path, session_name)
            if not os.path.isdir(session_path):
                continue  # stray files next to the session folders

            session_data = {}
            for signal_file, freq, colname in signal_specs:
                file_path = os.path.join(session_path, signal_file)
                if not os.path.exists(file_path):
                    continue

                # NOTE(review): every row is treated as a sample. If these
                # files begin with metadata rows (e.g. a start time and a
                # sampling rate), those rows are currently read as data --
                # confirm against the dataset documentation.
                df = pd.read_csv(file_path, header=None)
                df.columns = [colname]
                df['timestamp'] = pd.date_range(start='2023-01-01', periods=len(df), freq=freq)
                # Bring every signal onto a common 1-second grid.
                session_data[colname] = df.set_index('timestamp').resample('1s').mean()

            # Keep only sessions for which all signals are available.
            if len(session_data) == len(signal_specs):
                merged = pd.concat(session_data.values(), axis=1).reset_index()
                merged['subject_id'] = subject_id
                merged['session'] = session_name
                all_data.append(merged)

    if not all_data:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected schema instead so callers can still inspect columns.
        return pd.DataFrame(columns=output_columns)

    return pd.concat(all_data, ignore_index=True)

In [28]:
# Call the function and check the columns of the loaded data.
# load_data() is called with its default data_dir, so the dataset folder must
# sit next to this notebook (the default path is relative).
df = load_data()
print(df.columns)

Index(['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id',
       'session'],
      dtype='object')


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

Example output from an earlier run (the save path depends on your environment):

```text
Columns in raw data: Index(['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id',
       'session'],
      dtype='object')
Preprocessed data saved to /workspaces/4-it-s-about-time-rishim3000/data/processed/preprocessed_data.csv
```


In [36]:
def preprocess_data(data, output_dir='/workspaces/4-it-s-about-time-ayabz22/data/processed'):
    """Clean and prepare the physiological data for analysis.

    Rows with a missing value in any signal column are dropped, then rows where
    any signal lies 3 or more standard deviations from that signal's mean
    (|z| >= 3) are removed. The result is written to ``preprocessed_data.csv``
    inside ``output_dir``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``timestamp``, ``heart_rate``, ``eda`` and ``temperature``.
        Any other columns are carried through unchanged. Not modified.
    output_dir : str
        Directory for the CSV file; created if it does not exist.
        NOTE(review): the default is an absolute, machine-specific path; pass
        an explicit directory when running elsewhere.

    Returns
    -------
    pd.DataFrame or None
        An independent copy of the cleaned rows (original index labels kept),
        or None if a required column is missing.
    """
    # Ensure correct columns exist
    required_cols = ['timestamp', 'heart_rate', 'eda', 'temperature']
    missing_cols = [col for col in required_cols if col not in data.columns]

    if missing_cols:
        print(f"Warning: Missing columns: {missing_cols}")
        return None

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    numeric_cols = ['heart_rate', 'eda', 'temperature']

    # stats.zscore propagates NaN by default: a single missing sample would
    # turn a whole column's z-scores into NaN and the `< 3` filter would then
    # discard every row. Drop incomplete rows first so the statistics are
    # computed on real values only.
    complete = data.dropna(subset=numeric_cols)

    if complete.empty:
        data_clean = complete.copy()
    else:
        values = complete[numeric_cols].to_numpy(dtype=float)
        z_scores = np.abs(stats.zscore(values, axis=0))

        # A column with zero variance has an undefined z-score (NaN); it
        # cannot contain outliers, so treat it as all zeros instead of letting
        # NaN reject every row.
        constant = values.min(axis=0) == values.max(axis=0)
        z_scores[:, constant] = 0.0

        # Remove outliers using z-score (threshold = 3); copy so callers can
        # modify the result without touching (or warning about) `data`.
        data_clean = complete[(z_scores < 3).all(axis=1)].copy()

    # Save the cleaned data to CSV
    output_path = os.path.join(output_dir, 'preprocessed_data.csv')
    data_clean.to_csv(output_path, index=False)

    print(f"Preprocessed data saved to {output_path}")

    return data_clean

# Load raw data (calls load_data() again with its default data_dir, so this
# cell does not rely on a frame loaded elsewhere in the notebook)
raw_data = load_data()

# Check for column issues before calling preprocess_data:
# it returns None when a required column is missing
print(f"Columns in raw data: {raw_data.columns}")

# Process the data: removes outliers, writes preprocessed_data.csv, and binds
# the cleaned DataFrame (or None) to processed_data
processed_data = preprocess_data(raw_data)


Columns in raw data: Index(['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id',
       'session'],
      dtype='object')
Preprocessed data saved to /workspaces/4-it-s-about-time-ayabz22/data/processed/preprocessed_data.csv


## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws heart rate, EDA and temperature as three stacked panels sharing one
    time axis, and saves the figure as
    ``<output_dir>/<subject_id>_<session>_signals.png``.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data with columns ``timestamp``,
        ``heart_rate``, ``eda``, ``temperature``, ``subject_id`` and ``session``
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files

    Returns
    -------
    matplotlib.figure.Figure or None
        Figure object containing the plots, or None (after printing a warning)
        when ``data`` has no rows for the requested subject and session
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    selected = data[(data['subject_id'] == subject_id) & (data['session'] == session)]
    if selected.empty:
        print(f"Warning: No data found for subject {subject_id}, session {session}")
        return None

    # Timestamps may arrive as strings (e.g. after a CSV round trip); convert
    # them so matplotlib draws a real time axis, and plot in time order.
    times = pd.to_datetime(selected['timestamp'])
    order = times.argsort()
    times = times.iloc[order]
    selected = selected.iloc[order]

    # (column name, panel label)
    panels = [
        ('heart_rate', 'Heart rate'),
        ('eda', 'EDA'),
        ('temperature', 'Temperature'),
    ]

    # 1. Create figure with subplots
    fig, axes = plt.subplots(len(panels), 1, figsize=(12, 9), sharex=True)

    # 2. Plot each physiological signal / 3. Add labels and titles
    for ax, (column, label) in zip(axes, panels):
        ax.plot(times, selected[column], linewidth=0.8)
        ax.set_ylabel(label)
        ax.set_title(label)
    axes[-1].set_xlabel('Time')
    fig.suptitle(f"Physiological signals: {subject_id}, {session}")
    fig.tight_layout()

    # 4. Save plot to file
    output_path = os.path.join(output_dir, f"{subject_id}_{session}_signals.png")
    fig.savefig(output_path, dpi=150)

    return fig