# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots: sns.set() applies seaborn's default theme to the
# global matplotlib settings, so figures created later in the notebook use it
sns.set()

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [None]:
from pathlib import Path
import pandas as pd

def _read_signal(csv_path, value_name, freq='1s'):
    """Read one signal CSV and return it averaged onto a regular time grid.

    The file is parsed as two header-less columns: epoch seconds and the
    measured value.
    NOTE(review): this two-column layout is taken from the original parsing
    code; confirm it matches the raw export format of the dataset.

    Parameters
    ----------
    csv_path : pathlib.Path
        Location of the CSV file.
    value_name : str
        Column name to give the measured value.
    freq : str
        Resampling frequency as a pandas offset alias.

    Returns
    -------
    pd.DataFrame
        Single-column frame indexed by a regular ``timestamp`` grid; grid
        points without any sample hold NaN.

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` does not exist.
    """
    frame = pd.read_csv(csv_path, header=None, names=['timestamp', value_name])
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='s')

    # Collapse rows sharing a timestamp first so duplicates are not counted twice
    deduplicated = frame.groupby('timestamp').mean()

    # Lowercase alias on purpose: the uppercase 'S' spelling is deprecated
    return deduplicated.resample(freq).mean()


def load_data(data_dir='data/raw'):
    """Load and organize the physiological data from the dataset.

    Expects ``data_dir/<subject>/<session>/{HR,EDA,TEMP}.csv`` where subject
    folders start with ``S``. Every signal is averaged onto a common 1-second
    grid and the three signals of a session are joined on that grid, using
    the EDA time range as the reference.

    Parameters
    ----------
    data_dir : str or pathlib.Path
        Root directory of the raw data; ``~`` is expanded.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamp``, ``heart_rate``, ``eda``, ``temperature``,
        ``subject_id`` and ``session``. Empty (with those columns) when no
        complete session is found.
    """
    data_dir = Path(data_dir).expanduser()
    columns = ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    all_records = []

    # Keep directories only: a stray file matching 'S*' must not be iterated
    subject_dirs = sorted(p for p in data_dir.glob('S*') if p.is_dir())

    for subject_path in subject_dirs:
        subject_id = subject_path.name  # 'S1', 'S2', etc.

        # Sorted so the row order of the result is reproducible across runs
        session_dirs = sorted(p for p in subject_path.iterdir() if p.is_dir())

        for session_path in session_dirs:
            session_name = session_path.name

            try:
                hr_df = _read_signal(session_path / 'HR.csv', 'heart_rate')
                eda_df = _read_signal(session_path / 'EDA.csv', 'eda')
                temp_df = _read_signal(session_path / 'TEMP.csv', 'temperature')
            except FileNotFoundError as e:
                print(f"Warning: Missing file for {subject_id} {session_name}: {e}")
                continue

            # All three frames now share the 1-second grid, so an index join
            # lines them up even when the raw samples were not on whole seconds
            merged = (
                eda_df
                .merge(hr_df, left_index=True, right_index=True, how='left')
                .merge(temp_df, left_index=True, right_index=True, how='left')
                .reset_index()
            )
            merged['subject_id'] = subject_id
            merged['session'] = session_name

            all_records.append(merged)

    if all_records:
        final_df = pd.concat(all_records, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=columns)

    # Reorder columns
    return final_df[columns]

# Load every recording from the local dataset folder, then preview the first
# rows and the column index of the combined frame.
df = load_data(data_dir='~/Documents/4-it-s-about-time-kanting6/data')
print(df.head(), df.columns, sep='\n')

  eda_df = eda_df.resample('1S').mean()


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore

def preprocess_data(data, output_dir='data/processed', z_threshold=3.5):
    """Clean and prepare the physiological data for analysis.

    Steps: impute missing signal values within each recording, drop rows
    whose signals are z-score outliers, and write one CSV per subject.
    The input frame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data with columns ``timestamp``, ``heart_rate``,
        ``eda``, ``temperature``, ``subject_id`` and ``session``.
    output_dir : str
        Directory to save processed data files
    z_threshold : float
        Rows with an absolute z-score at or above this value in any signal
        column are removed. Z-scores are computed over the whole frame.

    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data, sorted by subject, session, timestamp
    """
    signal_cols = ['heart_rate', 'eda', 'temperature']
    group_keys = ['subject_id', 'session']

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # 1. Handle missing values
    # More than 1% missing in any column is reported; nothing is dropped here,
    # the gaps are imputed below.
    missing_rate = data.isna().mean()
    if missing_rate.max() > 0.01:
        print("Warning: Some columns have >1% missing values. "
              "Imputing them with forward/backward fill within each session.")

    # 2. Ensure chronological order inside every recording.
    # sort_values returns a new frame, so the caller's data stays untouched.
    data = data.sort_values(group_keys + ['timestamp'])

    if not data.empty:
        # Fill forward, then backward, but only inside one (subject, session)
        # recording so values never leak from one recording into the next.
        data[signal_cols] = data.groupby(group_keys, sort=False)[signal_cols].ffill()
        data[signal_cols] = data.groupby(group_keys, sort=False)[signal_cols].bfill()

        # 3. Remove outliers based on z-score threshold.
        # nan_policy='omit' keeps a leftover NaN (a recording missing a whole
        # signal) from turning the entire column's z-scores into NaN, which
        # would silently disable outlier removal for that column.
        values = data[signal_cols].to_numpy(dtype=float)
        z = np.abs(zscore(values, axis=0, nan_policy='omit'))
        keep = ((z < z_threshold) | np.isnan(z)).all(axis=1)  # NaN rows are kept
        data = data[keep]

    # 4. Save processed data, one file per subject
    for subject in data['subject_id'].unique():
        subject_data = data[data['subject_id'] == subject]
        save_path = os.path.join(output_dir, f'{subject}_processed.csv')
        subject_data.to_csv(save_path, index=False)

    return data

# Read the raw recordings, clean them, and write the per-subject CSV files;
# then preview the first rows of the cleaned frame.
raw_data = load_data(data_dir='~/Documents/4-it-s-about-time-kanting6/data')
processed_data = preprocess_data(data=raw_data, output_dir='data/processed')

print(processed_data.head())

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
import os
import matplotlib.pyplot as plt

def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws heart rate, EDA and skin temperature as three stacked panels that
    share the time axis, and saves the figure as
    ``<subject_id>_<session with spaces as underscores>_signals.png``.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files

    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots

    Raises
    ------
    ValueError
        If ``data`` has no rows for the requested subject and session.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Filter data for the specific subject and session
    subset = data[(data['subject_id'] == subject_id) & (data['session'] == session)]

    if subset.empty:
        raise ValueError(f"No data found for subject {subject_id} and session {session}")

    # One entry per panel: column, y-axis label, title suffix, line style
    panels = [
        ('heart_rate', 'Heart Rate (bpm)', 'Heart Rate', {'label': 'Heart Rate'}),
        ('eda', 'EDA (µS)', 'Electrodermal Activity', {'label': 'EDA', 'color': 'orange'}),
        ('temperature', 'Temperature (°C)', 'Skin Temperature', {'label': 'Temperature', 'color': 'green'}),
    ]

    # Create figure with 3 subplots
    fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)

    for ax, (column, ylabel, title_suffix, line_style) in zip(axes, panels):
        ax.plot(subset['timestamp'], subset[column], **line_style)
        ax.set_ylabel(ylabel)
        ax.set_title(f'Subject {subject_id} - {session} - {title_suffix}')
        ax.grid(True)

    # Only the bottom panel carries the shared x-axis label
    axes[2].set_xlabel('Time')

    # Adjust layout on this figure explicitly rather than pyplot's current one
    fig.tight_layout()

    # Save figure. subject_id already carries its 'S' prefix (e.g. 'S1'), so
    # it is used as-is; prepending another 'S' produced 'SS1_...png'.
    filename = f"{subject_id}_{session.replace(' ', '_')}_signals.png"
    filepath = os.path.join(output_dir, filename)
    fig.savefig(filepath)

    print(f"Plot saved to {filepath}")

    return fig

# Rebuild the cleaned dataset, then draw and save the three-panel figure for
# subject S1's first midterm.
raw_data = load_data(data_dir='~/Documents/4-it-s-about-time-kanting6/data')
processed_data = preprocess_data(data=raw_data, output_dir='data/processed')

fig = plot_physiological_signals(
    processed_data,
    subject_id='S1',
    session='Midterm 1',
    output_dir='plots',
)
plt.show()
