# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [9]:
def load_data(data_dir='data/raw', subjects=None, sessions=None):
    """Load and organize the physiological data from the dataset.

    Every ``<data_dir>/<subject>/<session>/{EDA,HR,TEMP}.csv`` file is read as
    a single column that is interpreted as: first value = recording start
    (Unix seconds), second value = sampling rate (Hz), remaining values =
    samples. Each signal is placed on a timestamp index, averaged into
    1-second bins, and the three signals of a session are outer-joined so
    that every timestamp seen by any signal is kept (missing signals are NaN).

    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files
    subjects : iterable of str, optional
        Subject directory names to load. Defaults to 'S1' ... 'S10'.
    sessions : iterable of str, optional
        Session directory names to load for each subject. Defaults to
        ['Final', 'midterm_1', 'midterm_2'].

    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns
        (in this order):
        ['timestamp', 'eda', 'heart_rate', 'temperature', 'session', 'subject_id']

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` if an expected signal file is missing.
    """
    # File name -> output column. Naming the column at read time avoids
    # relabelling the final frame by position, which would silently mislabel
    # signals if the file order ever changed.
    signal_columns = {
        'EDA.csv': 'eda',
        'HR.csv': 'heart_rate',
        'TEMP.csv': 'temperature',
    }
    if sessions is None:
        sessions = ['Final', 'midterm_1', 'midterm_2']
    if subjects is None:
        subjects = [f'S{number}' for number in range(1, 11)]
    root = Path(data_dir)

    frames = []
    for subject in subjects:
        for session in sessions:
            resampled = []
            for file_name, column in signal_columns.items():
                raw = pd.read_csv(root / subject / session / file_name, header=None).iloc[:, 0]
                start_unix = raw.iloc[0]  # first entry: recording start in Unix seconds
                rate_hz = raw.iloc[1]     # second entry: sampling rate in Hz
                samples = raw.iloc[2:]    # everything after the two header entries
                # The last sample lies (n - 1) sampling periods after the first one.
                end_unix = start_unix + (len(samples) - 1) / rate_hz
                index = pd.date_range(
                    start=pd.to_datetime(start_unix, unit='s', origin='unix'),
                    end=pd.to_datetime(end_unix, unit='s', origin='unix'),
                    periods=len(samples),
                )
                signal = pd.Series(samples.to_numpy(), index=index, name=column)
                # Signals are recorded at different rates; 1-second means put
                # them on a common grid.
                resampled.append(signal.resample('1s').mean())
            # Outer alignment keeps every timestamp any signal covers.
            session_df = pd.concat(resampled, axis=1, sort=True)
            session_df['session'] = session
            session_df['subject_id'] = subject
            frames.append(session_df)

    columns = ['timestamp', *signal_columns.values(), 'session', 'subject_id']
    if not frames:
        # Nothing requested: return an empty frame with the documented schema.
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end instead of re-copying the result every iteration.
    combined = pd.concat(frames, axis=0).rename_axis('timestamp').reset_index()
    return combined[columns]

# Read every subject/session under the PhysioNet download into one frame and
# show it (the notebook truncates the display of long frames).
df = load_data(data_dir='data/raw/physionet.org/files/wearable-exam-stress/1.0.0/data')
df

Unnamed: 0,timestamp,eda,heart_rate,temperature,session,subject_id
0,2018-12-05 16:28:57,0.011852,,21.89,Final,S1
1,2018-12-05 16:28:58,0.023064,,21.89,Final,S1
2,2018-12-05 16:28:59,0.023704,,21.91,Final,S1
3,2018-12-05 16:29:00,0.023384,,21.91,Final,S1
4,2018-12-05 16:29:01,0.023704,,21.89,Final,S1
...,...,...,...,...,...,...
443297,2018-11-10 17:55:12,0.020820,115.07,23.05,midterm_2,S10
443298,2018-11-10 17:55:13,0.020180,116.13,22.99,midterm_2,S10
443299,2018-11-10 17:55:14,0.021140,116.07,22.99,midterm_2,S10
443300,2018-11-10 17:55:15,0.020500,116.47,22.95,midterm_2,S10


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [10]:
def preprocess_data(data, output_dir='data/processed', subjects=None, z_threshold=3.5):
    """Clean and prepare the physiological data for analysis.

    For each subject, rows containing any missing value are dropped, then
    outliers are removed one signal at a time ('eda', then 'heart_rate', then
    'temperature'): a row is kept only if the signal lies within
    ``z_threshold`` sample standard deviations of the subject's mean. The mean
    and standard deviation of each signal are computed on the rows that
    survived the previous signal's filter. The cleaned rows of every subject
    are written to ``<output_dir>/<subject>_processed.csv``.

    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data with at least the columns
        'eda', 'heart_rate', 'temperature' and 'subject_id'
    output_dir : str
        Directory to save processed data files
    subjects : iterable of str, optional
        Subject identifiers to process. Defaults to 'S1' ... 'S10'.
    z_threshold : float, optional
        Maximum absolute z-score a value may have to be kept. Defaults to 3.5.

    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data for all processed subjects, keeping the
        row index of ``data``
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    out_root = Path(output_dir)

    if subjects is None:
        subjects = [f'S{number}' for number in range(1, 11)]
    signal_columns = ['eda', 'heart_rate', 'temperature']

    cleaned_frames = []
    for subject in subjects:
        # Missing values only remain where the signals do not overlap in time,
        # so dropping them trims each recording to the common window.
        subject_df = data[data['subject_id'] == subject].dropna()
        for column in signal_columns:
            center = subject_df[column].mean()
            spread = subject_df[column].std()
            # `between` is inclusive on both ends; NaN bounds (empty or
            # single-row input) keep nothing, as plain comparisons would.
            within = subject_df[column].between(
                center - z_threshold * spread,
                center + z_threshold * spread,
            )
            subject_df = subject_df[within]
        subject_df.to_csv(out_root / f'{subject}_processed.csv')
        cleaned_frames.append(subject_df)

    # Concatenate once at the end instead of re-copying the result every iteration.
    non_empty = [frame for frame in cleaned_frames if not frame.empty]
    if not non_empty:
        # No rows survived (or no subjects requested): empty frame, same schema.
        return data.iloc[0:0].copy()
    return pd.concat(non_empty, axis=0)

# Clean the loaded frame and show the result. Note that this rebinds `df`, so
# re-running this cell filters data that has already been processed.
df = preprocess_data(df)
df

Unnamed: 0,timestamp,eda,heart_rate,temperature,session,subject_id
10,2018-12-05 16:29:07,0.023704,116.00,21.91,Final,S1
11,2018-12-05 16:29:08,0.023384,82.50,21.93,Final,S1
12,2018-12-05 16:29:09,0.023384,96.33,21.93,Final,S1
13,2018-12-05 16:29:10,0.023064,86.25,21.91,Final,S1
14,2018-12-05 16:29:11,0.023384,98.60,21.91,Final,S1
...,...,...,...,...,...,...
443297,2018-11-10 17:55:12,0.020820,115.07,23.05,midterm_2,S10
443298,2018-11-10 17:55:13,0.020180,116.13,22.99,midterm_2,S10
443299,2018-11-10 17:55:14,0.021140,116.07,22.99,midterm_2,S10
443300,2018-11-10 17:55:15,0.020500,116.47,22.95,midterm_2,S10


## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [23]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.

    Draws EDA, heart rate and temperature against time in three stacked
    panels and saves the figure to
    ``<output_dir>/<subject_id>_<session>_signals.png``.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data with columns 'timestamp', 'eda',
        'heart_rate', 'temperature', 'subject_id' and 'session'
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier exactly as stored in ``data['session']``
        (e.g., 'midterm_1')
    output_dir : str
        Directory to save plot files

    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots. The figure has already been
        closed in pyplot, so it is not displayed automatically.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    selection = data[(data['subject_id'] == subject_id) & (data['session'] == session)]
    fig, axs = plt.subplots(3, 1, figsize=(10, 8))

    # (column, y-axis label) for each panel, top to bottom.
    panels = [
        ('eda', 'EDA (uS)'),
        ('heart_rate', 'HR (bpm)'),
        ('temperature', 'Temperature (C)'),
    ]
    for ax, (column, label) in zip(axs, panels):
        ax.plot(selection['timestamp'], selection[column])
        ax.set_ylabel(label)
    axs[-1].set_xlabel('Time')  # only the bottom panel carries the x label

    fig.suptitle(f'Physiological Signals for Subject {subject_id}, {session}')
    fig.savefig(os.path.join(output_dir, f'{subject_id}_{session}_signals.png'))
    # Close so that calling this in a loop does not accumulate open figures.
    plt.close(fig)
    # Return the figure as documented; previously the function returned None.
    return fig

# Save one signal plot per (session, subject) pair, going through every
# subject of a session before moving on to the next session.
sessions = ['Final', 'midterm_1', 'midterm_2']
subjects = 'S' + pd.Series(range(1, 11)).astype('str')

for session, subject in [(sess, subj) for sess in sessions for subj in subjects]:
    plot_physiological_signals(df, subject, session)