# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [2]:
def load_data(data_dir='data/raw'):
    """Load and organize the physiological data from the dataset.
    
    Parameters
    ----------
    data_dir : str
        Path to the directory containing the dataset files
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing the organized physiological data with columns:
        ['timestamp', 'heart_rate', 'eda', 'temperature', 'subject_id', 'session']
    """
    # Your code here
    final_df = pd.DataFrame() # initialize final data frame
    dataset_files = ['EDA.csv','HR.csv','TEMP.csv'] # EDA, HR, temp data
    sessions = ['Final', 'midterm_1', 'midterm_2']
    subjects = pd.Series(range(10))+1
    subjects = 'S' + subjects.astype('str')
    for subject in subjects:
        for session in sessions:
            df = pd.DataFrame() # initialize data frame
            for file in dataset_files: # iterate over each data .csv
                path = [data_dir, subject, session, file]
                current = pd.read_csv('/'.join(path), header=None).add_suffix('_'+file).iloc[:, 0] # read data and treat singular column as series
                init_time = current.iloc[0] # first entry is initial time in unix
                freq = current.iloc[1] # second entry is frequency of record, convert from hz to seconds
                current = current.iloc[2:] # drop the init_time and freq entries to leave just data
                final_time = init_time + (len(current)-1)/freq # last time entry is initial time + number of entries (excluding initial time) divided by frequency
                init_time = pd.to_datetime(init_time, unit='s', origin='unix') # convert times
                final_time = pd.to_datetime(final_time, unit='s', origin='unix')
                to_df = pd.DataFrame(current)
                to_df.index = pd.date_range(start=init_time, end=final_time, periods=len(current)) # timestamp index
                if df.empty == True:
                    df = to_df
                else:
                    df = df.join(to_df, how='outer', sort=True) # outer join with full data so far to include all timestamps in index
            df['session'] = session
            df['subject_id'] = subject
            if final_df.empty == True:
                final_df = df
            else:
                final_df = pd.concat([final_df, df], axis=0)
    final_df = final_df.reset_index()
    final_df.columns = ['timestamp', 'eda', 'heart_rate', 'temperature', 'session', 'subject_id']
    return final_df

df = load_data('data/raw/physionet.org/files/wearable-exam-stress/1.0.0/data')
df[(df['subject_id']=='S1') & (df['session']=='Final')]

Unnamed: 0,timestamp,eda,heart_rate,temperature,session,subject_id
0,2018-12-05 16:28:57.000,0.000000,,21.89,Final,S1
1,2018-12-05 16:28:57.250,0.005125,,21.89,Final,S1
2,2018-12-05 16:28:57.500,0.020501,,21.89,Final,S1
3,2018-12-05 16:28:57.750,0.021783,,21.89,Final,S1
4,2018-12-05 16:28:58.000,0.023064,,21.89,Final,S1
...,...,...,...,...,...,...
93580,2018-12-05 22:58:52.000,0.025627,121.95,26.67,Final,S1
93581,2018-12-05 22:58:52.250,0.026908,,26.67,Final,S1
93582,2018-12-05 22:58:52.500,,,26.67,Final,S1
93583,2018-12-05 22:58:52.750,,,26.67,Final,S1


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [3]:
def preprocess_data(data, output_dir='data/processed'):
    """Clean and prepare the physiological data for analysis.
    
    Parameters
    ----------
    data : pd.DataFrame
        Raw physiological data
    output_dir : str
        Directory to save processed data files
        
    Returns
    -------
    pd.DataFrame
        Cleaned and preprocessed data
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Handle missing values
    # 2. Resample to regular intervals
    # 3. Remove outliers (z-score > 3.5)
    # 4. Save processed data to CSV files

    final_df = pd.DataFrame()
    subjects = pd.Series(range(10))+1
    subjects = 'S' + subjects.astype('str')
    sessions = ['Final', 'midterm_1', 'midterm_2']
    vars = ['eda', 'heart_rate', 'temperature']

    # remove missing rows with NAs
    df = data.dropna()

    # process data for each subject separately to save subject data separately
    for subject in subjects:
        # process each session separately for resampling
        subj_df = pd.DataFrame() # initialize subject dataframe
        for session in sessions:
            subj_sess = df[(df['subject_id']==subject) & (df['session']==session)]
            subj_sess = subj_sess.set_index('timestamp')
            subj_sess = subj_sess[vars] # drop subject and session columns after selection
            subj_sess = subj_sess.resample('1s').mean() # resample down to 1s for all variables
            subj_sess = subj_sess.reset_index(drop=False) # return timestamp index to column
            subj_sess['subject_id'] = subject
            subj_sess['session'] = session
            if subj_df.empty==True:
                subj_df = subj_sess
            else:
                subj_df = pd.concat([subj_df, subj_sess], axis=0)
        subj_df.to_csv(output_dir + '/' + subject + '_processed.csv') # output each subject df to its own file
        if final_df.empty==True:
            final_df = subj_df
        else:
            final_df = pd.concat([final_df, subj_df], axis=0) # concat subject dfs for output
    return final_df

df_preprocess = preprocess_data(data=df)
df_preprocess['heart_rate']

0        116.00
1         82.50
2         96.33
3         86.25
4         98.60
          ...  
12977    115.07
12978    116.13
12979    116.07
12980    116.47
12981    116.42
Name: heart_rate, Length: 442945, dtype: float64

## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [4]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    """Create plots of physiological signals for a given subject and session.
    
    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    output_dir : str
        Directory to save plot files
        
    Returns
    -------
    matplotlib.figure.Figure
        Figure object containing the plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Your code here
    # 1. Create figure with subplots
    # 2. Plot each physiological signal
    # 3. Add labels and titles
    # 4. Save plot to file
    
    df = data[(data['subject_id']==subject_id) & (data['session']==session)]
    fig, axs = plt.subplots(3, 1, figsize=(10,8))

    axs[0].plot(df['timestamp'], df['eda'])
    axs[0].set_ylabel('EDA (uS)')

    axs[1].plot(df['timestamp'], df['heart_rate'])
    axs[1].set_ylabel('HR (bpm)')

    axs[2].plot(df['timestamp'], df['temperature'])
    axs[2].set_xlabel('Time')
    axs[2].set_ylabel('Temperature (C)')

    fig.suptitle(f'Physiological Signals for Subject {subject_id}, {session}')
    fig.savefig(output_dir + '/' + subject_id + '_' + session + '_signals.png')
    plt.close(fig)

sessions = ['Final', 'midterm_1', 'midterm_2']
subjects = pd.Series(range(10))+1
subjects = 'S' + subjects.astype('str')

for session in sessions:
    for subject in subjects:
        plot_physiological_signals(df_preprocess, subject, session)