# Part 1: Data Exploration and Preprocessing

In this notebook, you will implement functions to load, preprocess, and visualize physiological data from the Wearable Exam Stress Dataset.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import os

# Set style for plots
plt.style.use('seaborn')
%matplotlib inline

## 1. Data Loading

Implement the `load_data` function to read and organize the physiological data from the dataset.

In [52]:
def load_data(data_dir='data'):
    all_data = []
    
    subject_dirs = [d for d in Path(data_dir).iterdir() if d.is_dir() and d.name.startswith('S')]

    # Read grades file with proper encoding
    grades_path = Path(data_dir) / 'StudentGrades.txt'
    
    # Common encodings to try
    encodings = ['utf-8', 'latin1', 'cp1252', 'ISO-8859-1']
    grades = None
    
    for encoding in encodings:
        try:
            grades = pd.read_csv(grades_path, sep='\t', encoding=encoding)
            print(f"Successfully read grades file using {encoding} encoding")
            break
        except UnicodeDecodeError:
            print(f"Failed to read with {encoding} encoding, trying another...")
    
    if grades is None:
        print("Could not read grades file with any of the attempted encodings.")

    for subject_dir in subject_dirs:
        subject_id = subject_dir.name
        
        # process each session (Midterm1, Midterm2, Final)
        for session_name in ['Midterm 1', 'Midterm 2', 'Final']:
            session_dir = subject_dir / session_name
            
            if not session_dir.exists():
                continue
                
            # load heart rate data (BVP.csv)
            bvp_file = session_dir / 'BVP.csv'
            if bvp_file.exists():
                bvp_data = pd.read_csv(bvp_file, header=None, names=['bvp'])
                bvp_data['timestamp'] = pd.date_range(
                    start='2023-01-01', 
                    periods=len(bvp_data), 
                    freq='64ms'  # BVP sampled at 64 Hz
                )
                
            # load heart rate data (HR.csv)
            hr_file = session_dir / 'HR.csv'
            if hr_file.exists():
                hr_data = pd.read_csv(hr_file, header=None, names=['heart_rate'])
                hr_data['timestamp'] = pd.date_range(
                    start='2023-01-01', 
                    periods=len(hr_data), 
                    freq='1s' 
                )
                
            # load EDA data
            eda_file = session_dir / 'EDA.csv'
            if eda_file.exists():
                eda_data = pd.read_csv(eda_file, header=None, names=['eda'])
                eda_data['timestamp'] = pd.date_range(
                    start='2023-01-01', 
                    periods=len(eda_data), 
                    freq='250ms'  
                )
                
            # load temperature data
            temp_file = session_dir / 'TEMP.csv'
            if temp_file.exists():
                temp_data = pd.read_csv(temp_file, header=None, names=['temperature'])
                temp_data['timestamp'] = pd.date_range(
                    start='2023-01-01', 
                    periods=len(temp_data), 
                    freq='250ms' 
                )
                
            # resample all data to the same frequency
            hr_resampled = hr_data.set_index('timestamp')
            eda_resampled = eda_data.set_index('timestamp').resample('1s').mean()
            temp_resampled = temp_data.set_index('timestamp').resample('1s').mean()
            
            # merge data into a single DataFrame
            session_data = pd.concat([
                hr_resampled, 
                eda_resampled,
                temp_resampled
            ], axis=1).reset_index()
            
            # add subject and session information
            session_data['subject_id'] = subject_id
            session_data['session'] = session_name
            
            all_data.append(session_data)
    
    combined_data = pd.concat(all_data, ignore_index=True)
    
    return combined_data

In [None]:
# data loading check
data_dir = 'data'
data_path = Path(data_dir)

# check if the data directory exists
print(f"Data directory exists: {data_path.exists()}")

# check for student directories
subject_dirs = [d for d in data_path.iterdir() if d.is_dir() and d.name.startswith('S')]
print(f"Found {len(subject_dirs)} subject directories: {[d.name for d in subject_dirs]}")

# check if grades file exists
grades_path = data_path / 'StudentGrades.txt'
print(f"Grades file exists: {grades_path.exists()}")

# check one subject directory structure
if subject_dirs:
    sample_subject = subject_dirs[0]
    print(f"\nChecking structure for {sample_subject.name}:")
    sessions = [d.name for d in sample_subject.iterdir() if d.is_dir()]
    print(f"  Sessions found: {sessions}")
    
    # check files in first session
    if sessions:
        first_session = sample_subject / sessions[0]
        files = [f.name for f in first_session.iterdir() if f.is_file()]
        print(f"  Files in {sessions[0]}: {files}")

Data directory exists: True
Found 10 subject directories: ['S1', 'S10', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
Grades file exists: True

Checking structure for S1:
  Sessions found: ['Final', 'Midterm 1', 'Midterm 2']
  Files in Final: ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'info.txt', 'tags.csv', 'TEMP.csv']


## 2. Data Preprocessing

Implement the `preprocess_data` function to clean and prepare the data for analysis.

In [55]:
def preprocess_data(data, output_dir='data/processed'):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Handle missing values
    # Use ffill() and bfill() instead of fillna(method='ffill')
    processed_data = data.copy()
    processed_data = processed_data.ffill().bfill()  # Updated to remove warning
    
    # Remove outliers (z-score > 3) for each numerical column
    for col in ['heart_rate', 'eda', 'temperature']:
        if col in processed_data.columns:
            # Calculate z-scores
            z_scores = stats.zscore(processed_data[col], nan_policy='omit')
            # Identify outliers
            outliers = np.abs(z_scores) > 3
            # Replace outliers with NaN, then interpolate
            processed_data.loc[outliers, col] = np.nan
            processed_data[col] = processed_data[col].interpolate(method='linear')
    
    # Normalize data for better comparison
    for col in ['heart_rate', 'eda', 'temperature']:
        if col in processed_data.columns:
            # Min-max normalization
            min_val = processed_data[col].min()
            max_val = processed_data[col].max()
            processed_data[f'{col}_normalized'] = (processed_data[col] - min_val) / (max_val - min_val)
    
    # Save processed data by subject and session
    for subject in processed_data['subject_id'].unique():
        for session in processed_data['session'].unique():
            subset = processed_data[
                (processed_data['subject_id'] == subject) & 
                (processed_data['session'] == session)
            ]
            
            if not subset.empty:
                filename = f"{subject}_{session}_processed.csv"
                subset.to_csv(f"{output_dir}/{filename}", index=False)
    
    # Save the complete processed dataset
    processed_data.to_csv(f"{output_dir}/all_processed.csv", index=False)
    
    return processed_data

In [None]:
# data preprocessing check
physio_data = load_data(data_dir='data')

# Check the loaded data shape
print(f"Loaded data with shape: {physio_data.shape}")
print("Sample of loaded data:")
print(physio_data.head())
# Process the data
processed_data = preprocess_data(physio_data)

print(f"Preprocessing complete. Processed data shape: {processed_data.shape}")
# Check if the processed data was created successfully
processed_dir = Path('data/processed')

# Check if directory exists
print(f"Processed directory exists: {processed_dir.exists()}")

# List processed files
if processed_dir.exists():
    processed_files = list(processed_dir.glob('*.csv'))
    print(f"\nFound {len(processed_files)} processed files:")
    
    # Group files by type
    all_processed = [f for f in processed_files if f.name.startswith('all_')]
    subject_files = [f for f in processed_files if not f.name.startswith('all_')]
    
    # Output details about the files found
    if processed_files:
        print(f"\nProcessed files created successfully!")
    else:
        print("No processed files were created. Check for errors in the preprocessing function.")

Failed to read with utf-8 encoding, trying another...
Successfully read grades file using latin1 encoding
Loaded data with shape: (443307, 6)
Sample of loaded data:
            timestamp    heart_rate           eda   temperature subject_id  \
0 2023-01-01 00:00:00  1.539435e+09  3.848588e+08  3.848589e+08         S1   
1 2023-01-01 00:00:01  1.000000e+00  2.210425e-02  2.251000e+01         S1   
2 2023-01-01 00:00:02  8.400000e+01  2.242450e-02  2.251000e+01         S1   
3 2023-01-01 00:00:03  8.500000e+01  2.338575e-02  2.251000e+01         S1   
4 2023-01-01 00:00:04  8.600000e+01  2.306525e-02  2.250000e+01         S1   

     session  
0  Midterm 1  
1  Midterm 1  
2  Midterm 1  
3  Midterm 1  
4  Midterm 1  
Preprocessing complete. Processed data shape: (443307, 9)
Processed directory exists: True

Found 31 processed files:

Processed files created successfully!


## 3. Visualization

Implement the `plot_physiological_signals` function to create visualizations of the physiological data.

In [None]:
def plot_physiological_signals(data, subject_id, session, output_dir='plots'):
    # create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    # filter data for specific subject and session
    subject_data = data[
        (data['subject_id'] == subject_id) & 
        (data['session'] == session)
    ]
    
    if subject_data.empty:
        print(f"No data found for subject {subject_id} in session {session}")
        return None
    
    # create figure with subplots for each physiological signal
    fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
    
    # plot heart rate
    if 'heart_rate' in subject_data.columns:
        axes[0].plot(subject_data['timestamp'], subject_data['heart_rate'], 'r-')
        axes[0].set_title(f'Heart Rate - Subject {subject_id} - {session}')
        axes[0].set_ylabel('Heart Rate (BPM)')
        axes[0].grid(True)
    
    # plot EDA
    if 'eda' in subject_data.columns:
        axes[1].plot(subject_data['timestamp'], subject_data['eda'], 'b-')
        axes[1].set_title(f'Electrodermal Activity - Subject {subject_id} - {session}')
        axes[1].set_ylabel('EDA (microsiemens)')
        axes[1].grid(True)
    
    # plot skin temperature
    if 'temperature' in subject_data.columns:
        axes[2].plot(subject_data['timestamp'], subject_data['temperature'], 'g-')
        axes[2].set_title(f'Skin Temperature - Subject {subject_id} - {session}')
        axes[2].set_ylabel('Temperature (°C)')
        axes[2].set_xlabel('Time')
        axes[2].grid(True)
    
    # add title
    plt.suptitle(f'Physiological Signals - Subject {subject_id} - {session}', fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.92) 
    
    fig_path = f"{output_dir}/S{subject_id}_{session}_signals.png"
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    print(f"Figure saved to {fig_path}")
    
    return fig

In [None]:
# function to check and create visualization plots for all subjects

def create_all_subject_plots(processed_data, output_dir='plots'):
    # create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # get unique subjects and sessions
    subjects = processed_data['subject_id'].unique()
    
    # track successful plots
    successful_plots = 0
    failed_plots = 0
    
    # create a DataFrame to store plot status
    plot_status = []
    
    # loop through all subjects
    for subject in subjects:
        # get available sessions for this subject
        sessions = processed_data[processed_data['subject_id'] == subject]['session'].unique()
        
        print(f"Processing Subject {subject}: {len(sessions)} sessions found")
        
        # loop through each available session for this subject
        for session in sessions:
            try:
                # create the plot
                fig = plot_physiological_signals(processed_data, subject, session, output_dir)
                
                plt.close(fig)
                
                successful_plots += 1
                status = "Success"
            except Exception as e:
                print(f"Error creating plot for Subject {subject}, Session {session}: {e}")
                failed_plots += 1
                status = f"Failed: {str(e)}"
            
            # record status
            plot_status.append({
                'subject_id': subject,
                'session': session,
                'status': status
            })
    
    # create a summary DataFrame
    status_df = pd.DataFrame(plot_status)
    
    status_df.to_csv(f"{output_dir}/plot_status_report.csv", index=False)
    
    print(f"\nPlot generation complete:")
    print(f"- Successfully created: {successful_plots} plots")
    print(f"- Failed: {failed_plots} plots")
    print(f"- Status report saved to: {output_dir}/plot_status_report.csv")
    
    return successful_plots

total_plots = create_all_subject_plots(processed_data)

Processing Subject S1: 3 sessions found
Figure saved to plots/SS1_Midterm 1_signals.png
Figure saved to plots/SS1_Midterm 2_signals.png
Figure saved to plots/SS1_Final_signals.png
Processing Subject S10: 3 sessions found
Figure saved to plots/SS10_Midterm 1_signals.png
Figure saved to plots/SS10_Midterm 2_signals.png
Figure saved to plots/SS10_Final_signals.png
Processing Subject S2: 3 sessions found
Figure saved to plots/SS2_Midterm 1_signals.png
Figure saved to plots/SS2_Midterm 2_signals.png
Figure saved to plots/SS2_Final_signals.png
Processing Subject S3: 3 sessions found
Figure saved to plots/SS3_Midterm 1_signals.png
Figure saved to plots/SS3_Midterm 2_signals.png
Figure saved to plots/SS3_Final_signals.png
Processing Subject S4: 3 sessions found
Figure saved to plots/SS4_Midterm 1_signals.png
Figure saved to plots/SS4_Midterm 2_signals.png
Figure saved to plots/SS4_Final_signals.png
Processing Subject S5: 3 sessions found
Figure saved to plots/SS5_Midterm 1_signals.png
Figure s