# Time-Series Forecasting of Engagement Levels - Data Preprocessing

## Problem Statement
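Predict future engagement levels from past gaze, EEG, and GSR signals using sequence models. Each sliding window of past multimodal signals is paired with a future engagement value (sequence-to-value prediction, see the pipeline below).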

## Data Overview
- **Target**: Engagement levels (continuous score from ENG.csv)
- **Inputs**: 
  - EEG.csv: Brainwave bands (Delta, Theta, Alpha, Beta, Gamma)
  - EYE.csv + IVT.csv: Eye-tracking signals (fixation, saccade, pupil)
  - GSR.csv: Skin conductance/resistance values

## Preprocessing Pipeline
1. Data synchronization and timestamp alignment (including resampling every signal to a common 1 Hz grid)
2. Feature extraction from multimodal signals
3. Sliding window approach for sequence-to-value prediction
4. Normalization and label preparation


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation/dtype warnings emitted by the
# libraries used in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up plotting style
# NOTE(review): 'seaborn-v0_8' is presumably only known to newer Matplotlib
# releases — confirm against the pinned Matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Set random seed for reproducibility
# (this seeds NumPy's global random state only)
np.random.seed(42)

print("Libraries imported successfully!")


## 1. Data Loading and Initial Exploration


In [None]:
# Load all data files
# Each CSV is read into its own module-level DataFrame and its shape is
# printed. The paths are relative ('../Data/...'), so they resolve against
# the current working directory of the running kernel.
print("Loading data files...")

# Load engagement data (target variable)
eng_data = pd.read_csv('../Data/ENG.csv')
print(f"Engagement data shape: {eng_data.shape}")

# Load EEG data
eeg_data = pd.read_csv('../Data/EEG.csv')
print(f"EEG data shape: {eeg_data.shape}")

# Load GSR data
gsr_data = pd.read_csv('../Data/GSR.csv')
print(f"GSR data shape: {gsr_data.shape}")

# Load eye-tracking data
eye_data = pd.read_csv('../Data/EYE.csv')
print(f"Eye-tracking data shape: {eye_data.shape}")

# Load IVT data
# NOTE(review): the notebook header groups IVT.csv with the eye-tracking
# signals; its exact schema is not visible here.
ivt_data = pd.read_csv('../Data/IVT.csv')
print(f"IVT data shape: {ivt_data.shape}")

print("\nData loading completed!")


In [None]:
# Explore data structure and basic statistics
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after every report.
print("=== ENGAGEMENT DATA ===")
print(eng_data.head())
print(f"\nEngagement data info:")
eng_data.info()
print(f"\nEngagement statistics:")
print(eng_data.describe())

print("\n" + "="*50)
print("=== EEG DATA ===")
print(f"EEG columns: {list(eeg_data.columns)}")
print(f"\nEEG data info:")
eeg_data.info()
# Only the first 10 columns are summarised to keep the output readable.
print(f"\nEEG statistics (first 10 columns):")
print(eeg_data.iloc[:, :10].describe())


In [None]:
# Explore GSR and Eye-tracking data
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of through print() (which would emit an extra "None").
print("=== GSR DATA ===")
print(f"GSR columns: {list(gsr_data.columns)}")
print(f"\nGSR data info:")
gsr_data.info()
print(f"\nGSR statistics:")
print(gsr_data.describe())

print("\n" + "="*50)
print("=== EYE-TRACKING DATA ===")
print(f"Eye-tracking columns: {list(eye_data.columns)}")
print(f"\nEye-tracking data info:")
eye_data.info()
# Only the first 10 columns are summarised to keep the output readable.
print(f"\nEye-tracking statistics (first 10 columns):")
print(eye_data.iloc[:, :10].describe())

print("\n" + "="*50)
print("=== IVT DATA ===")
print(f"IVT columns: {list(ivt_data.columns)}")
print(f"\nIVT data info:")
ivt_data.info()
print(f"\nIVT statistics (first 10 columns):")
print(ivt_data.iloc[:, :10].describe())


## 2. Data Synchronization and Timestamp Alignment


In [None]:
# Convert timestamps to datetime and unix time for synchronization
def prepare_timestamps(df, timestamp_col, unix_col=None):
    """Return a copy of ``df`` with parsed timestamps and a ``unix_time`` column.

    Args:
        df: Input DataFrame. It is copied; the caller's frame is not modified.
        timestamp_col: Name of the column holding timestamps. When present it
            is parsed in place (on the copy) with ``pd.to_datetime``.
        unix_col: Optional name of a column that already holds epoch time.
            When given and present, its values are copied to ``unix_time``
            unchanged.
            NOTE(review): the unit of that column (seconds vs. milliseconds)
            is not checked here — confirm it matches the whole-second values
            derived from ``timestamp_col`` for frames without such a column.

    Returns:
        The copied DataFrame with an added ``unix_time`` column. When derived
        from ``timestamp_col`` it holds whole seconds since 1970-01-01
        (floored); rows whose timestamp is missing get a missing value.

    Raises:
        KeyError: If neither ``unix_col`` nor ``timestamp_col`` is a column of
            ``df``, so no time base can be derived.
    """
    df = df.copy()

    # Convert timestamp to datetime
    has_timestamp = timestamp_col in df.columns
    if has_timestamp:
        df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    # Prefer the recorded unix time when the frame provides one
    if unix_col and unix_col in df.columns:
        df['unix_time'] = df[unix_col]
        return df

    if not has_timestamp:
        raise KeyError(
            f"Cannot derive 'unix_time': neither {unix_col!r} nor "
            f"{timestamp_col!r} is a column of the DataFrame"
        )

    # Derive seconds since the epoch by subtracting the epoch and floor-dividing
    # by one second. Unlike ``astype('int64') // 10**9`` this does not assume
    # nanosecond storage, so it stays correct for datetime64 columns kept at
    # second/milli/microsecond resolution, and it works for tz-aware columns.
    parsed = df[timestamp_col]
    epoch = pd.Timestamp(0, tz=parsed.dt.tz)
    df['unix_time'] = (parsed - epoch) // pd.Timedelta(seconds=1)

    return df

# Prepare timestamps for each dataset
# Every frame is rebound to the copy returned by prepare_timestamps, which
# carries a 'unix_time' column used by the later synchronization cells.
print("Preparing timestamps for synchronization...")

# Engagement data
# No unix column is passed here, so 'unix_time' is derived from the parsed
# 'Timestamp' column.
# NOTE(review): the other frames take 'unix_time' from a recorded 'UnixTime'
# column when it exists — confirm both time bases use the same unit and
# timezone, otherwise the common range computed later will be off.
eng_data = prepare_timestamps(eng_data, 'Timestamp')
print(f"Engagement time range: {eng_data['Timestamp'].min()} to {eng_data['Timestamp'].max()}")

# EEG data
# Note the different spelling of the timestamp column ('TimeStamp').
eeg_data = prepare_timestamps(eeg_data, 'TimeStamp', 'UnixTime')
print(f"EEG time range: {eeg_data['TimeStamp'].min()} to {eeg_data['TimeStamp'].max()}")

# GSR data
gsr_data = prepare_timestamps(gsr_data, 'Timestamp', 'UnixTime')
print(f"GSR time range: {gsr_data['Timestamp'].min()} to {gsr_data['Timestamp'].max()}")

# Eye-tracking data
eye_data = prepare_timestamps(eye_data, 'Timestamp', 'UnixTime')
print(f"Eye-tracking time range: {eye_data['Timestamp'].min()} to {eye_data['Timestamp'].max()}")

# IVT data
ivt_data = prepare_timestamps(ivt_data, 'Timestamp', 'UnixTime')
print(f"IVT time range: {ivt_data['Timestamp'].min()} to {ivt_data['Timestamp'].max()}")

print("\nTimestamp preparation completed!")


In [None]:
# Find common time range for all datasets
def find_common_time_range(datasets):
    """Find the time window covered by every non-empty dataset.

    Args:
        datasets: Iterable of DataFrames, each with a ``unix_time`` column.
            Empty frames are skipped. The iterable is consumed in a single
            pass, so one-shot iterables (generators, iterators) are fine.

    Returns:
        A ``(common_start, common_end)`` tuple: the latest first timestamp and
        the earliest last timestamp across the non-empty frames. If the frames
        do not overlap in time, ``common_start`` is greater than
        ``common_end``; callers should check for that.

    Raises:
        ValueError: If ``datasets`` contains no non-empty frame.
    """
    # Collect each frame's (first, last) timestamp in one pass over the input
    bounds = [
        (df['unix_time'].min(), df['unix_time'].max())
        for df in datasets
        if not df.empty
    ]
    if not bounds:
        raise ValueError(
            "find_common_time_range needs at least one non-empty dataset"
        )

    common_start = max(first for first, _ in bounds)
    common_end = min(last for _, last in bounds)

    return common_start, common_end

# Intersect the recording windows of all five modalities
common_start, common_end = find_common_time_range(
    [eng_data, eeg_data, gsr_data, eye_data, ivt_data]
)

# Report the shared window; fromtimestamp renders it in the machine's local time
print(f"Common time range:")
print(f"Start: {datetime.fromtimestamp(common_start)}")
print(f"End: {datetime.fromtimestamp(common_end)}")
print(f"Duration: {(common_end - common_start) / 3600:.2f} hours")

# Keep only the rows inside the shared window (both bounds inclusive)
eng_data = eng_data[eng_data['unix_time'].between(common_start, common_end)].copy()
eeg_data = eeg_data[eeg_data['unix_time'].between(common_start, common_end)].copy()
gsr_data = gsr_data[gsr_data['unix_time'].between(common_start, common_end)].copy()
eye_data = eye_data[eye_data['unix_time'].between(common_start, common_end)].copy()
ivt_data = ivt_data[ivt_data['unix_time'].between(common_start, common_end)].copy()

# Row counts remaining per modality after trimming
print(f"\nFiltered dataset sizes:")
print(f"Engagement: {eng_data.shape[0]} samples")
print(f"EEG: {eeg_data.shape[0]} samples")
print(f"GSR: {gsr_data.shape[0]} samples")
print(f"Eye-tracking: {eye_data.shape[0]} samples")
print(f"IVT: {ivt_data.shape[0]} samples")


## 3. Resampling to Consistent Frequency


In [None]:
# Resample all datasets to a consistent frequency (1 Hz = 1 sample per second)
def resample_to_frequency(df, unix_col, target_freq=1.0):
    """Resample ``df`` onto a regular time grid by interpolating in time.

    The grid starts at the earliest timestamp in ``unix_col`` and advances in
    steps of ``1 / target_freq`` without going past the latest timestamp, so
    the result has exactly one row per grid point.

    Numeric columns are linearly interpolated against the actual sample times.
    Samples that fall between grid points therefore still contribute, and
    several samples sharing one timestamp are averaged first. Grid points
    before a column's first valid sample stay NaN; grid points after its last
    valid sample repeat that last value.

    Non-numeric columns cannot be interpolated: a grid point receives the
    value of the first sample recorded at exactly that time, otherwise a
    missing value.

    Args:
        df: Input DataFrame. It is not modified.
        unix_col: Name of the column holding the sample times (same unit as
            ``1 / target_freq``, i.e. seconds when ``target_freq`` is in Hz).
        target_freq: Number of output samples per time unit. Must be positive.

    Returns:
        A new DataFrame whose first column ``unix_time`` is the float grid,
        followed by the remaining columns of ``df`` in their original order.
        When ``unix_col`` is not ``'unix_time'``, both that column and any
        existing ``'unix_time'`` column are replaced by the grid. An input
        without any usable timestamp yields an empty frame with those columns.

    Raises:
        ValueError: If ``target_freq`` is not positive.
        KeyError: If ``unix_col`` is not a column of ``df``.
    """
    if target_freq <= 0:
        raise ValueError("target_freq must be positive")

    value_cols = [col for col in df.columns if col not in (unix_col, 'unix_time')]

    # Work on a time-sorted copy keyed by float time; rows without a timestamp
    # cannot be placed on the time axis and are dropped.
    data = df[value_cols].copy()
    data.insert(0, 'unix_time', df[unix_col].astype('float64').to_numpy())
    data = data.dropna(subset=['unix_time']).sort_values('unix_time', kind='stable')
    if data.empty:
        return data.reset_index(drop=True)

    # Create a regular time index. Building it from an integer step count
    # (instead of np.arange with a float step) avoids accumulated rounding and
    # never produces a grid point beyond the last sample.
    start_time = data['unix_time'].iloc[0]
    end_time = data['unix_time'].iloc[-1]
    n_intervals = int(np.floor((end_time - start_time) * target_freq + 1e-9))
    time_index = start_time + np.arange(n_intervals + 1) / target_freq

    numeric_cols = list(data[value_cols].select_dtypes(include=[np.number]).columns)
    if numeric_cols:
        # One value per distinct timestamp (the mean of duplicates), sorted by time
        per_time_mean = data.groupby('unix_time')[numeric_cols].mean()
    # First row per timestamp, used for exact-match lookup of non-numeric columns
    exact = data.drop_duplicates(subset='unix_time', keep='first').set_index('unix_time')

    numeric_set = set(numeric_cols)
    columns = {'unix_time': time_index}
    for col in value_cols:
        if col in numeric_set:
            known = per_time_mean[col].dropna()
            if known.empty:
                columns[col] = np.full(time_index.shape, np.nan)
            else:
                # left=NaN keeps leading gaps empty; the right edge holds the
                # last valid value (np.interp's default).
                columns[col] = np.interp(
                    time_index,
                    known.index.to_numpy(dtype='float64'),
                    known.to_numpy(dtype='float64'),
                    left=np.nan,
                )
        else:
            columns[col] = exact[col].reindex(time_index).reset_index(drop=True)

    return pd.DataFrame(columns)

print("Resampling datasets to 1 Hz frequency...")

# Resample each dataset
# Every modality is resampled independently on its own 'unix_time' column with
# target_freq=1.0; the results are stored under new *_resampled names, so the
# filtered source frames stay available.
eng_resampled = resample_to_frequency(eng_data, 'unix_time', 1.0)
eeg_resampled = resample_to_frequency(eeg_data, 'unix_time', 1.0)
gsr_resampled = resample_to_frequency(gsr_data, 'unix_time', 1.0)
eye_resampled = resample_to_frequency(eye_data, 'unix_time', 1.0)
ivt_resampled = resample_to_frequency(ivt_data, 'unix_time', 1.0)

# Row counts per modality after resampling
# NOTE(review): each frame is gridded from its own first to last timestamp, so
# these counts are not guaranteed to be equal across modalities.
print(f"Resampled dataset sizes:")
print(f"Engagement: {eng_resampled.shape[0]} samples")
print(f"EEG: {eeg_resampled.shape[0]} samples")
print(f"GSR: {gsr_resampled.shape[0]} samples")
print(f"Eye-tracking: {eye_resampled.shape[0]} samples")
print(f"IVT: {ivt_resampled.shape[0]} samples")
