In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
# from tqdm.notebook import tqdm
import glob
from sklearn.preprocessing import StandardScaler

# Root folder for data files; a relative path, so it resolves against the
# current working directory of the kernel.
BASE_DIR = Path("../data")
# Sub-folder of BASE_DIR; not referenced by the cells visible in this notebook section.
OUTPUT_DIR = BASE_DIR / "processed_data"
# CSV path under BASE_DIR; not referenced by the cells visible in this notebook section.
MERGED_OUTPUT = BASE_DIR / "merged_eye_data.csv"
# Folder passed to feature_engineering(), which scans it for entries whose
# names start with a digit (one per participant).
MAIN_FOLDER = "../STData"

In [2]:
def load_eye_participant_data(participant_dir):
    """
    Load and merge every ``*_EYE.csv`` file found under ``participant_dir``.

    The directory is searched recursively. Each file is read into a DataFrame
    and tagged with a ``participant_id`` column holding the name of the folder
    that directly contains the file. The merged frame is also written to
    ``../data/raw/all_participants_eye_raw_dataset.csv``.

    Parameters
    ----------
    participant_dir : str or path-like
        Root directory to search.

    Returns
    -------
    pandas.DataFrame or None
        Row-wise concatenation of all EYE files found, or ``None`` when no
        matching file exists.
    """
    # glob returns an empty list (it does not raise) when nothing matches, so
    # the "missing files" case has to be detected explicitly; otherwise
    # pd.concat fails with "No objects to concatenate".
    # Sorting keeps the row order of the merged output reproducible.
    eye_paths = sorted(
        glob.glob(os.path.join(participant_dir, "**", "*_EYE.csv"), recursive=True)
    )
    if not eye_paths:
        print(f"Missing files in {participant_dir}")
        return None

    eye_df_list = []
    for eye_file in eye_paths:
        temp_eye_df = pd.read_csv(eye_file)
        # The immediate parent folder name serves as the participant identifier.
        temp_eye_df["participant_id"] = os.path.basename(os.path.dirname(eye_file))
        eye_df_list.append(temp_eye_df)

    merged_eye_df = pd.concat(eye_df_list, ignore_index=True)

    output_path = "../data/raw/all_participants_eye_raw_dataset.csv"
    # Create the target folder if needed so to_csv does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    merged_eye_df.to_csv(output_path, index=False)
    print("Merged Eye CSV created successfull!")

    return merged_eye_df

In [3]:
def load_ivt_participant_data(participant_dir):
    """
    Load and merge every ``*_IVT.csv`` file found under ``participant_dir``.

    The directory is searched recursively. Each file is read into a DataFrame
    and tagged with a ``participant_id`` column holding the name of the folder
    that directly contains the file. The merged frame is also written to
    ``../data/raw/all_participants_ivt_raw_dataset.csv``.

    Parameters
    ----------
    participant_dir : str or path-like
        Root directory to search.

    Returns
    -------
    pandas.DataFrame or None
        Row-wise concatenation of all IVT files found, or ``None`` when no
        matching file exists.
    """
    # glob returns an empty list (it does not raise) when nothing matches, so
    # the "missing files" case has to be detected explicitly; otherwise
    # pd.concat fails with "No objects to concatenate".
    # Sorting keeps the row order of the merged output reproducible.
    ivt_paths = sorted(
        glob.glob(os.path.join(participant_dir, "**", "*_IVT.csv"), recursive=True)
    )
    if not ivt_paths:
        print(f"Missing files in {participant_dir}")
        return None

    ivt_df_list = []
    for ivt_file in ivt_paths:
        temp_ivt_df = pd.read_csv(ivt_file)
        # The immediate parent folder name serves as the participant identifier.
        temp_ivt_df["participant_id"] = os.path.basename(os.path.dirname(ivt_file))
        ivt_df_list.append(temp_ivt_df)

    merged_ivt_df = pd.concat(ivt_df_list, ignore_index=True)

    output_path = "../data/raw/all_participants_ivt_raw_dataset.csv"
    # Create the target folder if needed so to_csv does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    merged_ivt_df.to_csv(output_path, index=False)
    print("Merged IVT CSV created successfull!")

    return merged_ivt_df


In [None]:
def _impute_missing(df):
    """Fill missing values in ``df`` column by column (modifies and returns ``df``).

    A numeric column is flagged as having outliers when any value lies outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. Columns containing nulls are then filled
    with:

    * the median, for flagged columns;
    * the mean, for other ``float64`` / ``int64`` columns;
    * the most frequent value, for everything else.

    A column whose values are all null has no mode and is left unchanged.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    outlier_cols = set()
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        if ((df[col] < lower) | (df[col] > upper)).any():
            outlier_cols.add(col)

    for col in df.columns:
        if not df[col].isnull().any():
            continue
        if col in outlier_cols:
            # Median is used where outliers would distort the mean.
            df[col] = df[col].fillna(df[col].median())
        elif df[col].dtype in ["float64", "int64"]:
            df[col] = df[col].fillna(df[col].mean())
        else:
            modes = df[col].mode()
            # mode() is empty for an all-null column (e.g. a Timestamp column
            # that failed to parse entirely); indexing it would raise.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    return df


def feature_engineering(main_folder):
    """
    Build one summary-feature record per participant folder.

    Every entry of ``main_folder`` whose name starts with a digit is treated
    as a participant. For each one, the first ``*_EYE.csv`` and ``*_IVT.csv``
    (in sorted order, searched recursively) are loaded, their ``Timestamp``
    columns are parsed, missing values are imputed, and summary statistics
    are computed. Participants lacking either file are skipped with a message.

    Parameters
    ----------
    main_folder : str or path-like
        Directory containing the participant folders.

    Returns
    -------
    list of dict
        One dict per processed participant with keys ``participant_id``,
        ``fixation_mean``, ``fixation_std``, ``n_fixations``, ``saccade_mean``,
        ``saccade_std``, ``n_saccades``, ``pupil_mean`` and ``pupil_std``.

    Raises
    ------
    KeyError
        If a loaded file lacks one of the columns read below
        (``Timestamp``, ``ET_PupilLeft``, ``ET_PupilRight``,
        ``Fixation Duration``, ``Fixation Index``, ``Saccade Amplitude``,
        ``Saccade Index``).
    """
    all_participant_features = []
    # glob order is filesystem dependent; sorting makes the output order and
    # the choice of "first" file reproducible across machines.
    for participant_folder in sorted(glob.glob(os.path.join(main_folder, '[0-9]*'))):
        participant_id = os.path.basename(participant_folder)

        eye_files = sorted(
            glob.glob(os.path.join(participant_folder, '**', '*_EYE.csv'), recursive=True)
        )
        ivt_files = sorted(
            glob.glob(os.path.join(participant_folder, '**', '*_IVT.csv'), recursive=True)
        )
        if not eye_files or not ivt_files:
            print(f"Skipping participant {participant_id} due to missing EYE or IVT files.")
            continue

        eye_df = pd.read_csv(eye_files[0])
        ivt_df = pd.read_csv(ivt_files[0])

        # Unparseable timestamps become NaT rather than raising.
        eye_df['Timestamp'] = pd.to_datetime(eye_df['Timestamp'], errors='coerce')
        ivt_df['Timestamp'] = pd.to_datetime(ivt_df['Timestamp'], errors='coerce')

        # NOTE(review): all statistics below are computed after imputation, so
        # filled-in values contribute to the std and nunique results — confirm
        # this is intended.
        eye_df = _impute_missing(eye_df)
        eye_df['PupilAvg'] = (eye_df['ET_PupilLeft'] + eye_df['ET_PupilRight']) / 2
        pupil_mean = eye_df['PupilAvg'].mean()
        pupil_std = eye_df['PupilAvg'].std()

        ivt_df = _impute_missing(ivt_df)
        fixation_mean = ivt_df['Fixation Duration'].mean()
        fixation_std = ivt_df['Fixation Duration'].std()
        n_fix = ivt_df['Fixation Index'].nunique()

        saccade_mean = ivt_df['Saccade Amplitude'].mean()
        saccade_std = ivt_df['Saccade Amplitude'].std()
        n_sacc = ivt_df['Saccade Index'].nunique()

        all_participant_features.append({
            'participant_id': participant_id,
            'fixation_mean': fixation_mean,
            'fixation_std': fixation_std,
            'n_fixations': n_fix,
            'saccade_mean': saccade_mean,
            'saccade_std': saccade_std,
            'n_saccades': n_sacc,
            'pupil_mean': pupil_mean,
            'pupil_std': pupil_std
        })

    return all_participant_features

In [None]:
# Compute one feature record per participant, tabulate the records, and save
# the resulting table as a CSV.
participant_features = feature_engineering(MAIN_FOLDER)
merged_participants_df = pd.DataFrame(data=participant_features)
merged_participants_df.to_csv(
    path_or_buf='../data/processed/merged_participants.csv',
    index=False,
)
print('Merged participants CSV created successfully!')