In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from glob import glob


In [2]:
# Folder (relative to the notebook's working directory) that the later cells
# read metadata.csv and the subject_*.csv files from.
DATASET_DIRNAME = "dataset"
data_path = Path(DATASET_DIRNAME)

In [3]:
# Load the subject-level table, then show its first rows and its shape,
# each followed by a blank line.
metadeta = pd.read_csv(data_path / "metadata.csv")
for preview in (metadeta.head(), metadeta.shape):
    print(preview, end="\n\n")


   subjectID    class
0          1  Patient
1          2  Patient
2          3  Patient
3          4  Patient
4          5  Patient

(100, 2)



In [4]:
# Fail fast if the metadata table lacks the join key or the label column
# that the later merge and class summaries rely on.
assert all(
    column in metadeta.columns for column in ("subjectID", "class")
), "Metadata file must contain 'subjectID' and 'class' columns."

In [5]:
# Count metadata rows whose subjectID repeats an earlier row
# (the first occurrence of each ID is not counted).
dupes = metadeta["subjectID"].duplicated(keep="first").sum()
print(dupes)

0


In [6]:
# Collect every per-subject CSV path under data_path.
# Paths are strings and sorted lexicographically (so "subject_10" sorts before "subject_2").
subject_pattern = str(data_path / "subject_*.csv")
subjectpath = glob(subject_pattern)
subjectpath.sort()
print(len(subjectpath))

100


In [7]:
# Columns every per-subject frame must expose, in output order (after subjectID).
required_cols = ["trialID", "time", "stimulus", "pupil_diameter"]
frames = []

for p in subjectpath:
    df = pd.read_csv(p)

    # The subject ID is the integer after the underscore in the file name,
    # e.g. "subject_97.csv" -> 97.
    stem_parts = Path(p).stem.split("_")
    sid = int(stem_parts[1])
    df["subjectID"] = sid

    # Backfill any required column the file lacks with NaN so that every
    # frame shares one schema; the missing values are left for later handling.
    absent_cols = [name for name in required_cols if name not in df.columns]
    for col in absent_cols:
        df[col] = np.nan

    # Coerce the numeric columns; unparseable entries become NaN.
    # 'stimulus' is deliberately left untouched (may be 'on'/'off' or 0/1).
    for col in ("trialID", "time", "pupil_diameter"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    # Nullable integer dtype keeps trial IDs integral even when some are missing.
    df["trialID"] = df["trialID"].astype("Int64")

    ordered_cols = ["subjectID"] + required_cols
    frames.append(df[ordered_cols])


In [8]:
# Stack all per-subject frames row-wise into one table with a fresh 0..n-1 index.
raw_merged = pd.concat(frames, axis=0, ignore_index=True)
merged_shape = raw_merged.shape
print("Merged shape:", merged_shape)
# Last expression: rendered as the cell's output.
raw_merged.head()


Merged shape: (900302, 5)


Unnamed: 0,subjectID,trialID,time,stimulus,pupil_diameter
0,1,1,0.010234,no_stimulus,1.883395
1,1,1,0.019125,no_stimulus,1.749996
2,1,1,0.028648,no_stimulus,1.7367
3,1,1,0.039063,no_stimulus,1.718723
4,1,1,0.049582,no_stimulus,1.702478


In [9]:
# Attach each subject's class label to every sample row.
# validate="many_to_one" makes pandas raise MergeError if a subjectID occurs
# more than once in the metadata; without it, a repeated ID would silently
# duplicate every sample row of that subject in this left join.
merged_labeled = raw_merged.merge(
    metadeta, on="subjectID", how="left", validate="many_to_one"
)

# Quick checks: after a left join, rows whose subjectID has no match in the
# metadata (or whose metadata label is itself null) carry a null `class`.
missing_labels = merged_labeled["class"].isna().sum()
print("Rows with missing class:", missing_labels)
if missing_labels > 0:
    print("subjectIDs with missing labels:",
          merged_labeled.loc[merged_labeled["class"].isna(), "subjectID"].unique())


Rows with missing class: 0


In [10]:
# Number of sample rows contributed by each subject
rows_per_subject = (
    merged_labeled
    .groupby("subjectID")
    .size()
    .reset_index(name="rows")
)
print(rows_per_subject.head())

# Row-level class balance (null labels, if any, get their own bucket)
class_counts = merged_labeled["class"].value_counts(dropna=False)
print("Class distribution:\n", class_counts)

# Fraction of null cells per column, largest first
null_summary = (
    merged_labeled
    .isna()
    .mean()
    .sort_values(ascending=False)
)
print("Null fraction per column:\n", null_summary)


   subjectID  rows
0          1  8136
1          2  8901
2          3  9429
3          4  8637
4          5  9631
Class distribution:
 class
Healthy    610854
Patient    289448
Name: count, dtype: int64
Null fraction per column:
 subjectID         0.0
trialID           0.0
time              0.0
stimulus          0.0
pupil_diameter    0.0
class             0.0
dtype: float64


In [11]:
# Write the labelled table to CSV in the current working directory.
out_csv = "pupil_raw_merged.csv"
merged_labeled.to_csv(out_csv, index=False)
print("Saved CSV ->", out_csv)

# Optional Parquet copy (faster I/O, better types). This is best-effort:
# any failure is reported and skipped rather than aborting the notebook.
out_parquet = "pupil_raw_merged.parquet"
try:
    merged_labeled.to_parquet(out_parquet, index=False)
except Exception as err:
    print("Parquet save skipped (install pyarrow or fastparquet to enable):", err)
else:
    print("Saved Parquet ->", out_parquet)


Saved CSV -> pupil_raw_merged.csv
Saved Parquet -> pupil_raw_merged.parquet
