## Step 0: Experiment Directory Structure and Naming Convention

**Overview**
Each experiment variant is organized in its own subdirectory within `sequential_classification/`.
This ensures that all inputs, models, and logs remain self-contained and reproducible.

Two main experiment pipelines are used:

- `experiment_manual_selection/`: Uses manually selected gaze and linguistic features informed by prior literature.
- `experiment_feature_analysis/`: Uses an extended feature set derived for exploratory and ablation studies.

**Directory Layout**
````text
sequential_classification/
├── 05_BEyeLSTM_variation.ipynb # This notebook implementing the hybrid BiLSTM model
├── experiment_manual_selection/
│   ├── inputs/     # Preprocessed feature arrays and label files
│   ├── outputs/    # Trained models, metrics, and result tables
│   └── logs/       # Training logs and configuration metadata
└── experiment_feature_analysis/
    ├── inputs/
    ├── outputs/
    └── logs/
````

**Contents of Each `inputs/` Folder**
- `X_fix.npy` – Fixation-level feature tensor (duration, saccades, regressions).
- `X_pos.npy` – Encoded part-of-speech sequences.
- `X_con.npy` – Encoded content-word indices.
- `y_labels.npy` – Binary class labels (1 = expert, 0 = non-expert).
- `participant_ids.npy` – Participant identifiers matching input sequences.
- `screen_ids.npy` – Screen or trial identifiers for grouped cross-validation.

**Usage**
To select an experiment configuration, specify the folder name in the configuration cell:

In [1]:
# --- Step 0: Imports, Experiment Configuration and Global Paths ---

# --- Standard Library ---
import ast
import glob
import json
import os
import sys
from collections import Counter
from typing import Dict, List, Optional, Tuple

# --- Third-Party Libraries ---
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import (
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import GroupKFold
from tensorflow.keras import Input, Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    LSTM,
    concatenate,
)
from tensorflow.keras.preprocessing.sequence import pad_sequences

# -------------------------------------------------------------------------
# Select active experiment variant
# -------------------------------------------------------------------------
EXPERIMENT = "experiment_manual_selection"  # "experiment_feature_analysis" or "experiment_manual_selection"

# -------------------------------------------------------------------------
# Base directory resolution
# -------------------------------------------------------------------------
# If run from inside sequential_classification/, go one level up for project root
CWD = os.getcwd()
if os.path.basename(CWD) == "sequential_classification":
    PROJECT_ROOT = os.path.dirname(CWD)
else:
    PROJECT_ROOT = CWD

# Experiment directory lives inside sequential_classification/
EXPERIMENT_DIR = os.path.join(PROJECT_ROOT, "sequential_classification", EXPERIMENT)

# -------------------------------------------------------------------------
# Define core folders
# -------------------------------------------------------------------------
PATHS = {
    "project_root": PROJECT_ROOT,
    "experiment": EXPERIMENT_DIR,
    "inputs": os.path.join(EXPERIMENT_DIR, "inputs"),
    "outputs": os.path.join(EXPERIMENT_DIR, "outputs"),
    "logs": os.path.join(EXPERIMENT_DIR, "logs"),
}

# Ensure required folders exist
for key in ["inputs", "outputs", "logs"]:
    os.makedirs(PATHS[key], exist_ok=True)

# -------------------------------------------------------------------------
# Define key file paths used across the pipeline
# -------------------------------------------------------------------------
PATHS.update({
    # Raw merged data (project_root/data/raw/)
    "data": os.path.join(PROJECT_ROOT, "data", "raw", "et_data_merged_with_ann_materials_dummy.csv"),

    # Label-related files
    "labels": os.path.join(PATHS["inputs"], "labels.csv"),
    "participant_ids": os.path.join(PATHS["inputs"], "participant_ids.npy"),
    "screen_ids": os.path.join(PATHS["inputs"], "screen_ids.npy"),
    "y_labels": os.path.join(PATHS["inputs"], "y_labels.npy"),

    # Feature arrays (to be generated later)
    "X_fix": os.path.join(PATHS["inputs"], "X_fix.npy"),
    "X_pos": os.path.join(PATHS["inputs"], "X_pos.npy"),
    "X_con": os.path.join(PATHS["inputs"], "X_con.npy"),

    # Model artifacts
    "results_csv": os.path.join(PATHS["outputs"], "results.csv"),
    "final_model": os.path.join(PATHS["outputs"], "final_model.keras"),
})

# -------------------------------------------------------------------------
# Summary
# -------------------------------------------------------------------------
print(f"Active experiment: {EXPERIMENT}")
for name, path in PATHS.items():
    print(f"{name:>18}: {path}")

2025-10-25 16:21:51.707777: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-25 16:21:51.709107: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-25 16:21:51.736454: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-25 16:21:51.737086: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Active experiment: experiment_manual_selection
      project_root: /mnt/c/Users/Consti/PycharmProjects/BachelorCode
        experiment: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection
            inputs: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/inputs
           outputs: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/outputs
              logs: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/logs
              data: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/data/raw/et_data_merged_with_ann_materials_dummy.csv
            labels: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/inputs/labels.csv
   participant_ids: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_man

## Step 0.5: Enrich Dataset with Linguistic and Structural Columns

**Overview**

This step enriches the merged eye-tracking dataset with the linguistic and categorical
columns required for the BEyeLSTM manual-feature pipeline.
It aligns the raw fixation data with linguistic annotations and prepares the feature space
used in subsequent preprocessing and modeling steps.

**Process**
1. Adds simplified part-of-speech (PoS) tags and a PoS list per AOI.
2. Identifies content-word and medical-term occurrences.
3. Assigns AOI composition categories (`Content-Only`, `Function-Only`, `Mixed`).
4. Computes prioritized lemma frequencies (`N` > `V` > `A`).
5. Ensures all columns required for subsequent feature engineering and model preparation exist.

**Input**
- Merged eye-tracking and annotation dataset (`PATHS["data"]`).

**Output**
- Enriched dataset with linguistic and structural columns, stored under the current experiment’s `inputs/` directory.


In [51]:
# --- Step 0.5: Enrich dummy dataset with unified linguistic columns ---

dummy_path = PATHS["data"]

if not os.path.exists(dummy_path):
    raise FileNotFoundError(f"Dummy dataset not found at {dummy_path}")

print(f"Loading dummy dataset from {dummy_path}...")
df = pd.read_csv(dummy_path, sep="\t", low_memory=False)
print(f"Initial shape: {df.shape}")

# -------------------------------------------------------------------------
# 1. POS mappings
# -------------------------------------------------------------------------
simplified_pos_mapping = {
    'PROPN': 'N', 'NOUN': 'N', 'VERB': 'V', 'ADV': 'A', 'ADJ': 'A',
    'PUNCT': 'FUNC', 'PRON': 'FUNC', 'SCONJ': 'FUNC', 'NUM': 'FUNC',
    'DET': 'FUNC', 'CCONJ': 'FUNC', 'ADP': 'FUNC', 'AUX': 'FUNC',
    'INTJ': 'FUNC', 'X': 'FUNC', 'PART': 'FUNC'
}
content_word_mapping = {
    'PROPN': True, 'NOUN': True, 'VERB': True, 'ADV': True, 'ADJ': True,
    'PUNCT': False, 'PRON': False, 'SCONJ': False, 'NUM': False,
    'DET': False, 'CCONJ': False, 'ADP': False, 'AUX': False,
    'INTJ': False, 'X': False, 'PART': False
}

# -------------------------------------------------------------------------
# 2. Derived columns: simplified_pos, is_content_word, simplified_pos_list
# -------------------------------------------------------------------------
def aggregate_pos_features(raw_pos_string: str):
    if not isinstance(raw_pos_string, str):
        return pd.Series({'simplified_pos': 'UNK', 'is_content_word': False})
    tags = [t.strip() for t in raw_pos_string.replace('+', '|').split('|') if t.strip()]
    if not tags:
        return pd.Series({'simplified_pos': 'UNK', 'is_content_word': False})
    simplified_tags = [simplified_pos_mapping.get(t, 'UNK') for t in tags]
    is_content = any(content_word_mapping.get(t, False) for t in tags)
    tags_to_consider = [t for t in simplified_tags if t != 'FUNC'] or simplified_tags
    counts = Counter(tags_to_consider)
    final_tag = counts.most_common(1)[0][0] if counts else 'UNK'
    return pd.Series({'simplified_pos': final_tag, 'is_content_word': is_content})

def create_simplified_pos_list(pos_string: str):
    if not isinstance(pos_string, str):
        return []
    original_tags = [tag.strip() for tag in pos_string.replace('+', '|').split('|') if tag.strip()]
    if not original_tags:
        return []
    simplified_tags = [simplified_pos_mapping.get(tag, 'UNK') for tag in original_tags]
    return simplified_tags

if 'pos_merged' in df.columns:
    print("Deriving simplified POS features and lists...")
    enriched = df['pos_merged'].apply(aggregate_pos_features)
    df = pd.concat([df, enriched], axis=1)
    df['simplified_pos_list'] = df['pos_merged'].apply(create_simplified_pos_list)
else:
    df['simplified_pos'] = 'UNK'
    df['is_content_word'] = False
    df['simplified_pos_list'] = [[] for _ in range(len(df))]

# -------------------------------------------------------------------------
# 3. AOI composition and medical term flag
# -------------------------------------------------------------------------
def get_aoi_composition(raw_pos_string: str) -> str:
    if not isinstance(raw_pos_string, str):
        return 'Unknown'
    tags = [t.strip() for t in raw_pos_string.replace('+', '|').split('|') if t.strip()]
    if not tags:
        return 'Unknown'
    flags = [content_word_mapping.get(t, False) for t in tags]
    if all(flags):
        return 'Content-Only'
    elif not any(flags):
        return 'Function-Only'
    return 'Mixed'

df['aoi_composition'] = df.get('pos_merged', '').apply(get_aoi_composition)
if 'tag.type' in df.columns:
    df['is_medical_term'] = df['tag.type'].isin([1, 2, 3, 4, 5, 7])
else:
    df['is_medical_term'] = False

# -------------------------------------------------------------------------
# 4. Prioritized lemma frequency
# -------------------------------------------------------------------------
POS_TAGS_COLUMN = 'simplified_pos_list'
LEMMA_FREQUENCIES_COLUMN = 'lemma_frequency_text'
NOUN_TAGS, VERB_TAGS, ADJECTIVE_TAGS = ['N'], ['V'], ['A']

def safe_literal_eval(val):
    try:
        if not isinstance(val, str):
            return []
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return []

def get_prioritized_frequency(row):
    try:
        pos_tags = row[POS_TAGS_COLUMN]
        freq_strings = row[LEMMA_FREQUENCIES_COLUMN]
        if not isinstance(pos_tags, list) or not isinstance(freq_strings, list) or len(pos_tags) != len(freq_strings):
            return 0
        tagged_freqs = []
        for tag, freq_str in zip(pos_tags, freq_strings):
            try:
                tagged_freqs.append((tag, float(freq_str)))
            except (ValueError, TypeError):
                continue
        noun_freqs = [f for t, f in tagged_freqs if t in NOUN_TAGS]
        verb_freqs = [f for t, f in tagged_freqs if t in VERB_TAGS]
        adj_freqs  = [f for t, f in tagged_freqs if t in ADJECTIVE_TAGS]
        if noun_freqs: return min(noun_freqs)
        if verb_freqs: return min(verb_freqs)
        if adj_freqs:  return min(adj_freqs)
        return 0
    except (TypeError, AttributeError):
        return 0

if LEMMA_FREQUENCIES_COLUMN in df.columns:
    print("Computing prioritized lemma frequencies...")
    df[LEMMA_FREQUENCIES_COLUMN] = df[LEMMA_FREQUENCIES_COLUMN].apply(safe_literal_eval)
    df['simplified_pos_list'] = df['simplified_pos_list'].apply(lambda pos_list: [tag for tag in pos_list if tag != 'UNK'])
    df['prioritized_lemma_frequency'] = df.apply(get_prioritized_frequency, axis=1)
else:
    print("No lemma frequency column found; initializing prioritized_lemma_frequency = 0.0")
    df['prioritized_lemma_frequency'] = 0.0

# -------------------------------------------------------------------------
# 5. Save the enriched dummy dataset
# -------------------------------------------------------------------------
expected_cols = [
    'simplified_pos', 'simplified_pos_list', 'is_content_word',
    'is_medical_term', 'aoi_composition', 'prioritized_lemma_frequency'
]
for col in expected_cols:
    if col not in df.columns:
        df[col] = 0 if 'is_' not in col else False

df.to_csv(dummy_path, sep="\t", index=False)
print(f"Enriched dummy dataset saved to: {dummy_path}")
print("\nPreview of added/verified columns:")
print(df[expected_cols].head())

Loading dummy dataset from /mnt/c/Users/Consti/PycharmProjects/BachelorCode/data/raw/et_data_merged_with_ann_materials_dummy.csv...
Initial shape: (30, 110)
Deriving simplified POS features and lists...
Computing prioritized lemma frequencies...
Enriched dummy dataset saved to: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/data/raw/et_data_merged_with_ann_materials_dummy.csv

Preview of added/verified columns:
  simplified_pos simplified_pos simplified_pos_list  is_content_word  \
0              N              N                 [N]             True   
1              N              N           [N, FUNC]             True   
2              N              N                 [N]             True   
3           FUNC           FUNC              [FUNC]            False   
4              N              N                 [N]             True   

   is_content_word  is_medical_term aoi_composition  \
0             True             True    Content-Only   
1             True            False     

## Step 1: Create Participant-Level Labels (`labels.csv`)

**Overview**

This step extracts and binarizes participant expertise information from the merged
eye-tracking and annotation dataset.
It produces a compact label table containing one entry per participant, which is
used to align participant-level expertise labels with session- or screen-level
data in subsequent stages of the pipeline.

**Input**
- Enriched eye-tracking and annotation dataset (`PATHS["data"]`).

**Output**
- Participant-level label file (`labels.csv`), stored in the current experiment’s `inputs/` directory.


In [52]:
# --- Step 1: Create participant-level label file ---

print("Loading merged participant annotation data...")
main_df = pd.read_csv(PATHS["data"], sep="\t", low_memory=False)

# Select and binarize participant-level labels
labels_df = main_df[["Participant_unique", "is.expert"]].copy()
labels_df["is.expert"] = labels_df["is.expert"].map({"expert": 1, "non-expert": 0})

# Standardize column names
labels_df.rename(columns={
    "Participant_unique": "participant_id",
    "is.expert": "class_label"
}, inplace=True)

# Remove duplicates and reset index
labels_df.drop_duplicates(inplace=True)
labels_df.reset_index(drop=True, inplace=True)

# Save participant-level labels
labels_df.to_csv(PATHS["labels"], sep="\t", index=False)

print(f"Saved participant labels to: {PATHS['labels']}")
print(f"Total unique participants: {len(labels_df)}")
print(labels_df.head())

Loading merged participant annotation data...
Saved participant labels to: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/inputs/labels.csv
Total unique participants: 3
      participant_id  class_label
0  Participant10-1_A            0
1  Participant11-1_B            1
2  Participant12-2_A            0


## Step 2: Prepare Sequential Fixation Features (`X_fix.npy`, `participant_ids.npy`, `screen_ids.npy`)

**Overview**

This step transforms fixation-level behavioral data into fixed-length sequences suitable
for the Hybrid BiLSTM architecture.
It groups fixations by participant and screen, extracts relevant gaze and linguistic
metrics, and standardizes sequence length through padding to ensure consistent input
dimensions across all samples.

**Input**
- Merged fixation-level dataset containing per-word gaze features (`PATHS["data"]`).
- Experiment configuration from Step 0 (defines output directories and file naming).

**Process**
1. Selects and prepares the relevant fixation and gaze-related feature columns.
2. Encodes categorical and boolean features as numerical arrays.
3. Groups data by participant × screen and pads all sequences to a fixed length.
4. Saves the resulting NumPy arrays for model input.

**Output**
- `PATHS["inputs"]/X_fix.npy`
- `PATHS["inputs"]/participant_ids.npy`
- `PATHS["inputs"]/screen_ids.npy`


In [53]:
# --- Step 2: Prepare sequential fixation data arrays ---

# -------------------------------------------------------------------------
# Utility: Sequence preparation
# -------------------------------------------------------------------------
def prepare_sequential_data(
    df: pd.DataFrame,
    feature_cols: List[str],
    participant_col: str,
    screen_col: str,
    sequence_length: int = 128,
    sort_col: Optional[str] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:

    if sort_col:
        print(f"Sorting by '{sort_col}'...")
        df = df.sort_values(by=sort_col).reset_index(drop=True)

    print("Grouping by participant × screen...")
    X_list, p_list, s_list = [], [], []

    for (pid, sid), group in tqdm(df.groupby([participant_col, screen_col]), desc="Screens"):
        arr = group[feature_cols].to_numpy(dtype=np.float32)
        X_list.append(arr)
        p_list.append(pid)
        s_list.append(sid)

    print(f"Padding {len(X_list)} sequences to length {sequence_length}...")
    X_padded = pad_sequences(
        X_list, maxlen=sequence_length, padding="post", truncating="post", dtype="float32"
    )

    return X_padded, np.array(p_list), np.array(s_list)

# -------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------

RAW_FILE = PATHS["data"]
PARTICIPANT_COLUMN = "Participant_unique"
SCREEN_COLUMN = "screenid"
SORT_COLUMN = "index"
SEQUENCE_LENGTH = 128

# Define feature sets
FEATURE_COLUMNS_MANUAL = [
    "Total_duration_of_fixations", "First-pass_duration", "First-pass_regression",
    "text_version", "text.type", "is_medical_term", "simplified_pos",
    "is_content_word", "prioritized_lemma_frequency"
]

FEATURE_COLUMNS_FEAT_ANALYSIS = [
    "Average_pupil_diameter", "word_frequency_screen", "Peak_velocity_of_entry_saccade",
    "Peak_velocity_of_exit_saccade", "Total_duration_of_Visit", "Time_to_exit_saccade",
    "word_count_text", "AOI_length", "Duration_of_first_Visit", "Regression-path_duration",
    "tag.id", "First-pass_first_fixation_duration", "Maximum_duration_of_fixations",
    "Duration_of_first_whole_fixation"
]

BOOLEAN_COLS = ["is_medical_term", "is_content_word"]
CATEGORICAL_COLS = ["text_version", "text.type", "simplified_pos"]

# Determine mode automatically from global EXPERIMENT variable
if "manual" in EXPERIMENT.lower():
    feature_mode = "manual"
    FEATURE_COLUMNS = FEATURE_COLUMNS_MANUAL
elif "feature" in EXPERIMENT.lower():
    feature_mode = "feature_analysis"
    FEATURE_COLUMNS = FEATURE_COLUMNS_FEAT_ANALYSIS
else:
    sys.exit(f"Error: Could not infer feature mode from EXPERIMENT='{EXPERIMENT}'.")

print(f"\n--- Preparing sequential data in '{feature_mode}' mode ---")

# -------------------------------------------------------------------------
# Load and validate data
# -------------------------------------------------------------------------
print(f"Loading raw data from {RAW_FILE} ...")
try:
    df = pd.read_csv(RAW_FILE, sep="\t", low_memory=False)
except FileNotFoundError:
    sys.exit(f"Error: File not found at {RAW_FILE}")

missing = [c for c in [PARTICIPANT_COLUMN, SCREEN_COLUMN] + FEATURE_COLUMNS if c not in df.columns]
if missing:
    sys.exit(f"Error: Missing required columns: {missing}")
print("All required columns present.")

# Create an artificial index column if not present
if 'index' not in df.columns:
    df['index'] = range(len(df))
# This step assumes the raw data is already sorted by participant, screen, and word order and should be removed / altered accordingly.

# -------------------------------------------------------------------------
# Encoding and feature engineering
# -------------------------------------------------------------------------
if feature_mode == "manual":
    print("Encoding boolean and categorical features...")
    df[BOOLEAN_COLS] = df[BOOLEAN_COLS].astype(int)
    before = set(df.columns)
    df = pd.get_dummies(df, columns=CATEGORICAL_COLS, dtype=int)
    new_cols = list(set(df.columns) - before)

    numerical_cols = [c for c in FEATURE_COLUMNS_MANUAL if c not in BOOLEAN_COLS + CATEGORICAL_COLS]
    final_features = numerical_cols + BOOLEAN_COLS + new_cols
else:
    final_features = FEATURE_COLUMNS_FEAT_ANALYSIS

print(f"Total features used: {len(final_features)}")
df[final_features] = df[final_features].fillna(0)

# -------------------------------------------------------------------------
# Run sequence preparation
# -------------------------------------------------------------------------
print("\nGenerating padded fixation sequences...")
X_fix, participant_ids, screen_ids = prepare_sequential_data(
    df=df,
    feature_cols=final_features,
    participant_col=PARTICIPANT_COLUMN,
    screen_col=SCREEN_COLUMN,
    sequence_length=SEQUENCE_LENGTH,
    sort_col=SORT_COLUMN
)

print("\n--- Verification ---")
print(f"X_fix shape: {X_fix.shape}")
print(f"Participants: {len(np.unique(participant_ids))}, Screens: {len(screen_ids)}")

# -------------------------------------------------------------------------
# Save arrays
# -------------------------------------------------------------------------
np.save(PATHS["X_fix"], X_fix)
np.save(PATHS["participant_ids"], participant_ids)
np.save(PATHS["screen_ids"], screen_ids)
print(f"\nSaved prepared arrays to: {PATHS['inputs']}")



--- Preparing sequential data in 'manual' mode ---
Loading raw data from /mnt/c/Users/Consti/PycharmProjects/BachelorCode/data/raw/et_data_merged_with_ann_materials_dummy.csv ...
All required columns present.
Encoding boolean and categorical features...
Total features used: 13

Generating padded fixation sequences...
Sorting by 'index'...
Grouping by participant × screen...


Screens: 100%|██████████| 6/6 [00:00<00:00, 2223.13it/s]

Padding 6 sequences to length 128...

--- Verification ---
X_fix shape: (6, 128, 13)
Participants: 3, Screens: 6

Saved prepared arrays to: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/inputs





## Step 3: Create Session-Aligned Ground Truth Labels (`y_labels.npy`)

**Overview**

This step aligns participant-level expertise annotations with the session-level fixation
feature arrays.
It merges the participant expertise information from `labels.csv` with the unique
`(participant_id, screen_id)` pairs extracted from the prepared fixation sequences,
ensuring exact alignment between behavioral features and target labels.

**Input**
- Participant-level label file (`PATHS["inputs"]/labels.csv`)
- Participant identifiers (`PATHS["inputs"]/participant_ids.npy`)
- Screen identifiers (`PATHS["inputs"]/screen_ids.npy`)

**Process**
1. Load participant-level expertise labels.
2. Load the participant and screen identifiers corresponding to each fixation sequence.
3. Create one entry per unique session (`participant_id × screen_id`).
4. Merge the data sources to produce a label array aligned with the model inputs.

**Output**
- Session-aligned label array (`PATHS["inputs"]/y_labels.npy`)

In [54]:
# --- Step 3: Create session-aligned y_labels.npy ---

# -------------------------------------------------------------------------
# 1. Load participant-level labels
# -------------------------------------------------------------------------
if not os.path.exists(PATHS["labels"]):
    raise FileNotFoundError(f"labels.csv not found at {PATHS['labels']}. Run Step 1 first.")

labels_df = pd.read_csv(PATHS["labels"], sep="\t", low_memory=False)
print(f"Loaded labels.csv with {len(labels_df)} participants.")

# -------------------------------------------------------------------------
# 2. Load participant and screen IDs
# -------------------------------------------------------------------------
if not (os.path.exists(PATHS["participant_ids"]) and os.path.exists(PATHS["screen_ids"])):
    raise FileNotFoundError(
        "participant_ids.npy or screen_ids.npy not found in inputs/. "
        "Run Step 2 (sequence preparation) before this step."
    )

participant_ids = np.load(PATHS["participant_ids"])
screen_ids = np.load(PATHS["screen_ids"])

print(f"Loaded {len(participant_ids)} participant IDs and {len(screen_ids)} screen IDs.")

# -------------------------------------------------------------------------
# 3. Build session DataFrame and merge with labels
# -------------------------------------------------------------------------
events_df = pd.DataFrame({
    "participant_id": participant_ids,
    "screen_id": screen_ids
})

# Drop duplicate sessions (participant × screen)
sessions_df = (
    events_df
    .drop_duplicates(subset=["participant_id", "screen_id"])
    .sort_values(by=["participant_id", "screen_id"])
    .reset_index(drop=True)
)

print(f"Identified {len(sessions_df)} unique participant-screen sessions.")

# Merge session list with expertise labels
merged_df = pd.merge(
    sessions_df,
    labels_df,
    on="participant_id",
    how="left"
)

# -------------------------------------------------------------------------
# 4. Extract and verify y_labels array
# -------------------------------------------------------------------------
if merged_df["class_label"].isnull().any():
    missing = merged_df["class_label"].isnull().sum()
    print(f"Warning: {missing} sessions missing labels (filled with 0).")
    merged_df["class_label"].fillna(0, inplace=True)

y_labels = merged_df["class_label"].astype(int).values

print("\n--- Verification ---")
print(f"y_labels shape: {y_labels.shape}")
unique, counts = np.unique(y_labels, return_counts=True)
print(f"Class distribution: {dict(zip(unique, counts))}")

# -------------------------------------------------------------------------
# 5. Save aligned labels
# -------------------------------------------------------------------------
np.save(PATHS["y_labels"], y_labels)
print(f"\nSaved session-aligned labels to: {PATHS['y_labels']}")

Loaded labels.csv with 3 participants.
Loaded 6 participant IDs and 6 screen IDs.
Identified 6 unique participant-screen sessions.

--- Verification ---
y_labels shape: (6,)
Class distribution: {0: 4, 1: 2}

Saved session-aligned labels to: /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_manual_selection/inputs/y_labels.npy


### Validation: Label Alignment Check

**Purpose**
Ensures that:
1. The number of labels matches the number of sessions in `X_fix.npy`.
2. The order of participants and screens is consistent across arrays.
3. No missing or unexpected label values are present.

In [55]:
# ==============================================================================
# 1. LOAD ARRAYS
# ==============================================================================
X_fix_path = PATHS["X_fix"]
y_labels_path = PATHS["y_labels"]
p_ids_path = PATHS["participant_ids"]
s_ids_path = PATHS["screen_ids"]

X_fix = np.load(X_fix_path)
y_labels = np.load(y_labels_path)
participant_ids = np.load(p_ids_path)
screen_ids = np.load(s_ids_path)

print(f"Loaded X_fix: {X_fix.shape}")
print(f"Loaded y_labels: {y_labels.shape}")
print(f"Loaded participant_ids: {participant_ids.shape}")
print(f"Loaded screen_ids: {screen_ids.shape}")

# ==============================================================================
# 2. BASIC CONSISTENCY CHECKS
# ==============================================================================
n_screens_fix = X_fix.shape[0]
n_labels = y_labels.shape[0]
n_participants = participant_ids.shape[0]

assert n_screens_fix == n_labels == n_participants, (
    f"Mismatch detected:\n"
    f"X_fix = {n_screens_fix}, y_labels = {n_labels}, participant_ids = {n_participants}"
)
print("Array lengths are perfectly aligned.")

# ==============================================================================
# 3. CLASS BALANCE & LABEL VALIDATION
# ==============================================================================
unique, counts = np.unique(y_labels, return_counts=True)
print("\nLabel distribution:")
for u, c in zip(unique, counts):
    print(f"  Class {u}: {c} ({(c / len(y_labels)) * 100:.1f}%)")

if np.isnan(y_labels).any():
    print("Warning: NaN values detected in labels.")
else:
    print("No missing label values found.")

# ==============================================================================
# 4. PARTICIPANT–SCREEN ORDER CHECK
# ==============================================================================
unique_pairs = len({tuple(x) for x in zip(participant_ids, screen_ids)})
if unique_pairs != len(y_labels):
    print(f"Warning: Found {len(y_labels) - unique_pairs} duplicate session pairs.")
else:
    print("Each (participant_id, screen_id) pair is unique and aligned.")

Loaded X_fix: (6, 128, 13)
Loaded y_labels: (6,)
Loaded participant_ids: (6,)
Loaded screen_ids: (6,)
Array lengths are perfectly aligned.

Label distribution:
  Class 0: 4 (66.7%)
  Class 1: 2 (33.3%)
No missing label values found.
Each (participant_id, screen_id) pair is unique and aligned.


## Step 4: Prepare Integer-Encoded Linguistic Input Sequences

**Purpose**

This step prepares the categorical linguistic input features used by the Hybrid BiLSTM model:
- `X_pos.npy` → Integer-encoded simplified Part-of-Speech (PoS) sequences
- `X_con.npy` → Integer-encoded content-word indicator sequences

These complement the fixation-level features (`X_fix.npy`) and label arrays (`y_labels.npy`) created earlier.

**Outputs**
| File | Description | Shape |
|------|--------------|--------|
| `X_pos.npy` | Integer-encoded PoS tag sequences | `(n_screens, 128)` |
| `X_con.npy` | Integer-encoded content-word sequences | `(n_screens, 128)` |
| `pos_vocab.json` | PoS vocabulary mapping | `{tag: index}` |
| `con_vocab.json` | Content-word vocabulary mapping | `{category: index}` |

All output files are stored in the current experiment’s `inputs/` directory
(e.g., `experiment_manual_selection/inputs/`).

In [56]:
# --------------------------------------------------------------------------
# 1. Configuration
# --------------------------------------------------------------------------
DATA_DIR = PATHS["inputs"]  # your experiment’s input directory
os.makedirs(DATA_DIR, exist_ok=True)

# Source file (either real merged ET data or dummy file)
RAW_FILE = PATHS["data"]  # Use dummy dataset for reproducibility

# Key columns
PARTICIPANT_COLUMN = "Participant_unique"
SCREEN_COLUMN = "screenid"
SORT_COLUMN = "index"  # ensures within-screen fixation order
SIMPL_POS_COLUMN = "simplified_pos"
CONTENT_COLUMN = "is_content_word"
SEQUENCE_LENGTH = 128


# --------------------------------------------------------------------------
# 2. Helper Function
# --------------------------------------------------------------------------
def prepare_categorical_sequence(df, category_col, participant_col, screen_col,
                                 sequence_length=128, sort_col=None):
    """
    Converts a categorical text column to integer sequences grouped by
    participant × screen, padded to uniform length.
    """
    if sort_col and sort_col in df.columns:
        df = df.sort_values(by=sort_col).reset_index(drop=True)

    print(f"Encoding categorical column: {category_col}")
    integer_codes, unique_categories = pd.factorize(df[category_col].astype(str))
    df[f"{category_col}_id"] = integer_codes
    vocab = {cat: idx for idx, cat in enumerate(unique_categories)}
    print(f" → {len(vocab)} unique categories")

    # Group by participant × screen
    grouped = df.groupby([participant_col, screen_col])[f"{category_col}_id"].apply(list)

    # Pad each screen’s sequence
    X_padded = pad_sequences(
        grouped.tolist(),
        maxlen=sequence_length,
        padding="post",
        truncating="post"
    )
    return X_padded, vocab


# --------------------------------------------------------------------------
# 3. Load DataFrame and Apply Encoding
# --------------------------------------------------------------------------
print("Loading dataset...")
main_df = pd.read_csv(RAW_FILE, sep="\t", low_memory=False)
if "index" not in df.columns:
    df["index"] = range(len(df))

# --- Define required linguistic columns ---
REQUIRED_COLS = ["pos_merged", "aoi_composition"]
missing_cols = [c for c in REQUIRED_COLS if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing required columns: {missing_cols}")

print(f"All required columns present: {REQUIRED_COLS}")

print("\nPreparing integer-encoded POS sequences...")
X_pos, pos_vocab = prepare_categorical_sequence(
    main_df, SIMPL_POS_COLUMN,
    participant_col=PARTICIPANT_COLUMN,
    screen_col=SCREEN_COLUMN,
    sequence_length=SEQUENCE_LENGTH,
    sort_col=SORT_COLUMN
)

print("\nPreparing integer-encoded Content-Word sequences...")
X_con, con_vocab = prepare_categorical_sequence(
    main_df, CONTENT_COLUMN,
    participant_col=PARTICIPANT_COLUMN,
    screen_col=SCREEN_COLUMN,
    sequence_length=SEQUENCE_LENGTH,
    sort_col=SORT_COLUMN
)

# --------------------------------------------------------------------------
# 4. Save Outputs
# --------------------------------------------------------------------------
np.save(os.path.join(DATA_DIR, "X_pos.npy"), X_pos)
np.save(os.path.join(DATA_DIR, "X_con.npy"), X_con)

with open(os.path.join(DATA_DIR, "pos_vocab.json"), "w") as f:
    json.dump(pos_vocab, f, indent=2)
with open(os.path.join(DATA_DIR, "con_vocab.json"), "w") as f:
    json.dump(con_vocab, f, indent=2)

print("\nSaved integer-encoded arrays and vocabularies.")
print(f"X_pos shape: {X_pos.shape}")
print(f"X_con shape: {X_con.shape}")

Loading dataset...
All required columns present: ['pos_merged', 'aoi_composition']

Preparing integer-encoded POS sequences...
Encoding categorical column: simplified_pos
 → 3 unique categories

Preparing integer-encoded Content-Word sequences...
Encoding categorical column: is_content_word
 → 2 unique categories

Saved integer-encoded arrays and vocabularies.
X_pos shape: (6, 128)
X_con shape: (6, 128)


## Step 5: Model Training — Stacked Multi-Input BiLSTM (BEyeLSTM)

**Overview**

This step defines and trains the final Hybrid BiLSTM (BEyeLSTM) model,
which integrates fixation-based, syntactic, and semantic information through
three modality-specific subnetworks.
Each branch (Fixation, Part-of-Speech, and Content-Word) consists of two
stacked Bidirectional LSTM layers followed by fully connected layers,
before being merged into a unified representation for classification.

Training is performed using grouped cross-validation (`GroupKFold`) to ensure
participant-level independence across folds.
The evaluation includes standard performance metrics such as Accuracy, AUC,
PR-AUC, Precision, Recall, and F1-score for both expert and non-expert classes.

**Input**
- Preprocessed sequential feature arrays and label data (`PATHS["inputs"]`).

**Output**
- Cross-validation performance summary (`PATHS["outputs"]`).
- Trained model checkpoints and logs (`PATHS["outputs"]`, `PATHS["logs"]`).

In [57]:
# ==============================================================================
# 1. REPRODUCIBILITY CONTROL
# ==============================================================================
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Suppress INFO and WARNING messages

# Fix all random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
tf.keras.utils.set_random_seed(42)


# ==============================================================================
# 2. MODEL DEFINITION
# ==============================================================================

def build_hybrid_bilstm_model(
    fixation_shape: tuple,
    pos_vocab_size: int,
    content_vocab_size: int,
    sequence_length: int = 128,
    lstm_units: int = 16,
    embedding_dim: int = 8,
    dropout_rate: float = 0.5,
    dense_units_branch: tuple = (32, 16)
) -> tf.keras.Model:
    """
    Build a hybrid BiLSTM model combining fixation-based, syntactic, and
    content-level information. The architecture corresponds to the "Hybrid
    BiLSTM" described in Chapter 5.1 of the thesis.

    Parameters
    ----------
    fixation_shape : tuple
        Shape of fixation input (timesteps, features).
    pos_vocab_size : int
        Vocabulary size for POS tokens.
    content_vocab_size : int
        Vocabulary size for content word tokens.
    sequence_length : int, optional
        Sequence length for token inputs (default=128).
    lstm_units : int, optional
        Number of LSTM units per direction (default=16).
    embedding_dim : int, optional
        Embedding dimension for token branches (default=8).
    dropout_rate : float, optional
        Dropout rate for regularization (default=0.5).
    dense_units_branch : tuple, optional
        Dense-layer configuration for each subnetwork (default=(32,16)).

    Returns
    -------
    model : tf.keras.Model
        Compiled Keras model ready for training and evaluation.
    """

    # ----- Fixation branch (AOI-level eye-tracking features) -----
    fixation_input = Input(shape=fixation_shape, name="fixation_input")
    fix = Bidirectional(LSTM(lstm_units, return_sequences=True), name="fix_bilstm_1")(fixation_input)
    fix = Bidirectional(LSTM(lstm_units), name="fix_bilstm_2")(fix)
    fix = Dropout(dropout_rate)(fix)
    fix = Dense(dense_units_branch[0], activation="relu", name="fix_dense_1")(fix)
    fix = Dropout(dropout_rate)(fix)
    fix = Dense(dense_units_branch[1], activation="relu", name="fix_dense_2")(fix)

    # ----- POS branch (syntactic representation) -----
    pos_input = Input(shape=(sequence_length,), name="pos_input")
    pos = Embedding(input_dim=pos_vocab_size, output_dim=embedding_dim, name="pos_embedding")(pos_input)
    pos = Bidirectional(LSTM(lstm_units, return_sequences=True), name="pos_bilstm_1")(pos)
    pos = Bidirectional(LSTM(lstm_units), name="pos_bilstm_2")(pos)
    pos = Dropout(dropout_rate)(pos)
    pos = Dense(dense_units_branch[0], activation="relu", name="pos_dense_1")(pos)
    pos = Dropout(dropout_rate)(pos)
    pos = Dense(dense_units_branch[1], activation="relu", name="pos_dense_2")(pos)

    # ----- Content-word branch (semantic information) -----
    content_input = Input(shape=(sequence_length,), name="content_input")
    con = Embedding(input_dim=content_vocab_size, output_dim=embedding_dim, name="content_embedding")(content_input)
    con = Bidirectional(LSTM(lstm_units, return_sequences=True), name="content_bilstm_1")(con)
    con = Bidirectional(LSTM(lstm_units), name="content_bilstm_2")(con)
    con = Dropout(dropout_rate)(con)
    con = Dense(dense_units_branch[0], activation="relu", name="content_dense_1")(con)
    con = Dropout(dropout_rate)(con)
    con = Dense(dense_units_branch[1], activation="relu", name="content_dense_2")(con)

    # ----- Fusion and classification head -----
    merged_features = concatenate([fix, pos, con], name="merged_features")
    classifier = Dense(32, activation="relu", name="classifier_dense")(merged_features)
    output = Dense(1, activation="sigmoid", name="output")(classifier)

    model = Model(inputs=[fixation_input, pos_input, content_input], outputs=output, name="HybridBiLSTM")
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


# ==============================================================================
# 3. CONFIGURATION
# ==============================================================================

DATA_DIR = PATHS["inputs"]
RESULTS_DIR = PATHS["outputs"]
LOG_DIR = PATHS["logs"]
os.makedirs(RESULTS_DIR, exist_ok=True)

# File paths (as described in Chapter 3: Data)
FIXATION_PATH = os.path.join(DATA_DIR, "X_fix.npy")
POS_PATH = os.path.join(DATA_DIR, "X_pos.npy")
CONTENT_PATH = os.path.join(DATA_DIR, "X_con.npy")
LABELS_PATH = os.path.join(DATA_DIR, "y_labels.npy")
PARTICIPANT_IDS_PATH = os.path.join(DATA_DIR, "participant_ids.npy")

# Cross-validation and training parameters (Chapter 4.3)
NUM_FOLDS = 5
EPOCHS = 100
BATCH_SIZE = 32
EARLY_STOP_PATIENCE = 10
RANDOM_SEED = 42


# ==============================================================================
# 4. DATA LOADING
# ==============================================================================

print("Loading input arrays...")
X_fix = np.load(FIXATION_PATH)
X_pos = np.load(POS_PATH)
X_content = np.load(CONTENT_PATH)
y = np.load(LABELS_PATH)
participant_ids = np.load(PARTICIPANT_IDS_PATH)

# Vocabulary sizes derived from maximum token ID
pos_vocab_size = int(X_pos.max()) + 1
content_vocab_size = int(X_content.max()) + 1

print(f"POS vocabulary size: {pos_vocab_size}")
print(f"Content-word vocabulary size: {content_vocab_size}")


import csv

# ==============================================================================
# 5. GROUPED K-FOLD CROSS-VALIDATION TRAINING
# ==============================================================================

print(f"\nStarting {NUM_FOLDS}-fold grouped cross-validation...")
unique_participants = np.unique(participant_ids)
NUM_FOLDS = min(NUM_FOLDS, len(unique_participants))

gkf = GroupKFold(n_splits=NUM_FOLDS)
print(f"Using {NUM_FOLDS}-fold cross-validation for {len(unique_participants)} participants.")
cv_results = []

for fold_idx, (train_idx, test_idx) in enumerate(gkf.split(X_fix, y, groups=participant_ids), start=1):
    print(f"\n--- Fold {fold_idx}/{NUM_FOLDS} ---")

    # Participant-independent data split
    X_train = [X_fix[train_idx], X_pos[train_idx], X_content[train_idx]]
    X_test = [X_fix[test_idx], X_pos[test_idx], X_content[test_idx]]
    y_train, y_test = y[train_idx], y[test_idx]

    # Ensure model independence between folds
    tf.keras.backend.clear_session()
    model = build_hybrid_bilstm_model(
        fixation_shape=(X_fix.shape[1], X_fix.shape[2]),
        pos_vocab_size=pos_vocab_size,
        content_vocab_size=content_vocab_size
    )

    # Define callbacks
    fold_log_dir = os.path.join(PATHS["logs"], f"fold_{fold_idx}")
    os.makedirs(fold_log_dir, exist_ok=True)

    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=EARLY_STOP_PATIENCE,
        restore_best_weights=True
    )

    tensorboard_cb = tf.keras.callbacks.TensorBoard(
        log_dir=fold_log_dir,
        histogram_freq=0,
        write_graph=False,
        write_images=False
    )

    # ----- Training -----
    print("Training model...")
    model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping, tensorboard_cb],
        verbose=2
    )

    # ----- Evaluation -----
    print("Evaluating model...")
    y_pred_proba = model.predict(X_test).ravel()
    y_pred = (y_pred_proba >= 0.5).astype(int)

    loss, acc = model.evaluate(X_test, y_test, verbose=0)

    # Handle missing-class folds
    if len(np.unique(y_test)) < 2:
        print(f"Fold {fold_idx}: only one class present in y_test — skipping AUC.")
        auc = np.nan
    else:
        auc = roc_auc_score(y_test, y_pred_proba)

    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, labels=[0, 1], zero_division=0
    )
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Ensure confusion matrix is 2×2 even if one class missing
    if cm.shape != (2, 2):
        full_cm = np.zeros((2, 2), dtype=int)
        for i, label in enumerate(np.unique(y_test)):
            full_cm[label, label] = cm[i, i]
        cm = full_cm

    # Pad metrics for absent classes
    while len(prec) < 2:
        prec, rec, f1 = np.append(prec, 0.0), np.append(rec, 0.0), np.append(f1, 0.0)

    cv_results.append({
        "fold": fold_idx,
        "accuracy": round(acc, 4),
        "auc": round(auc, 4) if not np.isnan(auc) else "",
        "loss": round(loss, 4),
        "precision_expert": round(prec[1], 4),
        "recall_expert": round(rec[1], 4),
        "f1_expert": round(f1[1], 4),
        "precision_nonexpert": round(prec[0], 4),
        "recall_nonexpert": round(rec[0], 4),
        "f1_nonexpert": round(f1[0], 4),
        "cm_TN": int(cm[0, 0]),
        "cm_FP": int(cm[0, 1]),
        "cm_FN": int(cm[1, 0]),
        "cm_TP": int(cm[1, 1]),
    })

    fold_model_path = os.path.join(PATHS["outputs"], f"hybrid_bilstm_fold{fold_idx}.keras")
    model.save(fold_model_path)


# ==============================================================================
# 6. AGGREGATE RESULTS AND SAVE OUTPUTS
# ==============================================================================

print("\n--- Cross-validation complete ---")
results_df = pd.DataFrame(cv_results)

# Compute summary stats
summary = results_df.describe().transpose()[["mean", "std"]].round(3)
summary.index.name = "metric"
print("\n--- Summary ---")
print(summary)

# Save detailed results safely
results_csv_path = os.path.join(PATHS["outputs"], "hybrid_bilstm_cv_results.csv")
results_df.to_csv(
    results_csv_path,
    sep=",",
    index=False,
    quoting=csv.QUOTE_MINIMAL,
    float_format="%.4f"
)
print(f"Detailed results saved to {results_csv_path}")

# Save summary table
summary_csv_path = os.path.join(PATHS["outputs"], "hybrid_bilstm_summary.csv")
summary.to_csv(summary_csv_path)
print(f"Summary statistics saved to {summary_csv_path}")

# Save final model instance
final_model_path = os.path.join(PATHS["outputs"], "hybrid_bilstm_final.keras")
model.save(final_model_path)
print(f"Final model saved to {final_model_path}")

Loading input arrays...
POS vocabulary size: 3
Content-word vocabulary size: 2

Starting 5-fold grouped cross-validation...
Using 3-fold cross-validation for 3 participants.

--- Fold 1/3 ---
Training model...
Epoch 1/100
1/1 - 9s - loss: 0.6820 - accuracy: 0.7500 - val_loss: 0.6750 - val_accuracy: 1.0000 - 9s/epoch - 9s/step
Epoch 2/100
1/1 - 0s - loss: 0.6621 - accuracy: 0.7500 - val_loss: 0.6740 - val_accuracy: 1.0000 - 74ms/epoch - 74ms/step
Epoch 3/100
1/1 - 0s - loss: 0.7029 - accuracy: 0.2500 - val_loss: 0.6748 - val_accuracy: 1.0000 - 69ms/epoch - 69ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6579 - accuracy: 1.0000 - val_loss: 0.6742 - val_accuracy: 1.0000 - 68ms/epoch - 68ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6986 - accuracy: 0.5000 - val_loss: 0.6733 - val_accuracy: 1.0000 - 68ms/epoch - 68ms/step
Epoch 6/100
1/1 - 0s - loss: 0.6581 - accuracy: 0.7500 - val_loss: 0.6714 - val_accuracy: 1.0000 - 68ms/epoch - 68ms/step
Epoch 7/100
1/1 - 0s - loss: 0.6880 - accuracy: 0.5000 - val_l

## Step 6: Evaluation and Aggregation of Cross-Validation Results

**Overview**

This step aggregates and compares BiLSTM cross-validation results across all
experiment versions (e.g., *manual selection* and *feature analysis*).
It scans the `sequential_classification/experiment_*/outputs/` directories for
`hybrid_bilstm_cv_results.csv` files, computes mean ± standard-deviation metrics,
and exports a unified summary table.

**Input**
- `sequential_classification/experiment_manual_selection/outputs/hybrid_bilstm_cv_results.csv`
- `sequential_classification/experiment_feature_analysis/outputs/hybrid_bilstm_cv_results.csv`

**Output**
- Combined comparison table
  → `{PROJECT_ROOT}/sequential_classification/bilstm_summary_comparison.csv`

In [58]:
# ==============================================================================
# 1. CONFIGURATION
# ==============================================================================
PROJECT_ROOT = PATHS["project_root"]
SEQ_CLASS_DIR = os.path.join(PROJECT_ROOT, "sequential_classification")
SUMMARY_FILE = os.path.join(SEQ_CLASS_DIR, "bilstm_summary_comparison.csv")

# Pattern to find result CSVs in both experiment subdirectories
pattern = os.path.join(SEQ_CLASS_DIR, "experiment_*", "outputs", "hybrid_bilstm_cv_results.csv")
result_files = glob.glob(pattern)

print(f"Found {len(result_files)} result file(s) matching pattern:\n  {pattern}")

# ==============================================================================
# 2. LOAD AND SUMMARIZE
# ==============================================================================
if not result_files:
    print("No result files found. Skipping summary generation.")
else:
    summary_list = []

    for file in result_files:
        exp_name = os.path.basename(os.path.dirname(os.path.dirname(file)))  # e.g. 'experiment_manual_selection'
        df = pd.read_csv(file)

        # Determine which metrics exist before summarizing
        available_metrics = [col for col in ["accuracy", "auc", "precision_expert", "recall_expert", "f1_expert"] if col in df.columns]

        summary = {
            "experiment": exp_name,
            "file": os.path.basename(file),
        }
        for metric in available_metrics:
            summary[f"mean_{metric}"] = df[metric].mean()
            summary[f"std_{metric}"] = df[metric].std()

        summary_list.append(summary)

    summary_df = pd.DataFrame(summary_list)
    print("\n--- Aggregated Cross-Experiment Summary ---")
    print(summary_df.round(3))

    # ==============================================================================
    # 3. SAVE SUMMARY
    # ==============================================================================
    summary_df.to_csv(SUMMARY_FILE, index=False)
    print(f"\nSummary comparison file saved to: {SUMMARY_FILE}")


Found 2 result file(s) matching pattern:
  /mnt/c/Users/Consti/PycharmProjects/BachelorCode/sequential_classification/experiment_*/outputs/hybrid_bilstm_cv_results.csv

--- Aggregated Cross-Experiment Summary ---
                    experiment                          file  mean_accuracy  \
0  experiment_feature_analysis  hybrid_bilstm_cv_results.csv        0.50000   
1  experiment_manual_selection  hybrid_bilstm_cv_results.csv        1.00000   

   std_accuracy  mean_auc  std_auc  mean_precision_expert  \
0       0.50000       NaN      NaN                0.00000   
1       0.00000       NaN      NaN                0.33300   

   std_precision_expert  mean_recall_expert  std_recall_expert  \
0               0.00000             0.00000            0.00000   
1               0.57700             0.33300            0.57700   

   mean_f1_expert  std_f1_expert  
0         0.00000        0.00000  
1         0.33300        0.57700  

Summary comparison file saved to: /mnt/c/Users/Consti/Pychar