In [1]:
# ===== Mount Google Drive =====
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# WITH CONTEXT

## Dataset Generation

In [2]:
# ===== Imports =====
import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# ===== Paths =====
INPUT_CSV = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/trialdata74.csv"
OUT_DIR   = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset"
os.makedirs(OUT_DIR, exist_ok=True)

# ===== Config =====
RNG_SEED = 42 # reproducibility
VAL_RATIO = 0.10 # 10% validation
WINDOW_SIZE = 4 # seq length for context window
NO_CONTEXT = False # True -> trial 4 only (no-context baseline)
MAX_SESSIONS = 20000 # limit number of sessions for speed; set None for all
KEEP_RT_COL = True # export human RT into meta for later calibration
DROP_NAN_RT_META = False # if True, drop rows with NaN RT from meta (sequence still saved)

# ===== Tokens (4 directions + '0' + '=>') =====
TOKENS = ['R', 'U', 'L', 'D', '0', '=>']  # NOTE: label is response_direction ∈ {R,U,L,D}
stoi = {tok: i for i, tok in enumerate(TOKENS)}
itos = {i: tok for tok, i in stoi.items()}

# ===== Layout mapping (layout_id → 5 positions in 5x5) =====
LAYOUTS = {
    0: [  2,  7, 12, 17, 22 ], # Vertical
    1: [ 10, 11, 12, 13, 14 ], # Horizontal
    2: [  4,  7, 10, 17, 24 ], # Left hook
    3: [  0,  7, 14, 17, 20 ], # Right hook
    4: [ 11, 13,  2, 20, 24 ], # Top hook
    5: [  0,  4, 22, 11, 13 ], # Bottom hook
    6: [ 10,  2, 12, 14, 22 ], # Surround
}

# ===== Helpers =====
def build_matrix(layout_id, target_dir, flanker_dir):
    """Build a 5x5 matrix (flattened to length 25) for one trial."""
    mat = ['0'] * 25
    pos = LAYOUTS[int(layout_id)]
    # pos[2] is target; others are flankers
    mat[pos[0]] = flanker_dir
    mat[pos[1]] = flanker_dir
    mat[pos[2]] = target_dir
    mat[pos[3]] = flanker_dir
    mat[pos[4]] = flanker_dir
    return mat

def encode_sequence(trials_tokens, label_token):
    """Flatten trials (each 25 tokens) + '=>' + label into token ids."""
    seq = []
    for t in trials_tokens:
        seq.extend(t)
    seq.append('=>')
    seq.append(label_token)
    return [stoi[t] for t in seq]

def is_congruent_row(target_dir, flanker_dir):
    return str(target_dir) == str(flanker_dir)

def valid_dir(tok):
    return str(tok) in {'R','U','L','D'}

# ===== Load RAW CSV =====
dtype_map = {
    'game_result_id': 'int64',
    'trial_num': 'int64',
    'target_direction': 'string',
    'flanker_direction': 'string',
    'stimulus_layout': 'int64',
    'response_direction': 'string',
    'correct': 'string',
    'response_time': 'float64',
    'user_id': 'int64',
}
print("Loading raw CSV ...")
df = pd.read_csv(INPUT_CSV, dtype=dtype_map)

# keep only complete direction rows
df = df[ df['target_direction'].apply(valid_dir)
       & df['flanker_direction'].apply(valid_dir)
       & df['response_direction'].apply(valid_dir) ].copy()

# sort by user/session/trial
df = df.sort_values(by=['user_id', 'game_result_id', 'trial_num']).reset_index(drop=True)
print(f"✅ Loaded rows after filtering: {len(df):,}")

# ===== Build sequences =====
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

encoded_sequences = []
meta_rows = []

grouped = df.groupby(['game_result_id'], sort=False)
total_sessions = df['game_result_id'].nunique()
limit_sessions = min(MAX_SESSIONS, total_sessions) if MAX_SESSIONS is not None else total_sessions

print(f"Building sequences (WINDOW_SIZE={WINDOW_SIZE}, NO_CONTEXT={NO_CONTEXT}) ...")
for i, (gid, g) in enumerate(tqdm(grouped, total=total_sessions, desc="Sessions")):
    if i >= limit_sessions:
        break

    rows = g.to_dict('records')
    if len(rows) < WINDOW_SIZE:
        continue

    for end_idx in range(WINDOW_SIZE-1, len(rows)):
        start_idx = end_idx - (WINDOW_SIZE - 1)
        window_rows = rows[start_idx:end_idx+1]
        use_rows = [window_rows[-1]] if NO_CONTEXT else window_rows

        trials_tokens = []
        skip = False
        for r in use_rows:
            try:
                mat = build_matrix(r['stimulus_layout'], r['target_direction'], r['flanker_direction'])
            except Exception as e:
                skip = True
                break
            trials_tokens.append(mat)
        if skip:
            continue

        # label is the *human response* on trial 4
        label_token = str(window_rows[-1]['response_direction'])
        token_ids = encode_sequence(trials_tokens, label_token)
        encoded_sequences.append(token_ids)

        # meta for trial 4 (end of window)
        t4 = window_rows[-1]
        rt_val = float(t4['response_time']) if pd.notna(t4['response_time']) else math.nan

        meta_rows.append({
            "user_id":             int(t4['user_id']),
            "game_result_id":      int(t4['game_result_id']),
            "end_trial_num":       int(t4['trial_num']),
            "context_used":        int(not NO_CONTEXT),
            "trial4_layout":       int(t4['stimulus_layout']),
            "trial4_target":       str(t4['target_direction']),
            "trial4_flanker":      str(t4['flanker_direction']),
            "trial4_is_congruent": int(is_congruent_row(t4['target_direction'], t4['flanker_direction'])),
            "trial4_response":     str(t4['response_direction']),
            "trial4_correct":      1 if str(t4['correct']).upper() == 'T' else 0,
            "trial4_response_time": rt_val if KEEP_RT_COL else np.nan,
            # raw text (optional, helpful for debugging; keep or drop as needed)
            "trial_1_4_text": " ".join([tok for trial in trials_tokens for tok in trial]),
            "label_response_token_id": int(stoi[label_token]),
        })

print(f"✅ Total sequences built: {len(encoded_sequences):,}")

# ===== Sanity: length alignment =====
assert len(encoded_sequences) == len(meta_rows), "encoded_sequences and meta_rows length mismatch!"

# ===== Shuffle & Split with aligned meta =====
print("Shuffling and splitting with aligned meta ...")
N = len(encoded_sequences)
perm = np.random.RandomState(RNG_SEED).permutation(N)
encoded_sequences = [encoded_sequences[i] for i in perm]
meta_rows        = [meta_rows[i]        for i in perm]

split_idx = int(N * (1 - VAL_RATIO))
train_set = encoded_sequences[:split_idx]
val_set   = encoded_sequences[split_idx:]

train_meta = meta_rows[:split_idx]
val_meta   = meta_rows[split_idx:]

# (Optional) drop rows with NaN RT from meta ONLY (sequence remains; usually keep them)
if DROP_NAN_RT_META:
    def _drop_nan_rt(meta_list):
        kept = []
        for m in meta_list:
            rt = m.get("trial4_response_time", np.nan)
            if not (isinstance(rt, float) and math.isnan(rt)):
                kept.append(m)
        return kept
    train_meta = _drop_nan_rt(train_meta)
    val_meta   = _drop_nan_rt(val_meta)

# ===== Save .npy =====
np.save(os.path.join(OUT_DIR, 'train.npy'), np.array(train_set, dtype=np.int32))
np.save(os.path.join(OUT_DIR, 'val.npy'),   np.array(val_set,   dtype=np.int32))
print(f"Saved .npy to: {OUT_DIR}")
print(f"train.npy: {len(train_set):,} | val.npy: {len(val_set):,}")

# ===== Save aligned meta CSV =====
train_meta_df = pd.DataFrame(train_meta)
val_meta_df   = pd.DataFrame(val_meta)

train_meta_path = os.path.join(OUT_DIR, 'train_meta.csv')
val_meta_path   = os.path.join(OUT_DIR, 'val_meta.csv')
train_meta_df.to_csv(train_meta_path, index=False)
val_meta_df.to_csv(val_meta_path, index=False)
print("Saved aligned meta CSV:")
print("   -", train_meta_path)
print("   -", val_meta_path)

# ===== Save vocab =====
with open(os.path.join(OUT_DIR, 'vocab.txt'), 'w') as f:
    f.write(",".join(TOKENS))
print("Saved vocab.txt")

# ===== Quick preview =====
print("\nPreview meta (val):")
print(val_meta_df.head(5).to_string(index=False))

print("\n✅ DONE.")

Loading raw CSV ...
✅ Loaded rows after filtering: 11,245,819
Building sequences (WINDOW_SIZE=4, NO_CONTEXT=False) ...


Sessions:  10%|▉         | 20000/201894 [00:37<05:43, 529.70it/s]


✅ Total sequences built: 1,070,621
Shuffling and splitting with aligned meta ...
Saved .npy to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset
train.npy: 963,558 | val.npy: 107,063
Saved aligned meta CSV:
   - /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset/train_meta.csv
   - /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset/val_meta.csv
Saved vocab.txt

Preview meta (val):
 user_id  game_result_id  end_trial_num  context_used  trial4_layout trial4_target trial4_flanker  trial4_is_congruent trial4_response  trial4_correct  trial4_response_time                                                                                                                                                                                          trial_1_4_text  label_response_token_id
   11983      3435201562             35             1              6             U              D                    0               U               1                1007.0 0

## Training

In [3]:
# ===== Imports =====
import os
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
import sys

# Add model path
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Flanker-GPT')
from model import GPT, GPTConfig

# ===== Paths =====
data_dir = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset"
out_dir = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/checkpoints"
os.makedirs(out_dir, exist_ok=True)

# ===== Settings =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
block_size = 101  # 4 trials × 25 tokens + 1 label
max_iters = 2000
eval_interval = 200
learning_rate = 3e-4
vocab_size = 6  # ['R','U','L','D','0','=>']

# ===== Load Dataset =====
train_data = np.load(os.path.join(data_dir, 'train.npy'))
val_data = np.load(os.path.join(data_dir, 'val.npy'))

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = np.random.randint(len(data), size=batch_size)
    x = torch.tensor(np.stack([d[:-1] for d in data[ix]]), dtype=torch.long, device=device)
    y = torch.tensor([d[-1] for d in data[ix]], dtype=torch.long, device=device)
    return x, y

# ===== Create Model =====
config = GPTConfig(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    n_embd=128
)
model = GPT(config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# ===== Eval Loss =====
@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(10)
        for k in range(10):
            X, Y = get_batch(split)
            logits, _ = model(X)
            logits = logits[:, -1, :]  # use only last position
            loss = F.cross_entropy(logits, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# ===== Train Loop =====
print("Training Human-Trained GPT-Flanker model...")
for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, _ = model(xb)
    logits = logits[:, -1, :]  # Predict only the final token
    loss = F.cross_entropy(logits, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# ===== Save Model =====
ckpt_path = os.path.join(out_dir, "flanker_gpt_human.pt")
torch.save(model.state_dict(), ckpt_path)
print(f"✅ Model saved to {ckpt_path}")

number of parameters: 0.79M
Training Human-Trained GPT-Flanker model...
Step 0: train loss 1.9709, val loss 1.9768
Step 200: train loss 0.2331, val loss 0.1650
Step 400: train loss 0.1259, val loss 0.1522
Step 600: train loss 0.1137, val loss 0.1261
Step 800: train loss 0.1660, val loss 0.1294
Step 1000: train loss 0.1737, val loss 0.1658
Step 1200: train loss 0.1393, val loss 0.1629
Step 1400: train loss 0.1241, val loss 0.1422
Step 1600: train loss 0.1312, val loss 0.1086
Step 1800: train loss 0.1041, val loss 0.2276
Step 1999: train loss 0.1474, val loss 0.0935
✅ Model saved to /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/checkpoints/flanker_gpt_human.pt


## Evaluation

In [4]:
# ===== Imports =====
import os, sys, csv, math
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

# Add model path
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Flanker-GPT')
from model import GPT, GPTConfig

# ===== Paths =====
base_dir   = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT"
data_dir   = f"{base_dir}/human_dataset"
ckpt_path  = f"{base_dir}/checkpoints/flanker_gpt_human.pt"
val_npy    = os.path.join(data_dir, "val.npy")
val_meta   = os.path.join(data_dir, "val_meta.csv")
vocab_txt  = os.path.join(data_dir, "vocab.txt")  # optional check
output_csv = f"{base_dir}/gpt_human_val_predictions_stopping.csv"

# ===== Settings =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
TOKENS = ['R','U','L','D','0','=>']  # must match builder TOKENS
RESPONSE_TOKENS = ['R','U','L','D']  # restrict sampling to directions
stoi = {tok: i for i, tok in enumerate(TOKENS)}
itos = {i: tok for i, tok in enumerate(TOKENS)}
response_token_ids = [stoi[t] for t in RESPONSE_TOKENS]

# ----- Stopping-Rule Params -----
delta = 3          # stop when (top_count - second_count) >= delta
max_samples = 100  # safety cap
restrict_to_response_tokens = True

# ===== Load Model =====
# infer block_size from val.npy to support with/no-context automatically
val_data = np.load(val_npy)
block_size = val_data.shape[1] - 1
vocab_size = len(TOKENS)

config = GPTConfig(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    n_embd=128
)
model = GPT(config).to(device)
model.load_state_dict(torch.load(ckpt_path, map_location=device))
model.eval()
print("✅ Loaded Human-trained GPT model")
print(f"Loaded {len(val_data)} validation samples | block_size={block_size}")

# ===== Load aligned meta (for congruency & human RT) =====
assert os.path.exists(val_meta), f"val_meta.csv not found at {val_meta}"
meta = pd.read_csv(val_meta)
assert len(meta) == len(val_data), "val_meta.csv must align with val.npy length"

# ===== Utils =====
def calculate_entropy(prob_dist_np):
    return -float(np.sum(prob_dist_np * np.log(prob_dist_np + 1e-12)))

def sample_until_threshold(probs_full, delta=3, max_samples=100, restrict_ids=None):
    """
    probs_full: 1D torch tensor over full vocab (sum=1)
    restrict_ids: list of token ids to sample from (renormalize over this subset)
    Returns: winner_id, k_used, stopped_bool, gap_at_stop
    """
    if restrict_ids is not None:
        sub = probs_full[restrict_ids]
        sub = sub / sub.sum()
        id_map = restrict_ids
        def draw():
            idx_local = torch.multinomial(sub, num_samples=1, replacement=True).item()
            return id_map[idx_local]
    else:
        def draw():
            return torch.multinomial(probs_full, num_samples=1, replacement=True).item()

    counts = {}
    for s in range(1, max_samples + 1):
        tok = draw()
        counts[tok] = counts.get(tok, 0) + 1

        sorted_pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        top_count = sorted_pairs[0][1]
        second_count = sorted_pairs[1][1] if len(sorted_pairs) > 1 else 0
        gap = top_count - second_count
        if gap >= delta:
            winner_id = sorted_pairs[0][0]
            return winner_id, s, True, gap

    # fallback: argmax over restricted or full distribution
    if restrict_ids is not None:
        sub = probs_full[restrict_ids]
        winner_local = torch.argmax(sub).item()
        winner_id = restrict_ids[winner_local]
    else:
        winner_id = torch.argmax(probs_full).item()

    if len(counts) > 0:
        sorted_pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        top_count = sorted_pairs[0][1]
        second_count = sorted_pairs[1][1] if len(sorted_pairs) > 1 else 0
        gap = top_count - second_count
    else:
        gap = 0
    return winner_id, max_samples, False, gap

# ===== Evaluate =====
records = []
correct = 0

for i in range(len(val_data)):
    seq = torch.tensor(val_data[i][:-1], dtype=torch.long, device=device).unsqueeze(0)
    true_id = int(val_data[i][-1])
    true_tok = itos[true_id]

    with torch.no_grad():
        logits, _ = model(seq)
        logits = logits[:, -1, :]  # only final position
        probs = F.softmax(logits, dim=-1).squeeze(0).cpu()

        winner_id, k_used, stopped, gap_at_stop = sample_until_threshold(
            probs_full=probs,
            delta=delta,
            max_samples=max_samples,
            restrict_ids=response_token_ids if restrict_to_response_tokens else None
        )
        pred_tok = itos[winner_id]

        is_correct = (winner_id == true_id)
        if is_correct:
            correct += 1

        confidence = float(probs[winner_id].item())
        prob_true  = float(probs[true_id].item())
        entropy    = calculate_entropy(probs.numpy())

    # meta from aligned CSV
    is_congruent = int(meta.loc[i, "trial4_is_congruent"])
    trial4_layout = int(meta.loc[i, "trial4_layout"])
    human_rt = float(meta.loc[i, "trial4_response_time"]) if "trial4_response_time" in meta.columns else np.nan

    row = {
        "example": i + 1,
        "true_response": true_tok,
        "predicted_response": pred_tok,
        "is_correct": bool(is_correct),

        "confidence": round(confidence, 6),
        "prob_of_true_token": round(prob_true, 6),
        "entropy": round(entropy, 6),

        # stopping diagnostics
        "k_samples": int(k_used),
        "stopped_by_delta": bool(stopped),
        "gap_at_stop": int(gap_at_stop),
        "delta": int(delta),
        "max_samples": int(max_samples),

        # meta
        "is_congruent": is_congruent,
        "trial4_layout": trial4_layout,
        "human_rt_ms": human_rt,
    }

    # per-token probs (directions only)
    for tok in RESPONSE_TOKENS:
        row[f"{tok}_prob"] = round(float(probs[stoi[tok]].item()), 6)

    records.append(row)

# ===== Save Results =====
df = pd.DataFrame(records)
df.to_csv(output_csv, index=False)
acc = correct / len(val_data) * 100.0
print("\n✅ Stopping-rule evaluation (human) complete.")
print(f"Accuracy = {acc:.2f}% (Δ={delta}, max={max_samples})")
print(f"Results saved to: {output_csv}")

number of parameters: 0.79M
✅ Loaded Human-trained GPT model
Loaded 107063 validation samples | block_size=101

✅ Stopping-rule evaluation (human) complete.
Accuracy = 97.21% (Δ=3, max=100)
Results saved to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/gpt_human_val_predictions_stopping.csv


### Congruency Effect Analysis

In [5]:
# ===== Analyze Congruency Effect (No RT Version) =====
# This script evaluates accuracy, entropy, and k_samples by congruency.
# It assumes that your prediction CSV has: is_congruent, is_correct, entropy, confidence, k_samples

import os, numpy as np, pandas as pd

# ==== INPUT (update to the model you want to analyze) ====
base_dir   = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT"
pred_csv   = f"{base_dir}/gpt_human_val_predictions_stopping.csv"

out_dir    = os.path.join(base_dir, "congruency_effect")
os.makedirs(out_dir, exist_ok=True)

# ==== LOAD ====
df = pd.read_csv(pred_csv)
req = {"is_congruent","is_correct","entropy","confidence","k_samples"}
missing = list(req - set(df.columns))
assert not missing, f"Missing columns: {missing}"

# ==== BASIC SUMMARIES ====
acc_overall = df["is_correct"].mean()*100
acc_by_c = df.groupby("is_congruent")["is_correct"].mean().rename({0:"incongruent",1:"congruent"})*100

ent_by_c = df.groupby("is_congruent")["entropy"].mean().rename({0:"incongruent",1:"congruent"})
k_by_c   = df.groupby("is_congruent")["k_samples"].mean().rename({0:"incongruent",1:"congruent"})

print("=== Congruency Effect (Model, No RT) ===")
print(f"Accuracy overall: {acc_overall:.2f}%")
print(f"Accuracy by congruency (%):\n{acc_by_c.to_string()}")
print(f"\nEntropy mean by congruency:\n{ent_by_c.to_string()}")
print(f"\nk_samples mean by congruency:\n{k_by_c.to_string()}")

# ==== SAVE TABLE ====
rows = []
rows.append({"metric":"Accuracy_overall_%", "value": acc_overall})
rows.append({"metric":"Accuracy_congruent_%", "value": acc_by_c.get("congruent", np.nan)})
rows.append({"metric":"Accuracy_incongruent_%", "value": acc_by_c.get("incongruent", np.nan)})
rows.append({"metric":"Entropy_mean_congruent", "value": ent_by_c.get("congruent", np.nan)})
rows.append({"metric":"Entropy_mean_incongruent", "value": ent_by_c.get("incongruent", np.nan)})
rows.append({"metric":"k_mean_congruent", "value": k_by_c.get("congruent", np.nan)})
rows.append({"metric":"k_mean_incongruent", "value": k_by_c.get("incongruent", np.nan)})

out_csv = os.path.join(out_dir, os.path.basename(pred_csv).replace(".csv","_congruency_stats_NoRT.csv"))
pd.DataFrame(rows).to_csv(out_csv, index=False)

print("\nSaved:", out_csv)

=== Congruency Effect (Model, No RT) ===
Accuracy overall: 97.21%
Accuracy by congruency (%):
is_congruent
incongruent    95.389022
congruent      99.015788

Entropy mean by congruency:
is_congruent
incongruent    0.205579
congruent      0.090269

k_samples mean by congruency:
is_congruent
incongruent    3.294631
congruent      3.081887

Saved: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/congruency_effect/gpt_human_val_predictions_stopping_congruency_stats_NoRT.csv


# WITHOUT CONTEXT

## Dataset Generation

In [6]:
# ===== Imports =====
import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# ===== Paths =====
INPUT_CSV = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/trialdata74.csv"
OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx"
os.makedirs(OUT_DIR, exist_ok=True)

# ===== Config =====
RNG_SEED = 42 # reproducibility
VAL_RATIO = 0.10 # 10% validation
WINDOW_SIZE = 4 # seq length for context window
NO_CONTEXT = True # True -> trial 4 only (no-context baseline) #*
MAX_SESSIONS = 20000 # limit number of sessions for speed; set None for all
KEEP_RT_COL = True # export human RT into meta for later calibration
DROP_NAN_RT_META = False # if True, drop rows with NaN RT from meta (sequence still saved)

# ===== Tokens (4 directions + '0' + '=>') =====
TOKENS = ['R', 'U', 'L', 'D', '0', '=>']  # NOTE: label is response_direction ∈ {R,U,L,D}
stoi = {tok: i for i, tok in enumerate(TOKENS)}
itos = {i: tok for tok, i in stoi.items()}

# ===== Layout mapping (layout_id → 5 positions in 5x5) =====
LAYOUTS = {
    0: [  2,  7, 12, 17, 22 ],  # Vertical
    1: [ 10, 11, 12, 13, 14 ],  # Horizontal
    2: [  4,  7, 10, 17, 24 ],  # Left hook
    3: [  0,  7, 14, 17, 20 ],  # Right hook
    4: [ 11, 13,  2, 20, 24 ],  # Top hook
    5: [  0,  4, 22, 11, 13 ],  # Bottom hook
    6: [ 10,  2, 12, 14, 22 ],  # Surround
}

# ===== Helpers =====
def build_matrix(layout_id, target_dir, flanker_dir):
    """Build a 5x5 matrix (flattened to length 25) for one trial."""
    mat = ['0'] * 25
    pos = LAYOUTS[int(layout_id)]
    # pos[2] is target; others are flankers
    mat[pos[0]] = flanker_dir
    mat[pos[1]] = flanker_dir
    mat[pos[2]] = target_dir
    mat[pos[3]] = flanker_dir
    mat[pos[4]] = flanker_dir
    return mat

def encode_sequence(trials_tokens, label_token):
    """Flatten trials (each 25 tokens) + '=>' + label into token ids."""
    seq = []
    for t in trials_tokens:
        seq.extend(t)
    seq.append('=>')
    seq.append(label_token)
    return [stoi[t] for t in seq]

def is_congruent_row(target_dir, flanker_dir):
    return str(target_dir) == str(flanker_dir)

def valid_dir(tok):
    return str(tok) in {'R','U','L','D'}

# ===== Load RAW CSV =====
dtype_map = {
    'game_result_id': 'int64',
    'trial_num': 'int64',
    'target_direction': 'string',
    'flanker_direction': 'string',
    'stimulus_layout': 'int64',
    'response_direction': 'string',
    'correct': 'string',
    'response_time': 'float64',
    'user_id': 'int64',
}
print("Loading raw CSV ...")
df = pd.read_csv(INPUT_CSV, dtype=dtype_map)

# keep only complete direction rows
df = df[ df['target_direction'].apply(valid_dir)
       & df['flanker_direction'].apply(valid_dir)
       & df['response_direction'].apply(valid_dir) ].copy()

# sort by user/session/trial
df = df.sort_values(by=['user_id', 'game_result_id', 'trial_num']).reset_index(drop=True)
print(f"✅ Loaded rows after filtering: {len(df):,}")

# ===== Build sequences =====
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

encoded_sequences = []
meta_rows = []

grouped = df.groupby(['game_result_id'], sort=False)
total_sessions = df['game_result_id'].nunique()
limit_sessions = min(MAX_SESSIONS, total_sessions) if MAX_SESSIONS is not None else total_sessions

print(f"Building sequences (WINDOW_SIZE={WINDOW_SIZE}, NO_CONTEXT={NO_CONTEXT}) ...")
for i, (gid, g) in enumerate(tqdm(grouped, total=total_sessions, desc="Sessions")):
    if i >= limit_sessions:
        break

    rows = g.to_dict('records')
    if len(rows) < WINDOW_SIZE:
        continue

    for end_idx in range(WINDOW_SIZE-1, len(rows)):
        start_idx = end_idx - (WINDOW_SIZE - 1)
        window_rows = rows[start_idx:end_idx+1]
        use_rows = [window_rows[-1]] if NO_CONTEXT else window_rows

        trials_tokens = []
        skip = False
        for r in use_rows:
            try:
                mat = build_matrix(r['stimulus_layout'], r['target_direction'], r['flanker_direction'])
            except Exception as e:
                skip = True
                break
            trials_tokens.append(mat)
        if skip:
            continue

        # label is the *human response* on trial 4
        label_token = str(window_rows[-1]['response_direction'])
        token_ids = encode_sequence(trials_tokens, label_token)
        encoded_sequences.append(token_ids)

        # meta for trial 4 (end of window)
        t4 = window_rows[-1]
        rt_val = float(t4['response_time']) if pd.notna(t4['response_time']) else math.nan

        meta_rows.append({
            "user_id":             int(t4['user_id']),
            "game_result_id":      int(t4['game_result_id']),
            "end_trial_num":       int(t4['trial_num']),
            "context_used":        int(not NO_CONTEXT),
            "trial4_layout":       int(t4['stimulus_layout']),
            "trial4_target":       str(t4['target_direction']),
            "trial4_flanker":      str(t4['flanker_direction']),
            "trial4_is_congruent": int(is_congruent_row(t4['target_direction'], t4['flanker_direction'])),
            "trial4_response":     str(t4['response_direction']),
            "trial4_correct":      1 if str(t4['correct']).upper() == 'T' else 0,
            "trial4_response_time": rt_val if KEEP_RT_COL else np.nan,
            # raw text (optional, helpful for debugging; keep or drop as needed)
            "trial_1_4_text": " ".join([tok for trial in trials_tokens for tok in trial]),
            "label_response_token_id": int(stoi[label_token]),
        })

print(f"Total sequences built: {len(encoded_sequences):,}")

# ===== Sanity: length alignment =====
assert len(encoded_sequences) == len(meta_rows), "encoded_sequences and meta_rows length mismatch!"

# ===== Shuffle & Split with aligned meta =====
print("Shuffling and splitting with aligned meta ...")
N = len(encoded_sequences)
perm = np.random.RandomState(RNG_SEED).permutation(N)
encoded_sequences = [encoded_sequences[i] for i in perm]
meta_rows        = [meta_rows[i]        for i in perm]

split_idx = int(N * (1 - VAL_RATIO))
train_set = encoded_sequences[:split_idx]
val_set   = encoded_sequences[split_idx:]

train_meta = meta_rows[:split_idx]
val_meta   = meta_rows[split_idx:]

# (Optional) drop rows with NaN RT from meta ONLY (sequence remains; usually keep them)
if DROP_NAN_RT_META:
    def _drop_nan_rt(meta_list):
        kept = []
        for m in meta_list:
            rt = m.get("trial4_response_time", np.nan)
            if not (isinstance(rt, float) and math.isnan(rt)):
                kept.append(m)
        return kept
    train_meta = _drop_nan_rt(train_meta)
    val_meta   = _drop_nan_rt(val_meta)

# ===== Save .npy =====
np.save(os.path.join(OUT_DIR, 'train.npy'), np.array(train_set, dtype=np.int32))
np.save(os.path.join(OUT_DIR, 'val.npy'),   np.array(val_set,   dtype=np.int32))
print(f"Saved .npy to: {OUT_DIR}")
print(f"train.npy: {len(train_set):,} | val.npy: {len(val_set):,}")

# ===== Save aligned meta CSV =====
train_meta_df = pd.DataFrame(train_meta)
val_meta_df   = pd.DataFrame(val_meta)

train_meta_path = os.path.join(OUT_DIR, 'train_meta.csv')
val_meta_path   = os.path.join(OUT_DIR, 'val_meta.csv')
train_meta_df.to_csv(train_meta_path, index=False)
val_meta_df.to_csv(val_meta_path, index=False)
print("Saved aligned meta CSV:")
print("   -", train_meta_path)
print("   -", val_meta_path)

# ===== Save vocab =====
with open(os.path.join(OUT_DIR, 'vocab.txt'), 'w') as f:
    f.write(",".join(TOKENS))
print("Saved vocab.txt")

# ===== Quick preview =====
print("\n🔎 Preview meta (val):")
print(val_meta_df.head(5).to_string(index=False))

print("\n✅ DONE.")

Loading raw CSV ...
✅ Loaded rows after filtering: 11,245,819
Building sequences (WINDOW_SIZE=4, NO_CONTEXT=True) ...


Sessions:  10%|▉         | 20000/201894 [00:37<05:36, 540.24it/s]


Total sequences built: 1,070,621
Shuffling and splitting with aligned meta ...
Saved .npy to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx
train.npy: 963,558 | val.npy: 107,063
Saved aligned meta CSV:
   - /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx/train_meta.csv
   - /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx/val_meta.csv
Saved vocab.txt

🔎 Preview meta (val):
 user_id  game_result_id  end_trial_num  context_used  trial4_layout trial4_target trial4_flanker  trial4_is_congruent trial4_response  trial4_correct  trial4_response_time                                    trial_1_4_text  label_response_token_id
   11983      3435201562             35             0              6             U              D                    0               U               1                1007.0 0 0 D 0 0 0 0 0 0 0 D 0 U 0 D 0 0 0 0 0 0 0 D 0 0                        1
    2204      3513483047             25             0     

## Training

In [7]:
# ===== Imports =====
import os
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
import sys

# Add model path
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Flanker-GPT')
from model import GPT, GPTConfig

# ===== Paths =====
data_dir = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx" #*
out_dir = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/checkpoints"
os.makedirs(out_dir, exist_ok=True)

# ===== Settings =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
block_size = 26  # 1 trial × 25 tokens + 1 label #*
max_iters = 2000
eval_interval = 200
learning_rate = 3e-4
vocab_size = 6  # ['R','U','L','D','0','=>']

# ===== Load Dataset =====
train_data = np.load(os.path.join(data_dir, 'train.npy'))
val_data = np.load(os.path.join(data_dir, 'val.npy'))

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = np.random.randint(len(data), size=batch_size)
    x = torch.tensor(np.stack([d[:-1] for d in data[ix]]), dtype=torch.long, device=device)
    y = torch.tensor([d[-1] for d in data[ix]], dtype=torch.long, device=device)
    return x, y

# ===== Create Model =====
config = GPTConfig(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    n_embd=128
)
model = GPT(config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# ===== Eval Loss =====
@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(10)
        for k in range(10):
            X, Y = get_batch(split)
            logits, _ = model(X)
            logits = logits[:, -1, :]
            loss = F.cross_entropy(logits, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# ===== Train Loop =====
print("Training Human-Trained GPT-Flanker model...")
for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, _ = model(xb)
    logits = logits[:, -1, :]  # Predict only the final token
    loss = F.cross_entropy(logits, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# ===== Save Model =====
ckpt_path = os.path.join(out_dir, "flanker_gpt_human_noctx.pt") #*
torch.save(model.state_dict(), ckpt_path)
print(f"✅ Model saved to {ckpt_path}")


number of parameters: 0.79M
Training Human-Trained GPT-Flanker model...
Step 0: train loss 2.0304, val loss 2.0159
Step 200: train loss 0.2285, val loss 0.1568
Step 400: train loss 0.1241, val loss 0.1472
Step 600: train loss 0.1048, val loss 0.1171
Step 800: train loss 0.1609, val loss 0.1270
Step 1000: train loss 0.1733, val loss 0.1594
Step 1200: train loss 0.1313, val loss 0.1502
Step 1400: train loss 0.1269, val loss 0.1468
Step 1600: train loss 0.1320, val loss 0.1061
Step 1800: train loss 0.1073, val loss 0.2277
Step 1999: train loss 0.1471, val loss 0.0945
✅ Model saved to /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/checkpoints/flanker_gpt_human_noctx.pt


## Evaluation

In [8]:
# ===== Imports =====
import os, sys, csv, math
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

# Add model path
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Flanker-GPT')
from model import GPT, GPTConfig

# ===== Paths =====
base_dir   = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT"
data_dir   = f"{base_dir}/human_dataset_noctx"            #*
ckpt_path  = f"{base_dir}/checkpoints/flanker_gpt_human_noctx.pt"  #*
val_npy    = os.path.join(data_dir, "val.npy")
val_meta   = os.path.join(data_dir, "val_meta.csv")       #*
output_csv = f"{base_dir}/gpt_human_noctx_val_predictions_stopping.csv"  #*


# ===== Settings =====
device = 'cuda' if torch.cuda.is_available() else 'cpu'
TOKENS = ['R','U','L','D','0','=>']  # must match builder TOKENS
RESPONSE_TOKENS = ['R','U','L','D']  # restrict sampling to directions
stoi = {tok: i for i, tok in enumerate(TOKENS)}
itos = {i: tok for i, tok in enumerate(TOKENS)}
response_token_ids = [stoi[t] for t in RESPONSE_TOKENS]

# ----- Stopping-Rule Params -----
delta = 3          # stop when (top_count - second_count) >= delta
max_samples = 100  # safety cap
restrict_to_response_tokens = True

# ===== Load Model =====
# infer block_size from val.npy to support with/no-context automatically
val_data = np.load(val_npy)
block_size = val_data.shape[1] - 1
vocab_size = len(TOKENS)

config = GPTConfig(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=4,
    n_head=4,
    n_embd=128
)
model = GPT(config).to(device)
model.load_state_dict(torch.load(ckpt_path, map_location=device))
model.eval()
print("✅ Loaded Human-trained GPT model")
print(f"Loaded {len(val_data)} validation samples | block_size={block_size}")

# ===== Load aligned meta (for congruency & human RT) =====
assert os.path.exists(val_meta), f"val_meta.csv not found at {val_meta}"
meta = pd.read_csv(val_meta)
assert len(meta) == len(val_data), "val_meta.csv must align with val.npy length"

# ===== Utils =====
def calculate_entropy(prob_dist_np):
    return -float(np.sum(prob_dist_np * np.log(prob_dist_np + 1e-12)))

def sample_until_threshold(probs_full, delta=3, max_samples=100, restrict_ids=None):
    """
    probs_full: 1D torch tensor over full vocab (sum=1)
    restrict_ids: list of token ids to sample from (renormalize over this subset)
    Returns: winner_id, k_used, stopped_bool, gap_at_stop
    """
    if restrict_ids is not None:
        sub = probs_full[restrict_ids]
        sub = sub / sub.sum()
        id_map = restrict_ids
        def draw():
            idx_local = torch.multinomial(sub, num_samples=1, replacement=True).item()
            return id_map[idx_local]
    else:
        def draw():
            return torch.multinomial(probs_full, num_samples=1, replacement=True).item()

    counts = {}
    for s in range(1, max_samples + 1):
        tok = draw()
        counts[tok] = counts.get(tok, 0) + 1

        sorted_pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        top_count = sorted_pairs[0][1]
        second_count = sorted_pairs[1][1] if len(sorted_pairs) > 1 else 0
        gap = top_count - second_count
        if gap >= delta:
            winner_id = sorted_pairs[0][0]
            return winner_id, s, True, gap

    # fallback: argmax over restricted or full distribution
    if restrict_ids is not None:
        sub = probs_full[restrict_ids]
        winner_local = torch.argmax(sub).item()
        winner_id = restrict_ids[winner_local]
    else:
        winner_id = torch.argmax(probs_full).item()

    if len(counts) > 0:
        sorted_pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        top_count = sorted_pairs[0][1]
        second_count = sorted_pairs[1][1] if len(sorted_pairs) > 1 else 0
        gap = top_count - second_count
    else:
        gap = 0
    return winner_id, max_samples, False, gap

# ===== Evaluate =====
records = []
correct = 0

for i in range(len(val_data)):
    seq = torch.tensor(val_data[i][:-1], dtype=torch.long, device=device).unsqueeze(0)
    true_id = int(val_data[i][-1])
    true_tok = itos[true_id]

    with torch.no_grad():
        logits, _ = model(seq)
        logits = logits[:, -1, :]  # only final position
        probs = F.softmax(logits, dim=-1).squeeze(0).cpu()

        winner_id, k_used, stopped, gap_at_stop = sample_until_threshold(
            probs_full=probs,
            delta=delta,
            max_samples=max_samples,
            restrict_ids=response_token_ids if restrict_to_response_tokens else None
        )
        pred_tok = itos[winner_id]

        is_correct = (winner_id == true_id)
        if is_correct:
            correct += 1

        confidence = float(probs[winner_id].item())
        prob_true  = float(probs[true_id].item())
        entropy    = calculate_entropy(probs.numpy())

    # meta from aligned CSV
    is_congruent = int(meta.loc[i, "trial4_is_congruent"])
    trial4_layout = int(meta.loc[i, "trial4_layout"])
    human_rt = float(meta.loc[i, "trial4_response_time"]) if "trial4_response_time" in meta.columns else np.nan

    row = {
        "example": i + 1,
        "true_response": true_tok,
        "predicted_response": pred_tok,
        "is_correct": bool(is_correct),

        "confidence": round(confidence, 6),
        "prob_of_true_token": round(prob_true, 6),
        "entropy": round(entropy, 6),

        # stopping diagnostics
        "k_samples": int(k_used),
        "stopped_by_delta": bool(stopped),
        "gap_at_stop": int(gap_at_stop),
        "delta": int(delta),
        "max_samples": int(max_samples),

        # meta
        "is_congruent": is_congruent,
        "trial4_layout": trial4_layout,
        "human_rt_ms": human_rt,
    }

    # per-token probs (directions only)
    for tok in RESPONSE_TOKENS:
        row[f"{tok}_prob"] = round(float(probs[stoi[tok]].item()), 6)

    records.append(row)

# ===== Save Results =====
df = pd.DataFrame(records)
df.to_csv(output_csv, index=False)
acc = correct / len(val_data) * 100.0
print("\n✅ Stopping-rule evaluation (human) complete.")
print(f"Accuracy = {acc:.2f}% (Δ={delta}, max={max_samples})")
print(f"Results saved to: {output_csv}")


number of parameters: 0.79M
✅ Loaded Human-trained GPT model
Loaded 107063 validation samples | block_size=26

✅ Stopping-rule evaluation (human) complete.
Accuracy = 97.21% (Δ=3, max=100)
Results saved to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/gpt_human_noctx_val_predictions_stopping.csv


### Congruency Effect Analysis

In [9]:
# ===== Analyze Congruency Effect (No RT Version) =====
# This script evaluates accuracy, entropy, and k_samples by congruency.
# It assumes that your prediction CSV has: is_congruent, is_correct, entropy, confidence, k_samples

from google.colab import drive
drive.mount('/content/drive')

import os, numpy as np, pandas as pd

# ==== INPUT (update to the model you want to analyze) ====
base_dir   = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT"
pred_csv   = f"{base_dir}/gpt_human_noctx_val_predictions_stopping.csv" #*

out_dir    = os.path.join(base_dir, "congruency_effect")
os.makedirs(out_dir, exist_ok=True)

# ==== LOAD ====
df = pd.read_csv(pred_csv)
req = {"is_congruent","is_correct","entropy","confidence","k_samples"}
missing = list(req - set(df.columns))
assert not missing, f"Missing columns: {missing}"

# ==== BASIC SUMMARIES ====
acc_overall = df["is_correct"].mean()*100
acc_by_c = df.groupby("is_congruent")["is_correct"].mean().rename({0:"incongruent",1:"congruent"})*100

ent_by_c = df.groupby("is_congruent")["entropy"].mean().rename({0:"incongruent",1:"congruent"})
k_by_c   = df.groupby("is_congruent")["k_samples"].mean().rename({0:"incongruent",1:"congruent"})

print("=== Congruency Effect (Model, No RT) ===")
print(f"Accuracy overall: {acc_overall:.2f}%")
print(f"Accuracy by congruency (%):\n{acc_by_c.to_string()}")
print(f"\nEntropy mean by congruency:\n{ent_by_c.to_string()}")
print(f"\nk_samples mean by congruency:\n{k_by_c.to_string()}")

# ==== SAVE TABLE ====
rows = []
rows.append({"metric":"Accuracy_overall_%", "value": acc_overall})
rows.append({"metric":"Accuracy_congruent_%", "value": acc_by_c.get("congruent", np.nan)})
rows.append({"metric":"Accuracy_incongruent_%", "value": acc_by_c.get("incongruent", np.nan)})
rows.append({"metric":"Entropy_mean_congruent", "value": ent_by_c.get("congruent", np.nan)})
rows.append({"metric":"Entropy_mean_incongruent", "value": ent_by_c.get("incongruent", np.nan)})
rows.append({"metric":"k_mean_congruent", "value": k_by_c.get("congruent", np.nan)})
rows.append({"metric":"k_mean_incongruent", "value": k_by_c.get("incongruent", np.nan)})

out_csv = os.path.join(out_dir, os.path.basename(pred_csv).replace(".csv","_congruency_stats_NoRT.csv"))
pd.DataFrame(rows).to_csv(out_csv, index=False)

print("\nSaved:", out_csv)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
=== Congruency Effect (Model, No RT) ===
Accuracy overall: 97.21%
Accuracy by congruency (%):
is_congruent
incongruent    95.398383
congruent      99.015788

Entropy mean by congruency:
is_congruent
incongruent    0.218666
congruent      0.080407

k_samples mean by congruency:
is_congruent
incongruent    3.315318
congruent      3.070554

Saved: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/congruency_effect/gpt_human_noctx_val_predictions_stopping_congruency_stats_NoRT.csv


# HUMAN BASELINE

### Human RT Analysis

In [11]:
# ===== Human RT Analysis (distribution + stats, patched) =====

import os, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ===== Paths =====
base_dir = "/content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx"
train_meta_path = os.path.join(base_dir, "train_meta.csv")
val_meta_path   = os.path.join(base_dir, "val_meta.csv")
out_dir         = os.path.join(base_dir, "rt_analysis")
os.makedirs(out_dir, exist_ok=True)

# ===== Load =====
train_meta = pd.read_csv(train_meta_path)
val_meta   = pd.read_csv(val_meta_path)
df = pd.concat([train_meta, val_meta], ignore_index=True)

# ===== Check required columns =====
required_cols = ["trial4_is_congruent","trial4_response_time","trial4_correct"]
missing = [c for c in required_cols if c not in df.columns]
assert not missing, f"Missing columns in meta: {missing}"

# Force integer dtype (avoid weird mixed dtypes)
df["trial4_is_congruent"] = df["trial4_is_congruent"].astype(int)
df["trial4_correct"]      = df["trial4_correct"].astype(int)

# ===== Detect RT units & clean =====
rt_raw = df["trial4_response_time"].astype(float)

# Heuristic: if median < 10, assume seconds → convert to ms
if rt_raw.median(skipna=True) < 10:
    df["human_rt_ms"] = rt_raw * 1000.0
else:
    df["human_rt_ms"] = rt_raw

# Drop NaN values
df = df.dropna(subset=["human_rt_ms"]).copy()

# Trim outliers with a simple cutoff
RT_MIN, RT_MAX = 150, 3000  # ms
df["rt_keep"] = (df["human_rt_ms"] >= RT_MIN) & (df["human_rt_ms"] <= RT_MAX)
kept_ratio = df["rt_keep"].mean()
df = df[df["rt_keep"]].copy()

# ===== Helper functions =====
def rt_summary(x: pd.Series):
    x = x.dropna()
    q = x.quantile([0,.1,.25,.5,.75,.9,.95,.99])
    return {
        "n":   int(x.count()),
        "mean": float(x.mean()),
        "std":  float(x.std(ddof=1)),
        "min":  float(x.min()),
        "p10":  float(q.loc[0.10]),
        "p25":  float(q.loc[0.25]),
        "p50":  float(q.loc[0.50]),
        "p75":  float(q.loc[0.75]),
        "p90":  float(q.loc[0.90]),
        "p95":  float(q.loc[0.95]),
        "p99":  float(q.loc[0.99]),
        "max":  float(x.max()),
    }

def cohens_d(a, b):
    a = np.asarray(a, dtype=float); b = np.asarray(b, dtype=float)
    if len(a) < 2 or len(b) < 2: return np.nan
    m1, m2 = a.mean(), b.mean()
    s1, s2 = a.std(ddof=1), b.std(ddof=1)
    sp = np.sqrt(((len(a)-1)*s1*s1 + (len(b)-1)*s2*s2) / (len(a)+len(b)-2))
    return (m1 - m2)/sp if sp > 0 else np.nan

# ===== Overall summary =====
overall = rt_summary(df["human_rt_ms"])

# ===== By congruency =====
cong_map = {0: "incongruent", 1: "congruent"}
by_cong_df = df.groupby("trial4_is_congruent")["human_rt_ms"] \
               .apply(lambda s: pd.Series(rt_summary(s))) \
               .reset_index()
by_cong_named = {
    cong_map[int(row["trial4_is_congruent"])]: row.drop("trial4_is_congruent").to_dict()
    for _, row in by_cong_df.iterrows()
}

# ===== By correctness =====
corr_map = {0: "incorrect", 1: "correct"}
by_corr_df = df.groupby("trial4_correct")["human_rt_ms"] \
               .apply(lambda s: pd.Series(rt_summary(s))) \
               .reset_index()
by_correct_named = {
    corr_map[int(row["trial4_correct"])]: row.drop("trial4_correct").to_dict()
    for _, row in by_corr_df.iterrows()
}

# ===== Congruency gap & effect size =====
rt_inc = df.loc[df["trial4_is_congruent"]==0, "human_rt_ms"].values
rt_con = df.loc[df["trial4_is_congruent"]==1, "human_rt_ms"].values
gap_ms = float(np.mean(rt_inc) - np.mean(rt_con))
d_eff  = float(cohens_d(rt_inc, rt_con))

# ===== Save stats table =====
stats_rows = [{"group":"overall", **overall}]
for k, v in by_cong_named.items():
    stats_rows.append({"group": f"congruency:{k}", **v})
for k, v in by_correct_named.items():
    stats_rows.append({"group": f"correctness:{k}", **v})

stats_df = pd.DataFrame(stats_rows)
stats_csv = os.path.join(out_dir, "human_rt_stats.csv")
stats_df.to_csv(stats_csv, index=False)

# ===== Plots =====
plt.figure(figsize=(6,4))
plt.hist(df["human_rt_ms"], bins=60)
plt.xlabel("RT (ms)"); plt.ylabel("Count"); plt.title("Human RT Histogram (trimmed)")
plt.tight_layout(); plt.savefig(os.path.join(out_dir,"rt_hist_overall.png"), dpi=160); plt.close()

# hist by congruency (overlay)
plt.figure(figsize=(6,4))
plt.hist(rt_con, bins=60, alpha=0.6, label="congruent")
plt.hist(rt_inc, bins=60, alpha=0.6, label="incongruent")
plt.xlabel("RT (ms)"); plt.ylabel("Count"); plt.title("RT by Congruency (Histogram)")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(out_dir,"rt_hist_by_congruency.png"), dpi=160); plt.close()

# CDF by congruency
def ecdf(x):
    xs = np.sort(x); ys = np.arange(1, len(xs)+1)/len(xs); return xs, ys
x1,y1 = ecdf(rt_con); x2,y2 = ecdf(rt_inc)
plt.figure(figsize=(6,4))
plt.plot(x1,y1,label="congruent")
plt.plot(x2,y2,label="incongruent")
plt.xlabel("RT (ms)"); plt.ylabel("CDF"); plt.title("RT CDF by Congruency")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(out_dir,"rt_cdf_by_congruency.png"), dpi=160); plt.close()

# boxplot
plt.figure(figsize=(5,4))
plt.boxplot([rt_con, rt_inc], labels=["congruent","incongruent"])
plt.ylabel("RT (ms)"); plt.title("RT Boxplot by Congruency")
plt.tight_layout(); plt.savefig(os.path.join(out_dir,"rt_box_by_congruency.png"), dpi=160); plt.close()

# ===== Print quick summary =====
print("====== Human RT (trimmed) ======")
print(f"Kept ratio after trimming [{RT_MIN},{RT_MAX}] ms: {kept_ratio*100:.1f}%")
print("Overall:", overall)
print("By congruency:", by_cong_named)
print("By correctness:", by_correct_named)
print(f"Congruency gap (inc - con): {gap_ms:.2f} ms  |  Cohen's d: {d_eff:.3f}")
print(f"\nSaved stats to: {stats_csv}")
print(f"Plots saved in: {out_dir}")
print(by_cong_df)

  plt.boxplot([rt_con, rt_inc], labels=["congruent","incongruent"])


Kept ratio after trimming [150,3000] ms: 100.0%
Overall: {'n': 1070284, 'mean': 788.3175502950619, 'std': 204.9591513113322, 'min': 200.0, 'p10': 592.0, 'p25': 657.0, 'p50': 746.0, 'p75': 865.0, 'p90': 1027.0, 'p95': 1158.0, 'p99': 1527.0, 'max': 3000.0}
By congruency: {'incongruent': {'level_1': 'max', 'human_rt_ms': 3000.0}, 'congruent': {'level_1': 'max', 'human_rt_ms': 2992.0}}
By correctness: {'incorrect': {'level_1': 'max', 'human_rt_ms': 2984.0}, 'correct': {'level_1': 'max', 'human_rt_ms': 3000.0}}
Congruency gap (inc - con): 75.84 ms  |  Cohen's d: 0.377

Saved stats to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx/rt_analysis/human_rt_stats.csv
Plots saved in: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx/rt_analysis
    trial4_is_congruent level_1    human_rt_ms
0                     0       n  533637.000000
1                     0    mean     826.343261
2                     0     std     211.115736
3                     0 

### Congruency Effect Analysis

In [12]:
# ===== Accuracy statistics =====
acc_overall = df["trial4_correct"].mean()
acc_by_cong = df.groupby("trial4_is_congruent")["trial4_correct"].mean().rename({0:"incongruent",1:"congruent"})

# Accuracy vs RT deciles + by congruency (simple CAF)
df["rt_decile"] = pd.qcut(df["human_rt_ms"], 10, labels=False, duplicates="drop")
acc_by_rtdecile = df.groupby("rt_decile")["trial4_correct"].mean()
acc_by_rtdecile_cong = df.groupby(["trial4_is_congruent","rt_decile"])["trial4_correct"].mean().unstack(0).rename(columns={0:"incongruent",1:"congruent"})

# Save accuracy summary
acc_df = pd.DataFrame({
    "metric": ["overall_accuracy"],
    "value": [acc_overall]
})
acc_by_cong_df = acc_by_cong.reset_index().rename(columns={"trial4_is_congruent":"congruency","trial4_correct":"accuracy"})
acc_by_cong_df["congruency"] = acc_by_cong_df["congruency"].map({0:"incongruent",1:"congruent"})
acc_out = os.path.join(out_dir, "human_accuracy_stats.csv")
pd.concat([acc_df, acc_by_cong_df], ignore_index=True).to_csv(acc_out, index=False)

# Plot: accuracy vs RT deciles
plt.figure(figsize=(6,4))
plt.plot(acc_by_rtdecile.index, acc_by_rtdecile.values, marker="o")
plt.xlabel("RT decile (slow → fast)"); plt.ylabel("Accuracy")
plt.title("Accuracy vs RT deciles")
plt.tight_layout(); plt.savefig(os.path.join(out_dir,"acc_vs_rt_deciles.png"), dpi=160); plt.close()

# Plot: Conditional Accuracy Function (CAF) by congruency
plt.figure(figsize=(6,4))
for col in ["congruent","incongruent"]:
    if col in acc_by_rtdecile_cong.columns:
        plt.plot(acc_by_rtdecile_cong.index, acc_by_rtdecile_cong[col].values, marker="o", label=col)
plt.xlabel("RT decile"); plt.ylabel("Accuracy")
plt.title("Conditional Accuracy Function (CAF)")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(out_dir,"caf_by_congruency.png"), dpi=160); plt.close()

print("\n====== Accuracy ======")
print(f"Overall accuracy: {acc_overall*100:.2f}%")
print("By congruency (%):")
print((acc_by_cong*100).rename_axis("congruency").to_string())
print(f"\nSaved accuracy stats to: {acc_out}")


Overall accuracy: 97.15%
By congruency (%):
congruency
incongruent    95.451402
congruent      98.829584

Saved accuracy stats to: /content/drive/MyDrive/Colab Notebooks/Flanker-GPT/human_dataset_noctx/rt_analysis/human_accuracy_stats.csv
