# QA Toggle + Metrics Skeleton

Use this notebook to validate config-driven evaluation toggles. It accepts a config file (JSON/TOML), enumerates checkpoints, and writes placeholder metric rows (logit KL, hidden-state cosine, residual RMS, and surprisal — whichever are enabled in the config) to the configured JSONL output. Real metrics can slot into the stub helpers later without changing the surrounding plumbing.


In [None]:
from __future__ import annotations

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict

import numpy as np

from config import EvalConfig, load_eval_config

# Location of the evaluation config: the OFFLINE_TOUR_GUIDE_EVAL_CONFIG
# environment variable wins, otherwise the bundled template path is used.
_config_location = os.environ.get(
    "OFFLINE_TOUR_GUIDE_EVAL_CONFIG",
    "config/eval_template.json",
)
CONFIG_PATH = Path(_config_location)

# When the file is absent, None is handed to the loader instead of a path
# (the banner below reports that case as 'defaults').
_config_arg = CONFIG_PATH if CONFIG_PATH.exists() else None
config: EvalConfig = load_eval_config(_config_arg)
config.ensure_output_dirs()

# Checkpoints listed in the config take precedence; an empty/missing list
# falls back to four conventional checkpoint files under the run directory.
CHECKPOINTS = config.checkpoints
if not CHECKPOINTS:
    CHECKPOINTS = [
        f"runs/{config.run_id}/checkpoints/base.pt",
        f"runs/{config.run_id}/checkpoints/permutations_only.pt",
        f"runs/{config.run_id}/checkpoints/permutations_rotations.pt",
        f"runs/{config.run_id}/checkpoints/permutations_rotations_style.pt",
    ]

# Summary banner so the cell output records what this run will use.
_config_label = CONFIG_PATH.resolve() if CONFIG_PATH.exists() else "defaults"
print(f"Using config at: {_config_label}")
# NOTE(review): this banner reads `config.metrics_output_path`, while the
# final cell writes to `config.metrics_path` — confirm both name the same file.
print(f"Writing metrics to: {config.metrics_output_path}")
print(f"Checkpoints ({len(CHECKPOINTS)}):")
for path in CHECKPOINTS:
    print(f" • {path}")


In [None]:
def _stub_metrics(metric_flags: Dict[str, bool], idx: int) -> Dict[str, float | None]:
    rng = np.random.default_rng(seed=idx)
    stub: Dict[str, float | None] = {}
    if metric_flags.get("logit_kl", False):
        stub["logit_kl"] = round(float(rng.uniform(0.05, 0.4)), 4)
    if metric_flags.get("hidden_state_cosine", False):
        stub["hidden_state_cosine"] = round(float(rng.uniform(0.6, 0.95)), 4)
    if metric_flags.get("residual_rms", False):
        stub["residual_rms"] = round(float(rng.uniform(0.8, 1.1)), 4)
    if metric_flags.get("surprisal", False):
        stub["surprisal"] = round(float(rng.uniform(15.0, 35.0)), 2)
    return stub


def evaluate_checkpoint_stub(checkpoint_path: str, config: EvalConfig, idx: int) -> Dict[str, Any]:
    """Build one placeholder metrics row for a checkpoint.

    Args:
        checkpoint_path: Checkpoint identifier, stored verbatim in the row.
        config: Evaluation config; supplies the run id, the metric and
            variant flag dicts (via their ``to_dict()``), and optional notes.
        idx: Index forwarded to ``_stub_metrics`` as the random seed.

    Returns:
        A JSON-serializable dict with the run id, checkpoint, UTC creation
        timestamp (ISO 8601 with a trailing ``Z``), the flag dicts, the stub
        metric values, and a notes string.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    metric_flags = config.metrics.to_dict()
    variant_flags = config.variant.to_dict()
    # `datetime.utcnow()` is deprecated since Python 3.12 and returns a naive
    # value. Take an aware UTC time instead and render its "+00:00" offset as
    # "Z" so the emitted string keeps the exact same format as before.
    created_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    payload: Dict[str, Any] = {
        "run_id": config.run_id,
        "checkpoint": checkpoint_path,
        "created_at": created_at,
        "metric_flags": metric_flags,
        "variant_flags": variant_flags,
        "metrics": _stub_metrics(metric_flags, idx),
        # Fall back to a default note when the config carries none.
        "notes": config.notes or "stub metrics pending transport eval",
    }
    return payload


def write_metrics(rows: list[Dict[str, Any]], destination: Path) -> None:
    """Serialize ``rows`` as JSON Lines into ``destination``.

    Missing parent directories are created first. The file is opened in
    write mode, so any existing content is replaced. Each row becomes one
    line of JSON, with non-ASCII characters written as-is (UTF-8).

    Args:
        rows: JSON-serializable dicts, written in order.
        destination: Path of the output file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as sink:
        # The generator keeps serialization lazy: rows are encoded and written
        # one at a time rather than building the whole document in memory.
        sink.writelines(
            f"{json.dumps(record, ensure_ascii=False)}\n" for record in rows
        )


In [None]:
# Build one stub row per checkpoint; the checkpoint's position in the list is
# passed through as the index argument.
rows = [
    evaluate_checkpoint_stub(checkpoint, config, position)
    for position, checkpoint in enumerate(CHECKPOINTS)
]
write_metrics(rows, config.metrics_path)
print(f"Wrote {len(rows)} metric rows → {config.metrics_path}")
# Trailing bare expression so the notebook displays the last row.
rows[-1]
