# Evaluation Ledger Analysis Dashboard

This notebook aligns with the production evaluation pipeline by reusing
the shared helpers that power `evals/scripts/evaluation/core/production_evaluation.py`.
It is designed to be executed locally with the same configuration gates
used by the CLI workflow.

## Usage
1. Resolve the repository root and import the shared helpers.
2. Inspect the configured passes to confirm the environment deltas.
3. Perform a dry run to verify the wiring, or execute the passes to collect real metrics.
4. Open the generated JSON artifacts under `metrics/production_evaluations`.

In [None]:
from __future__ import annotations

from collections.abc import Iterable
from pathlib import Path

from evals.scripts.evaluation.core.production_eval_helpers import (
    DEFAULT_PRODUCTION_PASSES,
    ProductionEvaluationSummary,
    run_production_evaluation,
)


def resolve_project_root(candidates: Iterable[Path] | None = None) -> Path:
    """Best-effort resolution of the repository root for notebook execution.

    Args:
        candidates: Directories to probe, in priority order. When omitted or
            empty, the resolved current working directory and every one of
            its ancestors are probed, nearest first.

    Returns:
        The first probed directory that contains a ``pyproject.toml`` entry.

    Raises:
        RuntimeError: If no probed directory contains ``pyproject.toml``.
    """

    # Coerce to Path so callers may also hand in plain string paths.
    probe = [Path(candidate) for candidate in (candidates or ())]
    if not probe:
        cwd = Path.cwd().resolve()
        # Walk the whole ancestor chain instead of stopping at the
        # grandparent, so notebooks nested deeper in the tree still resolve.
        probe = [cwd, *cwd.parents]

    for candidate in probe:
        marker = candidate / "pyproject.toml"
        if marker.exists():
            return candidate
    raise RuntimeError("Unable to locate project root; add it to candidates.")


# Locate the repository root once; the evaluation cells reuse it.
PROJECT_ROOT = resolve_project_root()
# Directory handed to the evaluation helper as its results location.
RESULTS_DIR = PROJECT_ROOT.joinpath("metrics/production_evaluations")
# Bare expression so the notebook displays the resolved root.
PROJECT_ROOT


In [None]:
# Print each configured pass: name, description, environment overrides and
# (when present) extra arguments, followed by a blank separator line.
passes = DEFAULT_PRODUCTION_PASSES
for idx, cfg in enumerate(passes, start=1):
    report_lines = [
        f"Pass {idx}: {cfg.name}",
        f"  Description: {cfg.description}",
    ]
    report_lines.extend(f"    {key}={value}" for key, value in cfg.env.items())
    if cfg.extra_args:
        report_lines.append(f"    extra_args={cfg.extra_args}")
    # The doubled newline terminates the last line and adds the blank separator.
    print("\n".join(report_lines), end="\n\n")


In [None]:
# Dry run: confirms that the passes, project root and results directory all
# resolve. Flip execute to True to run the full evaluation.
summary: ProductionEvaluationSummary = run_production_evaluation(
    passes,
    execute=False,
    capture_output=False,
    project_root=PROJECT_ROOT,
    results_dir=RESULTS_DIR,
)
# Bare expression so the notebook displays the overall status.
summary.overall_status


When you are ready to execute the real evaluation, rerun the previous cell
with `execute=True`. The helper will write per-pass JSON payloads and an
aggregate analysis file (`analysis_<timestamp>.json`) under
`metrics/production_evaluations`.