# 03 — Evaluation

This notebook:
1) runs evaluation via `scripts/eval_all.py`
2) loads/inspects the evaluation report artifacts
3) provides basic result summaries

Note: full metrics will appear once `rai_rag.eval.run_eval.run_all` is implemented.

In [None]:
import subprocess, sys
from pathlib import Path
import json

# Directory that scripts/eval_all.py is asked to write into via --out_dir.
OUT_DIR = Path("runs/eval_001")

# Launch the script with the kernel's own interpreter (sys.executable).
# NOTE(review): the relative paths below assume the kernel's working directory
# is the repository root — confirm before running from another folder.
cmd = [
    sys.executable, "scripts/eval_all.py",
    "--config", "configs/base.yaml",
    "--bench_dir", "data/benchmarks",
    "--out_dir", str(OUT_DIR),
]
print("Running:", " ".join(cmd))

# Stream the child's combined stdout/stderr through print() so the script's
# logs (and any traceback) land in this cell's output. A plain subprocess.run
# lets the child write straight to the kernel's file descriptors, which some
# Jupyter setups do not route back to the notebook.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    errors="replace",  # never fail the cell on undecodable bytes in the log
) as proc:
    for line in proc.stdout:
        print(line, end="")

# Leaving the `with` block waits for the process, so returncode is set here.
# Keep the failure contract of subprocess.run(..., check=True).
if proc.returncode != 0:
    raise subprocess.CalledProcessError(proc.returncode, cmd)

# Sorted so the listing is stable across runs and filesystems.
print("Files:", sorted(p.name for p in OUT_DIR.glob("*")))

In [None]:
# Load the evaluation report from OUT_DIR, if the run produced one.
report_path = OUT_DIR / "report.json"

# Reset first so a missing file never leaves a stale `report` from an earlier
# execution of this cell in the kernel namespace.
report = None
if report_path.exists():
    report = json.loads(report_path.read_text(encoding="utf-8"))
else:
    print("No report.json found. Your internal evaluator may be writing a different output.")

# Jupyter only auto-displays the last *top-level* expression of a cell; a bare
# name nested inside the `if` body is silently discarded. A None value renders
# nothing, so the missing-file case shows just the message above.
report

In [None]:
import pandas as pd

# Example: expect future outputs like OUT_DIR / "results.jsonl"
# Each non-blank line of the file is parsed as one JSON value and becomes one
# row of `df`.
results_path = OUT_DIR / "results.jsonl"
if results_path.exists():
    with results_path.open("r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        rows = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(rows)
else:
    print("No results.jsonl found (expected once run_all is implemented).")
    # Always bind `df` so later cells never pick up a stale frame left in the
    # kernel by an earlier run; an empty frame has len(df) == 0.
    df = pd.DataFrame()

# Jupyter only auto-displays the last *top-level* expression of a cell, so the
# preview has to live outside the if/else to be rendered.
df.head()

In [None]:
# Placeholder summary of the evaluation results frame.
# Once evaluation is implemented, rows might carry fields such as:
# - safe: bool
# - helpful: float
# - action: Retrieve/Restrict/No-Retrieve
# - evidence_kept: int
# Only the columns that are actually present get summarised.

# `df` may not exist at all if the results-loading cell found no file, so
# check the namespace before touching it (short-circuit keeps len() safe).
_have_results = "df" in globals() and len(df) > 0

if not _have_results:
    print("No evaluation dataframe available yet.")
else:
    cols = df.columns.tolist()
    print("Columns:", cols)

    # (column name, how to print its summary), in output order.
    _summaries = (
        ("action", lambda s: print(s.value_counts())),
        ("safe", lambda s: print("Safety rate:", s.mean())),
        ("evidence_kept", lambda s: print(s.describe())),
    )
    for _name, _show in _summaries:
        if _name in cols:
            _show(df[_name])