# Diversify MATH (Notebook)

This notebook imports the production agents from `debunk_sft.utils.dataset.math.diversify_math`, so any experiment you run here matches the CLI pipeline. Configure the knobs in `CONFIG` below, run the helper cells, and then call `preview_samples()` to iterate over a handful of samples and inspect their diversification, solution, and verification traces inline.


In [None]:
import dataclasses
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

from IPython.display import Markdown, display

# Find the repository root: the nearest directory, starting at the notebook's
# working directory and moving upward, that contains the `debunk_sft` package.
NOTEBOOK_DIR = Path.cwd().resolve()
REPO_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / "debunk_sft").exists()
    ),
    # Nothing found: end up at the filesystem root, exactly where an upward walk stops.
    Path(NOTEBOOK_DIR.anchor),
)

# Only touch sys.path when the package was really found and the root is not listed yet.
_package_found = (REPO_ROOT / "debunk_sft").exists()
if _package_found and str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from debunk_sft.utils.dataset.math import diversify_math as dm

# Notebook-level knobs. Edit the values and re-run this cell to apply them;
# the helper cells below read CONFIG at call time.
CONFIG: Dict[str, Any] = dict(
    # Model names handed to the diversifier, solver, and verifier agents.
    model_diversifier="gpt-4.1",
    model_solver="o3",
    model_verifier="o3-mini",
    # Dataset slice to load.
    subset="algebra",
    split="train",
    sample_limit=5,  # how many rows to preview inline
    seed=42,
    # Attempt budgets forwarded to the verifier and the solver retry helper.
    verifier_passes=3,
    solver_attempts=2,
    # Keep records whose verification did not pass.
    include_failed=True,
    output_path="notebook_diversified_math.jsonl",
)

# Absolute location of the JSONL preview file; make sure its directory exists.
OUTPUT_PATH = Path(CONFIG["output_path"]).expanduser().resolve()
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

print(
    f"Will write preview runs to {OUTPUT_PATH} (overwrite: yes).\n"
    "Update CONFIG above to change models, subset, or attempt budgets."
)


In [None]:
def _make_agents() -> Dict[str, Any]:
    """Build the three production agents from the current ``CONFIG`` values.

    Returns:
        A dict with the keys ``"diversifier"``, ``"solver"`` and ``"verifier"``
        so callers can create the agents once and reuse them for every sample.
    """
    return {
        "diversifier": dm.DiversifierAgent(CONFIG["model_diversifier"]),
        "solver": dm.SolverAgent(CONFIG["model_solver"]),
        "verifier": dm.VerifierAgent(
            CONFIG["model_verifier"],
            passes=CONFIG["verifier_passes"],
        ),
    }


def _iter_preview_samples(limit: int):
    """Yield rows from the configured dataset subset/split.

    The dataset is loaded with ``CONFIG["subset"]`` and ``CONFIG["split"]``;
    ``limit`` is forwarded unchanged to ``dm._iter_samples``. Because this is
    a generator, nothing is loaded until the first row is requested.
    """
    subset, split = CONFIG["subset"], CONFIG["split"]
    rows = dm._load_hendrycks_math(subset, split)
    yield from dm._iter_samples(rows, limit=limit)


In [None]:
def _format_record_md(record: Dict[str, Any], idx: int) -> str:
    verdict = record["verification"]["verdict"]
    status = "✅ PASS" if verdict == "pass" else "❌ FAIL"
    solver_attempts = len(record.get("solver_attempt_history", []))
    md = [
        f"### Sample {idx + 1}: {status}",
        f"- **Subset / Level / Type**: {record['subset']} / {record['level']} / {record['type']}",
        f"- **Solver attempts recorded**: {solver_attempts}",
        f"- **Verifier reason**: {record['verification']['reason']}",
        "\n**Diversified Problem**\n",
        f"> {record['diversified_problem']}",
    ]
    if record.get("diversified_solution"):
        md.extend([
            "\n**Solver Solution (truncated)**\n",
            f"````\n{record['diversified_solution'][:1000]}\n````",
        ])
    if record.get("final_answer"):
        md.extend([
            "\n**Final Answer (extracted)**\n",
            f"`{record['final_answer']}`",
        ])
    if record.get("solver_attempt_history"):
        md.append("\n**Attempt History (verdicts only)**")
        for attempt in record["solver_attempt_history"]:
            md.append(
                f"- Attempt {attempt['attempt_index'] + 1}: "
                f"{attempt['verification']['verdict']} – {attempt['verification']['reason']}"
            )
    return "\n".join(md)


def preview_samples(limit: int | None = None, include_failed: bool | None = None) -> List[Dict[str, Any]]:
    """Run diversify -> solve -> verify on a few samples and show the results inline.

    Each kept sample is written as one JSON line to ``OUTPUT_PATH`` (the file
    is opened in ``"w"`` mode, so earlier contents are replaced), appended to
    the returned list, and rendered as Markdown in the notebook.

    Args:
        limit: Sample budget passed to ``_iter_preview_samples``. A falsy
            value (``None`` or ``0``) falls back to ``CONFIG["sample_limit"]``.
        include_failed: Whether to keep samples that did not pass. ``None``
            means "use ``CONFIG["include_failed"]``".

    Returns:
        The list of record dicts that were written and displayed.
    """
    limit = limit or CONFIG["sample_limit"]
    if include_failed is None:
        include_failed = CONFIG["include_failed"]

    def _field(source: Any, name: str) -> Any:
        # A missing (falsy) diversification yields None for every field.
        return getattr(source, name, None) if source else None

    agents = _make_agents()
    records: List[Dict[str, Any]] = []

    with open(OUTPUT_PATH, "w", encoding="utf-8") as sink:
        for idx, row in enumerate(_iter_preview_samples(limit)):
            original_problem = str(row.get("problem", "")).strip()
            original_solution = str(row.get("solution", "")).strip()
            level = row.get("level")
            problem_type = row.get("type")

            # Rows without both a problem and a reference solution are skipped.
            if not (original_problem and original_solution):
                continue

            # Defaults that survive into the record when a stage raises.
            raw_diversifier_response = None
            raw_solver_response = None
            raw_verifier_response = None
            solver_attempt_history: List[Dict[str, Any]] = []

            try:
                raw_diversifier_response, diversification = agents["diversifier"].diversify(
                    problem=original_problem,
                    solution=original_solution,
                )
                outcome = dm._attempt_solver_with_retries(
                    solver=agents["solver"],
                    verifier=agents["verifier"],
                    original_problem=original_problem,
                    original_solution=original_solution,
                    base_diversification=diversification,
                    solver_attempts=CONFIG["solver_attempts"],
                )
                (
                    diversification,
                    verification,
                    raw_solver_response,
                    raw_verifier_response,
                    solver_attempt_history,
                    passed,
                ) = outcome
            except Exception as exc:  # pragma: no cover - exploratory notebook
                # Any failure becomes a synthetic "fail" verification so the
                # sample still shows up in the preview (when failures are kept).
                diversification = None
                verification = dm.Verification(
                    verdict="fail",
                    reason=f"Notebook pipeline error: {type(exc).__name__}: {exc}",
                    consistency_checks={},
                )
                passed = False

            if solver_attempt_history:
                final_answer = solver_attempt_history[-1].get("final_answer")
            else:
                final_answer = None

            # Key order is the column order of the JSONL output; keep it stable.
            record = {
                "subset": CONFIG["subset"],
                "split": CONFIG["split"],
                "level": level,
                "type": problem_type,
                "original_problem": original_problem,
                "original_solution": original_solution,
                "diversified_problem": _field(diversification, "diversified_problem"),
                "diversified_solution": _field(diversification, "diversified_solution"),
                "final_answer": final_answer,
                "transformation_type": _field(diversification, "transformation_type"),
                "change_description": _field(diversification, "change_description"),
                "changed_quantity_before": _field(diversification, "changed_quantity_before"),
                "changed_quantity_after": _field(diversification, "changed_quantity_after"),
                "notes": _field(diversification, "notes"),
                "diversifier_raw_response": raw_diversifier_response,
                "solver_raw_response": raw_solver_response,
                "verifier_raw_response": raw_verifier_response,
                "verification": dataclasses.asdict(verification),
                "solver_attempt_history": solver_attempt_history,
            }

            if not (passed or include_failed):
                continue

            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            records.append(record)
            display(Markdown(_format_record_md(record, idx)))

    print(f"Preview complete — wrote {len(records)} rows to {OUTPUT_PATH}")
    return records


In [None]:
# Run the preview with the configured sample limit and keep the records
# around for further inspection in later cells.
results = preview_samples(limit=CONFIG["sample_limit"])
# Trailing expression of the cell: the notebook shows the first four records
# as the cell's output.
results[:4]
