In [1]:
# --- 0) bootstrap monorepo root & sys.path ---
import os
import sys
from pathlib import Path


def find_project_root(start: Path, marker: str = "pyproject.toml") -> "Path | None":
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory where the upward search begins (checked first).
        marker: Name of the file that identifies the project root.

    Returns:
        The first matching directory, or ``None`` when neither ``start`` nor
        any of its ancestors contains ``marker``.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


_start_dir = Path.cwd()
ROOT = find_project_root(_start_dir)
if ROOT is None:
    # A failed search used to end at the filesystem root, after which the
    # kernel was chdir'ed there and that directory was put on sys.path.
    # Stay in the starting directory instead and make the problem visible.
    print(
        f"WARNING: no pyproject.toml found at or above {_start_dir}; "
        "keeping the current directory as project root.",
        file=sys.stderr,
    )
    ROOT = _start_dir

# Later cells use paths relative to the project root ("data/...", "configs/...").
os.chdir(ROOT)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [2]:
# --- 1) imports & small helpers ---
# pyyaml is needed by the prompt pack registry; install if missing
try:
    import yaml  # type: ignore
except ImportError:
    # Only a missing module can be repaired by installing it; any other error
    # raised while importing yaml is left to propagate unchanged.
    import subprocess, sys as _sys
    # Invoke pip through the running interpreter so the package is installed
    # into the same environment this kernel imports from.
    subprocess.check_call([_sys.executable, "-m", "pip", "install", "pyyaml"])
    import yaml  # type: ignore

import json
from pathlib import Path

from packages.core_config.config import load_yaml
from packages.sft.generate_pairs import SFTGenConfig, generate_pairs

def peek_jsonl(path: Path, n: int = 3):
    """Print a header for ``path`` followed by its first ``n`` lines.

    Every printed line has trailing whitespace removed and is cut to at most
    300 characters. A missing file is reported as "  (not found)" rather than
    raising.
    """
    print(f"\n# peek -> {path}")
    try:
        handle = path.open("r", encoding="utf-8")
    except FileNotFoundError:
        print("  (not found)")
        return
    with handle:
        for lineno, raw in enumerate(handle, start=1):
            if lineno > n:
                break
            print(raw.rstrip()[:300])

In [3]:
# --- 2) choose document (same one you used in Phase 1) ---
from pathlib import Path

preferred_pdf = Path("data/raw/NFS_2019.pdf")
if preferred_pdf.exists():
    pdf_path = preferred_pdf
else:
    # fallback: first PDF (in sorted order) found in data/raw
    cand = sorted(Path("data/raw").glob("*.pdf"))
    assert cand, "No PDFs in data/raw/"
    pdf_path = cand[0]

# The file stem is used as the document id and names the artifacts folder.
doc_id = pdf_path.stem
artifacts_root = Path(f"data/artifacts/{doc_id}")

print("doc_id:", doc_id)
print("artifacts_root exists:", artifacts_root.exists())

doc_id: NFS_2019
artifacts_root exists: True


In [4]:
# --- 3) load pipeline & provider YAMLs ---
cfg = load_yaml("configs/providers.yaml", "configs/pipelines/generic_legal.yaml")

# where the prompt pack lives (with profiles + detection rules)
prompts_yaml_setting = cfg.get(
    "sft.generation.prompts_yaml", "configs/pipelines/prompts/default.yaml"
)
prompts_yaml_path = Path(prompts_yaml_setting).resolve()
print("prompts_yaml:", prompts_yaml_path)
assert prompts_yaml_path.exists(), "Prompt pack YAML missing."

# pull 'detection' block from the prompt pack for auto profile selection;
# an empty file or a missing/empty block both end up as an empty dict
with prompts_yaml_path.open("r", encoding="utf-8") as pack_file:
    prompts_yaml = yaml.safe_load(pack_file) or {}
detection_rules = prompts_yaml.get("detection") or {}

prompts_yaml: D:\IIT BBS\Job Resources\Business Optima\new-pdf-agent\configs\pipelines\prompts\default.yaml


In [5]:
# --- 4) build SFTGenConfig from YAML (LLM generation is ON unless the config disables it) ---
# Every setting is read from the merged config under "sft.generation.*"; the
# second argument to cfg.get is the fallback used here for that key.
gen_cfg = SFTGenConfig(
    max_qa=int(cfg.get("sft.generation.max_qa", 40)),
    max_summary=int(cfg.get("sft.generation.max_summary", 8)),
    seed=int(cfg.get("sft.generation.seed", 13)),
    min_chunk_chars=int(cfg.get("sft.generation.min_chunk_chars", 300)),
    max_chunk_chars=int(cfg.get("sft.generation.max_chunk_chars", 1600)),
    dedup_regex=r"\s+",

    # NOTE: the fallback for use_llm is True, so the LLM path is used unless the
    # YAML explicitly sets sft.generation.use_llm to a falsy value. Set it to
    # false in the config for a quick run that does not need Ollama/company API.
    use_llm=bool(cfg.get("sft.generation.use_llm", True)),
    llm_provider=str(cfg.get("sft.generation.llm_provider", "ollama")),
    llm_model=str(cfg.get("sft.generation.llm_model", "llama3.2:latest")),
    llm_url=str(cfg.get("sft.generation.llm_url", "http://localhost:11434")),
    llm_temperature=float(cfg.get("sft.generation.llm_temperature", 0.2)),
    llm_max_new_tokens=int(cfg.get("sft.generation.llm_max_new_tokens", 256)),

    summary_source=str(cfg.get("sft.generation.summary_source", "nodes")),
    summary_target_len=int(cfg.get("sft.generation.summary_target_len", 1200)),

    # prompt pack path and the 'detection' block loaded from it in the previous cell
    prompts_yaml=prompts_yaml_path,
    profile_rules=detection_rules,

    datasets_root=Path(cfg.get("sft.generation.datasets_root", "data/datasets")),
)

# bare expression so the notebook displays the resolved config
gen_cfg

SFTGenConfig(max_qa=40, max_summary=8, seed=13, min_chunk_chars=300, max_chunk_chars=1600, dedup_regex='\\s+', use_llm=True, llm_provider='ollama', llm_model='llama3.2:latest', llm_url='http://localhost:11434', llm_temperature=0.4, llm_max_new_tokens=256, llm_connect_timeout=30, llm_timeout_sec=600, llm_retries=1, llm_healthcheck=True, llm_warmup=True, summary_source='nodes', summary_target_len=1200, prompts_yaml=WindowsPath('D:/IIT BBS/Job Resources/Business Optima/new-pdf-agent/configs/pipelines/prompts/default.yaml'), profile_rules_from_prompts_yaml=True, profile_rules={'default': 'generic', 'profiles': {'fee_schedule': {'include_any': ['CPT', 'Relative Value', 'Modifier', 'Medical Fee Schedule']}, 'tax_code': {'include_any': ['Section', 'Schedule', 'sub-section', 'Rule', 'Act']}}}, datasets_root=WindowsPath('data/datasets'))

In [6]:
# --- 5) run pair generation ---
outputs = generate_pairs(doc_id=doc_id, artifacts_root=artifacts_root, cfg=gen_cfg)

print("\n=== SFT outputs ===")
for name, out_path in outputs.items():
    print(f"{name:10s} -> {out_path}")

# quick peeks: (output key, number of lines to show)
for name, limit in (("qa", 5), ("summaries", 3), ("combined", 5)):
    peek_jsonl(outputs[name], limit)


=== SFT outputs ===
qa         -> data\datasets\NFS_2019\sft\qa.jsonl
summaries  -> data\datasets\NFS_2019\sft\summaries.jsonl
combined   -> data\datasets\NFS_2019\sft\combined.jsonl

# peek -> data\datasets\NFS_2019\sft\qa.jsonl
{"id": "NFS_2019-qa-0001-1", "doc_id": "NFS_2019", "source_chunk_id": "NFS_2019-1618", "page": null, "question": "What types of procedures fall under 'diagnostic procedures'?", "answer": "Endoscopy, arthroscopy, injection procedures, and biopsies", "source_text": "Follow-up Care for Diagnostic Proce
{"id": "NFS_2019-qa-0001-2", "doc_id": "NFS_2019", "source_chunk_id": "NFS_2019-1618", "page": null, "question": "What is included in 'follow-up care for diagnostic procedures'?", "answer": "Care directly related to recovery from the diagnostic procedure itself", "source_text": "Follow-up Care for D
{"id": "NFS_2019-qa-0001-3", "doc_id": "NFS_2019", "source_chunk_id": "NFS_2019-1618", "page": null, "question": "Are nonsurgical treatments of conditions identified b

In [7]:
# --- 6) (optional) tiny sanity checks: schema & counts ---
from collections import Counter

def load_jsonl(path: Path):
    """Read a JSON Lines file into a list of parsed rows.

    Blank lines are ignored. Lines that are not valid JSON are still skipped,
    so one bad record does not abort the sanity checks, but they are counted
    and reported on stdout instead of disappearing silently (which would make
    the printed row counts misleading).

    Args:
        path: JSONL file to read, decoded as UTF-8.

    Returns:
        List of decoded JSON values in file order.
    """
    rows = []
    bad_lines = []
    with path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                bad_lines.append(lineno)
    if bad_lines:
        # Cap the listing so a badly corrupted file does not flood the output.
        shown = ", ".join(map(str, bad_lines[:10]))
        more = " ..." if len(bad_lines) > 10 else ""
        print(
            f"WARNING: {path}: skipped {len(bad_lines)} malformed JSON line(s) "
            f"at line(s) {shown}{more}"
        )
    return rows

qa = load_jsonl(outputs["qa"])
sm = load_jsonl(outputs["summaries"])
cb = load_jsonl(outputs["combined"])

print("counts  -> qa:", len(qa), "| summaries:", len(sm), "| combined:", len(cb))

# basic schema checks: every row of each file must contain the listed keys
assert all(
    all(key in row for key in ("question", "answer")) for row in qa
), "QA schema mismatch"
assert all(
    all(key in row for key in ("summary", "source_text")) for row in sm
), "Summary schema mismatch"
assert all(
    all(key in row for key in ("kind", "instruction", "output")) for row in cb
), "Combined schema mismatch"

print("schema   -> OK")

counts  -> qa: 70 | summaries: 8 | combined: 78
schema   -> OK
