In [1]:
# 0) bootstrap
import os, sys
from pathlib import Path
# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a pyproject.toml.
_start_dir = Path.cwd()
ROOT = next(
    (d for d in (_start_dir, *_start_dir.parents) if (d / "pyproject.toml").exists()),
    None,
)
if ROOT is None:
    # Fail fast instead of silently settling on the filesystem root, which
    # would only surface later as confusing import / missing-file errors.
    raise FileNotFoundError(
        f"No pyproject.toml found in {_start_dir} or any of its parent directories; "
        "start the notebook from inside the project."
    )

# Relative paths used by later cells (configs/..., data/...) are resolved
# against the working directory, so switch to the project root.
os.chdir(ROOT)

# Make the top-level `packages` package importable; the membership check keeps
# re-running this cell from adding duplicate sys.path entries.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [2]:
# 1) load cfg + ids
from packages.core_config.config import load_yaml
# Both YAML paths are relative; presumably load_yaml resolves them against the
# working directory, which the bootstrap cell set to the project root.
cfg = load_yaml(
    "configs/providers.yaml",
    "configs/pipelines/generic_legal.yaml",
)

# Document to work on; its artifacts live under data/artifacts/<doc_id>.
doc_id = "NFS_2019"
artifacts_root = Path("data/artifacts").joinpath(doc_id)

In [3]:
# 2) build config object
from packages.retriever.reranker import (
    RerankerConfig, RerankerMiningConfig, RerankerTrainConfig,
    train_reranker, eval_mrr_at_10
)

def _setting(section, key, default, cast):
    """Read one option from a config mapping and coerce it with ``cast``.

    Args:
        section: Mapping holding the options (e.g. the ``reranker`` section).
        key: Option name to look up.
        default: Value used when ``key`` is absent or explicitly null.
        cast: Callable applied to the chosen value (``str``, ``int``, ``float``).

    Returns:
        ``cast(section[key])``, or ``cast(default)`` when the option is missing
        or ``None``.

    A plain ``section.get(key, default)`` only falls back when the key is
    absent; a null entry would instead become the string "None" through
    ``str`` or raise ``TypeError`` through ``int``/``float``.
    """
    raw = section.get(key)
    return cast(default if raw is None else raw)


# `or {}` also covers a section that is present but null (assuming load_yaml
# follows standard YAML semantics, a bare `mining:` key loads as None).
rr = cfg.get("reranker") or {}
mining_opts = rr.get("mining") or {}
train_opts = rr.get("train") or {}

rcfg = RerankerConfig(
    # Kept as a plain bool(): a null `enable` still means "disabled".
    enable=bool(rr.get("enable", True)),
    base_model_id=_setting(rr, "base_model_id", "cross-encoder/ms-marco-MiniLM-L-6-v2", str),
    base_model_local_dir=_setting(rr, "base_model_local_dir", "", str),
    output_root=_setting(rr, "output_root", "data/reranker", str),
    mining=RerankerMiningConfig(
        topk_candidates=_setting(mining_opts, "topk_candidates", 30, int),
        negatives_per_pos=_setting(mining_opts, "negatives_per_pos", 4, int),
        min_question_len=_setting(mining_opts, "min_question_len", 8, int),
        max_pairs=_setting(mining_opts, "max_pairs", 2000, int),
        seed=_setting(mining_opts, "seed", 123, int),
    ),
    train=RerankerTrainConfig(
        epochs=_setting(train_opts, "epochs", 1, int),
        batch_size=_setting(train_opts, "batch_size", 16, int),
        lr=_setting(train_opts, "lr", 2.0e-5, float),
        warmup_steps=_setting(train_opts, "warmup_steps", 50, int),
        eval_ratio=_setting(train_opts, "eval_ratio", 0.1, float),
        seed=_setting(train_opts, "seed", 42, int),
    ),
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
# 3) train
# Run reranker training for this document with the config built above. The
# returned job record is kept because the eval cell reads job["model_path"].
job = train_reranker(
    doc_id=doc_id,
    artifacts_root=artifacts_root,
    cfg=rcfg,
)
# NOTE(review): in the recorded output the job "id" differs from the last
# component of "run_dir" -- confirm in train_reranker whether that is intended.
job

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

{'id': 'd6810a4c',
 'doc_id': 'NFS_2019',
 'created_at': 1758292502,
 'status': 'COMPLETED',
 'run_dir': 'data\\reranker\\NFS_2019\\d22b1ac3',
 'model_path': 'data\\reranker\\NFS_2019\\d22b1ac3\\model',
 'base_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
 'train_stats': {'train_pairs': 63, 'eval_pairs': 7}}

In [5]:
# 4) quick eval
from pathlib import Path
# job["model_path"] is wrapped in a Path before being handed to the evaluator.
trained_model_dir = Path(job["model_path"])
metrics = eval_mrr_at_10(doc_id, artifacts_root, trained_model_dir)
# NOTE(review): the recorded run reports n=7, so the score is based on very few
# eval pairs -- treat it as a smoke test rather than a quality measurement.
metrics

{'mrr@10': 1.0, 'n': 7}