### **01) Paths & sanity checks**

In [1]:
from pathlib import Path
import sys, os, platform

# Resolve the project root. The notebook may be started from the repository
# root (the folder that holds `src/`) or from a folder directly beneath it.
CWD = Path.cwd().resolve()
if (CWD / "src").exists():
    ROOT = CWD
else:
    ROOT = CWD.parent

# Put the project root on the import path (once) so project modules resolve.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Data locations used by the cells below.
DATA = ROOT.joinpath("data")
CHUNKS = DATA.joinpath("chunks")
SFT_DIR = DATA.joinpath("sft")
PAIRS_DIR = DATA.joinpath("pairs")

# Output folder for the fine-tuned reranker; created up front if missing.
OUT_RR = ROOT.joinpath("outputs", "reranker", "title17")
OUT_RR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations and runtime so a wrong ROOT is spotted early.
print(
    f"ROOT: {ROOT}",
    f"CHUNKS: {CHUNKS}",
    f"SFT_DIR: {SFT_DIR}",
    f"PAIRS_DIR: {PAIRS_DIR}",
    f"OUT_RR: {OUT_RR}",
    f"Python: {platform.python_version()} | CPU threads: {os.cpu_count()}",
    sep="\n",
)

ROOT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent
CHUNKS: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks
SFT_DIR: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft
PAIRS_DIR: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs
OUT_RR: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\reranker\title17
Python: 3.11.13 | CPU threads: 8


### **Format of the SFT rows**

In [2]:
import json, itertools, pathlib
# Preview the first three rows of the SFT training file: print (up to) the
# first ten keys of each JSON object together with its Python type.
fp = SFT_DIR / "train.jsonl"
print("showing 3 rows from", fp)
# Use a context manager so the file handle is closed as soon as the preview
# is done, instead of staying open for the lifetime of the kernel.
with open(fp, "r", encoding="utf-8") as fh:
    for line in itertools.islice(fh, 3):
        obj = json.loads(line)
        print(list(obj.keys())[:10], "→ type:", type(obj))

showing 3 rows from D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\train.jsonl
['messages', 'response'] → type: <class 'dict'>
['messages', 'response'] → type: <class 'dict'>
['messages', 'response'] → type: <class 'dict'>


### **02) Mining positive/negative pairs**

In [None]:
import subprocess, sys, shlex
# Run the pair-mining script in a child interpreter (same Python as the kernel)
# and surface its result in the notebook.
args = [
    sys.executable, str(ROOT / "src/train/retriever_pairs.py"),
    "--chunks_dir", str(CHUNKS),
    "--train_jsonl", str(SFT_DIR / "train.jsonl"),
    "--dev_jsonl",   str(SFT_DIR / "dev.jsonl"),
    "--out_dir",     str(PAIRS_DIR),
    "--topk", "30",
    "--neg_per_q", "4",
    "--min_query_len", "12",
]
print("Launching:", " ".join(map(shlex.quote, args)))
result = subprocess.run(args, capture_output=True, text=True)
print(result.stdout)
# A non-zero exit used to pass silently because only stdout was shown. Show the
# captured stderr and stop, so later cells do not run on missing or stale pairs.
if result.returncode != 0:
    print(result.stderr, file=sys.stderr)
    raise RuntimeError(f"Pair mining failed (exit code {result.returncode})")


Launching: 'd:\Anaconda\envs\pdf-agent-2\python.exe' 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\src\train\retriever_pairs.py' --chunks_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks' --train_jsonl 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\train.jsonl' --dev_jsonl 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\dev.jsonl' --out_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs' --topk 30 --neg_per_q 4 --min_query_len 12
[info] loaded 2683 chunks
[stats] kept=130/130 | skipped_short=0 | page=0 | substr=2 | top1=128
[OK] wrote 130 train pairs → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs\train.pairs.jsonl
[stats] kept=11/11 | skipped_short=0 | page=0 | substr=0 | top1=11
[OK] wrote 11 dev pairs → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs\dev.pairs.jsonl



In [None]:
import json, itertools
# Show the first two mined pairs from each split that exists on disk.
train_pairs = PAIRS_DIR / "train.pairs.jsonl"
dev_pairs   = PAIRS_DIR / "dev.pairs.jsonl"
for p in (train_pairs, dev_pairs):
    if p.exists():
        print(f"\n{p}:")
        # Context manager: the handle is closed after the two-row preview
        # rather than being left open by the bare open() call.
        with open(p, "r", encoding="utf-8") as fh:
            for line in itertools.islice(fh, 2):
                print(json.loads(line))



D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs\train.pairs.jsonl:
{'query': '(factual) What is the main rule in Copyright Law United States Copyri > § 115 · Scope of exclusive rights in nondramatic musical works: Compulsory license for making and distributing phonorecords 50?\nPrefer concise, correct answers. End with [pp. 88–92].', 'positive': '§ 115 · Scope of exclusive rights in nondramatic musical works: Compulsory license for making and distributing phonorecords 50', 'negatives': ['t programming by cable … | . . 31 |\n| 112 | Limitations on exclusive rights: Ephemeral recordings … | . . 43 |\n| 113 | Scope of exclusive rights in pictorial, graphic, and sculptural works | . . 48 |\n| 114 | Scope of exclusive rights in sound recordings … | . . 50 |\n| 115 | Scope of exclusive rights in nondramatic musical works: Compulsory license for making and distributing phonorecords . . | . . 68 |\n| 116 | Negotiated licenses for public performances by means of coin-operated pho

### **03) Pre-downloading the base reranker for offline use**

In [None]:
from huggingface_hub import snapshot_download

# Download the base cross-encoder into the project so training can run offline.
LOCAL_CE = ROOT / "models" / "msmarco-minilm-l6-v2"
LOCAL_CE.mkdir(parents=True, exist_ok=True)

# `local_dir_use_symlinks` is deprecated in recent huggingface_hub releases and
# only triggers a warning that points at the "download files to local folder"
# guide, so it is no longer passed.
# NOTE(review): on an old huggingface_hub that still honours the flag, the
# default may symlink large files into the cache — confirm the pinned version.
local_dir = snapshot_download(
    repo_id="cross-encoder/ms-marco-MiniLM-L-6-v2",
    local_dir=str(LOCAL_CE),
)
print("Local reranker base:", local_dir)


Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


README.md: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

onnx/model_O2.onnx:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

onnx/model_O1.onnx:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

flax_model.msgpack:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/790 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

onnx/model_O3.onnx:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

onnx/model_qint8_arm64.onnx:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

onnx/model_O4.onnx:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

onnx/model_qint8_avx512.onnx:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

onnx/model_qint8_avx512_vnni.onnx:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

onnx/model_quint8_avx2.onnx:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

openvino/openvino_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

openvino_model.xml: 0.00B [00:00, ?B/s]

openvino/openvino_model_qint8_quantized.(…):   0%|          | 0.00/23.1M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Local reranker base: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\msmarco-minilm-l6-v2


### **04) Train reranker (CPU)**

In [None]:
import subprocess, sys, shlex
from pathlib import Path

# Fine-tune the reranker by running the training script in a child interpreter,
# streaming its combined stdout/stderr into the notebook as it is produced.
LOCAL_CE = ROOT / "models" / "msmarco-minilm-l6-v2"
HUB_CE_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# The folder alone proves nothing: the download cell creates it before fetching
# anything, so an interrupted download leaves an empty directory behind. Require
# config.json (one of the snapshot's files) before trusting the local copy;
# otherwise fall back to the hub id.
has_local_snapshot = (LOCAL_CE / "config.json").is_file()
base_ce_id = str(LOCAL_CE) if has_local_snapshot else HUB_CE_ID
print("[info] using base CE:", base_ce_id)

args = [
    sys.executable, str(ROOT / "src/train/train_reranker.py"),
    "--train_pairs", str(PAIRS_DIR / "train.pairs.jsonl"),
    "--dev_pairs",   str(PAIRS_DIR / "dev.pairs.jsonl"),
    "--base_ce",     HUB_CE_ID,
    "--local_base_dir", base_ce_id,
    "--out_dir",     str(OUT_RR),
    "--epochs",      "2",
    "--batch_size",  "8",
    "--lr",          "2e-5",
    "--max_len",     "384",
]

print("Launching:", " ".join(map(shlex.quote, args)))
# The context manager closes the pipe and reaps the child even if the cell is
# interrupted while streaming.
with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
    for line in proc.stdout:
        print(line, end="")
    code = proc.wait()
print("\n[proc exit code]", code)
if code != 0:
    raise RuntimeError("Reranker training failed")

[info] using base CE: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\msmarco-minilm-l6-v2
Launching: 'd:\Anaconda\envs\pdf-agent-2\python.exe' 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\src\train\train_reranker.py' --train_pairs 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs\train.pairs.jsonl' --dev_pairs 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs\dev.pairs.jsonl' --base_ce cross-encoder/ms-marco-MiniLM-L-6-v2 --local_base_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\msmarco-minilm-l6-v2' --out_dir 'D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\reranker\title17' --epochs 2 --batch_size 8 --lr 2e-5 --max_len 384
[info] base CE: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\msmarco-minilm-l6-v2
[info] train examples: 650
[info] total_steps=164 | warmup_steps=16
[info] dev queries for rerank eval: 11

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/82 [00:00<?, ?i

### **05) Smoke test: BM25 → rerank top-k**

In [None]:
import json, re
from pathlib import Path
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

def tokenize(s: str):
    """Split text into lower-cased tokens for BM25.

    The input is lower-cased, then every maximal run of ASCII letters, digits
    or underscores becomes one token; everything else acts as a separator.
    A falsy input (``None`` or an empty string) produces an empty list.
    """
    lowered = s.lower() if s else ""
    return [hit.group(0) for hit in re.finditer(r"[A-Za-z0-9_]+", lowered)]

# Load every chunk text from the *.jsonl files under CHUNKS (sorted by file
# name), taking the "text" field or, failing that, "content".
corpus = []
for fp in sorted(CHUNKS.glob("*.jsonl")):
    with open(fp, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not JSON documents; skip them instead of crashing json.loads.
            if not line.strip():
                continue
            obj = json.loads(line)
            text = obj.get("text") or obj.get("content") or ""
            if text.strip():
                corpus.append(text)

print("Loaded chunks:", len(corpus))
# Fail here with a readable message rather than handing BM25 an empty corpus.
if not corpus:
    raise RuntimeError(f"No chunk text found under {CHUNKS}")
bm25 = BM25Okapi([tokenize(t) for t in corpus])

# Load the fine-tuned cross-encoder from the training output folder, on CPU.
reranker = CrossEncoder(str(OUT_RR), device="cpu")

def extract_query_from_row(obj: dict) -> str | None:
    msgs = obj.get("messages")
    if isinstance(msgs, list):
        last_user = ""
        for m in msgs:
            if (m.get("role") or "").lower() == "user":
                last_user = (m.get("content") or "").strip()
        if last_user:
            return last_user
    q = obj.get("instruction") or obj.get("question")
    if isinstance(q, str) and len(q.strip()) > 0:
        return q.strip()
    return None

def search(query: str, k_bm25=20, k_final=3):
    """Retrieve with BM25, then re-order the shortlist with the cross-encoder.

    Args:
        query: Free-text query.
        k_bm25: Number of top BM25 candidates handed to the reranker.
        k_final: Number of reranked results to return.

    Returns:
        Up to ``k_final`` ``(chunk_text, reranker_score)`` tuples, highest
        reranker score first.
    """
    lexical_scores = bm25.get_scores(tokenize(query))

    # Rank every document index by its BM25 score, best first.
    doc_order = list(range(len(corpus)))
    doc_order.sort(key=lambda doc_id: lexical_scores[doc_id], reverse=True)
    shortlist = [corpus[doc_id] for doc_id in doc_order[:k_bm25]]

    # Score each (query, passage) pair with the cross-encoder.
    rerank_scores = reranker.predict([[query, passage] for passage in shortlist])

    hits = list(zip(shortlist, rerank_scores))
    hits.sort(key=lambda hit: hit[1], reverse=True)
    return hits[:k_final]

# Pick a sample query: the first row of test.jsonl whose extracted query is
# longer than 30 characters; otherwise fall back to a fixed question.
test_json = SFT_DIR / "test.jsonl"
sample_q = None
if test_json.exists():
    with open(test_json, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: skip lines that are not valid JSON, without
                # hiding unrelated errors behind a blanket `except Exception`.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object (list, number, ...) has no
                # query fields; skip it instead of crashing on `.get`.
                continue
            q = extract_query_from_row(obj)
            if q and len(q) > 30:
                sample_q = q
                break

sample_q = sample_q or "What does § 114 say about performance rights? End with [pp. 67–88]."
print("\nQuery:", sample_q)
# Print each reranked hit with its score and the first 800 characters of text.
for i, (text, s) in enumerate(search(sample_q), 1):
    print(f"\n#{i}  score={float(s):.4f}\n{text[:800]}…")


  from tqdm.autonotebook import tqdm, trange


Loaded chunks: 2683

Query: Summarize the section:
Heading: Copyright Law United States Copyri > (E) Musical works database .-
Summarize as 12–15 detailed bullet points. End with [pp. 98–134].

#1  score=1.8890
Copyright Law United States Copyri…

#2  score=-0.2237
(E) Musical works database .-…

#3  score=-2.9829
- (e) Copyright Office Activities .-The Register of Copyrights shall engage in public outreach and educational activities-
- (1) regarding the amendments made by subsection (a) to section 115 of title 17, United States Code, including the responsibilities of the mechanical licensing collective designated under those amendments;
- (2) which shall include educating songwriters and other interested parties with respect to the process established under section 115(d)(3)(C)(i)(V) of title 17, United States Code, as added by subsection (a), by which-
- (A) a copyright owner may claim ownership of musical works (and shares of such works); and
- (B) royalties for works for which the 