### **Setup**

In [1]:
from pathlib import Path
import sys, os, platform, json

# Locate the project root: the working directory itself when it holds "src",
# otherwise its parent (the notebook is then assumed to sit one level down).
CWD = Path.cwd().resolve()
ROOT = CWD.parent
if (CWD / "src").exists():
    ROOT = CWD

# Make `src.*` importable from the notebook without adding a duplicate entry.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Input data and output artifact locations, all derived from ROOT.
DATA = ROOT.joinpath("data")
CHUNKS = DATA.joinpath("chunks")
GRAPH_OUT = ROOT.joinpath("outputs", "graph", "graph")
PAIRS_DIR = DATA.joinpath("pairs")
OUT_RR = ROOT.joinpath("outputs", "reranker", "title17")

# Echo the resolved locations so a wrong root is obvious at a glance.
for label, location in (
    ("ROOT:", ROOT),
    ("CHUNKS:", CHUNKS),
    ("GRAPH_OUT:", GRAPH_OUT),
    ("PAIRS_DIR:", PAIRS_DIR),
    ("RERANKER:", OUT_RR),
):
    print(label, location)

ROOT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent
CHUNKS: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks
GRAPH_OUT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\graph\graph
PAIRS_DIR: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\pairs
RERANKER: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\reranker\title17


### **Load retriever**

In [2]:
from src.serve.retriever import GraphRetriever

# Instantiate the graph retriever from the chunk, graph and reranker
# directories; the paths are handed over as plain strings.
retr = GraphRetriever(
    chunks_dir=os.fspath(CHUNKS),
    graph_dir=os.fspath(GRAPH_OUT),
    reranker_dir=os.fspath(OUT_RR),
)
print("retriever ready.")


  from tqdm.autonotebook import tqdm, trange


retriever ready.


### **Quick manual queries**

In [3]:
# Ad-hoc query to eyeball what the retriever returns.
q = "Summarize § 114 performance rights caveat. End with [pp. 67–88]."
hits = retr.search(
    q,
    k_nodes=40,
    k_final_nodes=6,
    k_each_node=12,
    k_final_chunks=5,
)
for i, h in enumerate(hits, start=1):
    # Show at most 650 characters of each hit, marking truncation with an ellipsis.
    if len(h["text"]) > 650:
        snip = h["text"][:650] + "…"
    else:
        snip = h["text"]
    print(f"\n#{i}  score={h['score']:.3f}  node={h['node_id']}  chunk={h['chunk_id']}  pages={h['pages']}  section={h['section']}\n{snip}")



#1  score=0.507  node=SEC-00017  chunk=title17-h-342  pages=[]  section=
§ 114 · Scope of exclusive rights in sound recordings 48

#2  score=-0.890  node=SEC-00019  chunk=title17-h-689  pages=[]  section=
§ 116 · Negotiated licenses for public performances by means of coin-operated phonorecord players 53

#3  score=-1.469  node=SEC-00016  chunk=title17-h-328  pages=[]  section=
§ 113 · Scope of exclusive rights in pictorial, graphic, and sculptural works 47

#4  score=-1.862  node=SEC-00015  chunk=title17-h-307  pages=[]  section=
§ 112 · Limitations on exclusive rights: Ephemeral recordings 46

#5  score=-1.915  node=SEC-00013  chunk=title17-h-220  pages=[]  section=
§ 110 · Limitations on exclusive rights: Exemption of certain performances and displays 43


### **Retrieval metrics on dev.pairs.jsonl**

In [None]:
import json, statistics

# Read the dev split: one JSON object per line of dev.pairs.jsonl.
with open(PAIRS_DIR / "dev.pairs.jsonl", "r", encoding="utf-8") as fh:
    dev_pairs = [json.loads(raw) for raw in fh]

def eval_retrieval(pairs, k_nodes=40, k_final_nodes=6, k_each_node=12, k_final_chunks=10, retriever=None):
    """Score retrieval quality on labelled query/positive pairs.

    Each row's ``"query"`` is searched and the 1-based rank of the first
    matching hit is recorded. A hit matches when its ``"chunk_id"`` equals
    ``row["meta"]["pos_chunk_id"]``. Rows without that id fall back to a
    substring test: the first 50 characters of the stripped
    ``row["positive"]`` must occur in the hit's ``"text"``.

    Args:
        pairs: Sized collection of dict rows with a ``"query"`` key and
            optional ``"meta"`` / ``"positive"`` keys.
        k_nodes, k_final_nodes, k_each_node, k_final_chunks: Forwarded to
            ``search`` as keyword arguments of the same names.
        retriever: Object exposing ``search(query, k_nodes=..., ...)`` that
            returns a sequence of hit dicts. Defaults to the notebook-level
            ``retr``.

    Returns:
        dict with ``n`` (number of rows), ``top1``, ``recall@10`` and
        ``mrr@10`` (each averaged over all rows, 0.0 when there are none),
        and ``median_rank`` (median over the rows whose positive was found
        at any rank, or ``None`` if none was found).
    """
    cutoff = 10  # fixed by the "@10" metric names
    searcher = retr if retriever is None else retriever

    top1_hits = 0
    recall_hits = 0
    reciprocal_sum = 0.0
    ranks = []

    for row in pairs:
        pos_chunk_id = (row.get("meta") or {}).get("pos_chunk_id")
        # Bind the text hint on every row so it can never be unbound or left
        # over from an earlier row; it is only consulted when there is no id.
        pos_hint = "" if pos_chunk_id else (row.get("positive") or "").strip()
        needle = pos_hint[:50]

        # Keyword arguments keep the call independent of the positional order
        # of the retriever's ``search`` signature.
        hits = searcher.search(
            row["query"],
            k_nodes=k_nodes,
            k_final_nodes=k_final_nodes,
            k_each_node=k_each_node,
            k_final_chunks=k_final_chunks,
        )

        rank = None
        for position, hit in enumerate(hits, 1):
            if pos_chunk_id:
                matched = hit["chunk_id"] == pos_chunk_id
            else:
                matched = bool(needle) and needle in hit["text"]
            if matched:
                rank = position
                break

        if rank is None:
            continue

        ranks.append(rank)
        if rank == 1:
            top1_hits += 1
        if rank <= cutoff:
            # Only ranks inside the cutoff count towards recall@10 and MRR@10;
            # a positive found deeper contributes zero to both.
            recall_hits += 1
            reciprocal_sum += 1.0 / rank

    n = len(pairs)
    return {
        "n": n,
        "top1": top1_hits / n if n else 0.0,
        "recall@10": recall_hits / n if n else 0.0,
        "mrr@10": reciprocal_sum / n if n else 0.0,
        "median_rank": (statistics.median(ranks) if ranks else None),
    }

# Evaluate on the dev pairs using eval_retrieval's default search settings
# and print the resulting metrics dict.
metrics = eval_retrieval(dev_pairs)
print(metrics)


{'n': 11, 'top1': 0.6363636363636364, 'recall@10': 0.7272727272727273, 'mrr@10': 0.6818181818181818, 'median_rank': 1.0}
