### **Setup & paths (strictly offline)**

In [None]:
from pathlib import Path
import sys, os, platform, json, re

# Project root: the working directory when it contains `src`, otherwise its parent.
CWD = Path.cwd().resolve()
if (CWD / "src").exists():
    ROOT = CWD
else:
    ROOT = CWD.parent

# Make the project root importable (appended once, so re-running the cell is harmless).
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Input data locations.
DATA       = ROOT.joinpath("data")
SFT_DIR    = DATA.joinpath("sft")
CHUNKS_DIR = DATA.joinpath("chunks")

# Artefact locations under `outputs/`.
GRAPH_DIR   = ROOT.joinpath("outputs", "graph", "graph")
RERANK_DIR  = ROOT.joinpath("outputs", "reranker", "title17")
MERGED_QWEN = ROOT.joinpath("outputs", "lora_hf", "title17_merged")
OUT_DIR     = ROOT.joinpath("outputs", "eval", "ragas")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Environment flags are set before transformers/tokenizers are imported further down.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Echo the resolved configuration so a reader can verify paths at a glance.
print(f"ROOT     : {ROOT}")
print(f"SFT_DIR  : {SFT_DIR}")
print(f"CHUNKS   : {CHUNKS_DIR}")
print(f"GRAPH    : {GRAPH_DIR}")
print(f"RERANKER : {RERANK_DIR}")
print(f"Qwen     : {MERGED_QWEN}")
print(f"Python   : {platform.python_version()} | CPU threads: {os.cpu_count()}")


ROOT     : D:\IIT BBS\Job Resources\Business Optima\pdf-agent
SFT_DIR  : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft
CHUNKS   : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks
GRAPH    : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\graph\graph
RERANKER : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\reranker\title17
Qwen     : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17_merged
Python   : 3.11.13 | CPU threads: 8


### **Hierarchical retriever (BM25 -> CE)**

In [None]:
import json
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

# --- Graph hierarchy: `nodes` is the list stored under the "nodes" key; each
# entry is read below for its "id" and (optional) "chunk_ids".
hier = json.loads((GRAPH_DIR / "hierarchy.json").read_text(encoding="utf-8"))
nodes = hier["nodes"]

# --- Per-node records (JSONL with "node_id", "text", "name").
# The file is opened in a `with` block so the handle is closed deterministically,
# and blank lines are skipped instead of crashing json.loads.
with open(GRAPH_DIR / "node_texts.jsonl", "r", encoding="utf-8") as f:
    node_records = [json.loads(line) for line in f if line.strip()]
node_text_by_id = {r["node_id"]: r["text"] for r in node_records}
node_name_by_id = {r["node_id"]: r["name"] for r in node_records}

# --- Chunk store, keyed by chunk id.
chunk_text   = {}   # chunk id -> text
chunk_pages  = {}   # chunk id -> value of "pages" (or [])
chunk_sec    = {}   # chunk id -> value of "section" (or "")
n_skipped_lines = 0
for fp in sorted(CHUNKS_DIR.glob("*.jsonl")):
    with open(fp, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except Exception:
                # Best-effort load: a malformed line is skipped, but counted so
                # data corruption does not go unnoticed.
                n_skipped_lines += 1
                continue
            if not isinstance(obj, dict):
                n_skipped_lines += 1
                continue
            # Chunk files may use either key spelling for the id and the text.
            cid = obj.get("id") or obj.get("chunk_id")
            if not cid:
                continue
            txt = obj.get("text") or obj.get("content") or ""
            if not txt.strip():
                continue
            chunk_text[cid]  = txt
            chunk_pages[cid] = obj.get("pages") or []
            chunk_sec[cid]   = obj.get("section") or ""
if n_skipped_lines:
    print(f"[warn] skipped {n_skipped_lines} malformed line(s) while loading chunks")

# --- Node id -> list of chunk ids (empty list when the node has none).
node2chunks = {nd["id"]: (nd.get("chunk_ids") or []) for nd in nodes}

def tok(s:str):
    """Split `s` into lower-cased tokens made of ASCII letters, digits and underscores.

    A falsy input (``None`` or ``""``) yields an empty list.
    """
    lowered = s.lower() if s else ""
    return re.findall(r"[A-Za-z0-9_]+", lowered)

# Parallel lists: node_ids[i] is the id whose text is node_texts[i].
node_ids = list(r["node_id"] for r in node_records)
node_texts = list(map(node_text_by_id.__getitem__, node_ids))

# First-stage lexical index over the tokenised node texts.
bm25_nodes = BM25Okapi(list(map(tok, node_texts)))

# Cross-encoder loaded from the local reranker directory, on CPU.
reranker = CrossEncoder(str(RERANK_DIR), device="cpu")

def _bm25_tokens(text: str) -> list:
    """Lower-case `text` and split it into letter/digit/underscore tokens for BM25."""
    return re.findall(r"[A-Za-z0-9_]+", (text or "").lower())


def search_hier(
    query: str,
    k_nodes: int = 40,
    k_final_nodes: int = 6,
    k_each_node: int = 12,
    k_final_chunks: int = 8
):
    """Two-stage hierarchical retrieval: BM25 then cross-encoder, for nodes and then chunks.

    Steps:
      1. BM25 over node texts keeps the top `k_nodes` nodes.
      2. The cross-encoder re-scores those nodes; the top `k_final_nodes` survive.
      3. Inside each surviving node, a BM25 index built over that node's chunks
         keeps the top `k_each_node` chunks.
      4. The cross-encoder re-scores all collected chunks; the top
         `k_final_chunks` are returned.

    Tokenisation uses the private `_bm25_tokens` helper rather than the
    module-level name `tok`, because `tok` is rebound to the Hugging Face
    tokenizer later in the notebook; calling it here after that point would feed
    BM25 the encoding's keys instead of query words.

    Returns:
        A list of dicts (keys: "chunk_id", "node_id", "node_name", "text",
        "pages", "section", "score") sorted by descending cross-encoder score.
        Empty when there are no candidate nodes or no candidate chunks.
    """
    query_tokens = _bm25_tokens(query)

    # Stage 1: lexical shortlist of nodes.
    scores = bm25_nodes.get_scores(query_tokens)
    cand_node_idx = sorted(range(len(node_ids)), key=lambda i: scores[i], reverse=True)[:k_nodes]
    cand_nodes = [(node_ids[i], node_texts[i]) for i in cand_node_idx]
    if not cand_nodes:
        # Nothing indexed: avoid calling the cross-encoder on an empty batch.
        return []

    # Stage 2: cross-encoder re-ranking of the shortlisted nodes.
    ce_node_scores = reranker.predict([[query, t] for _, t in cand_nodes])
    node_order = sorted(range(len(cand_nodes)), key=lambda j: ce_node_scores[j], reverse=True)
    ranked_node_ids = [cand_nodes[j][0] for j in node_order[:k_final_nodes]]

    # Stage 3: per-node lexical shortlist of chunks.
    final_cands = []
    for nid in ranked_node_ids:
        # Only chunks whose text was actually loaded can be scored.
        cids = [cid for cid in node2chunks.get(nid, []) if cid in chunk_text]
        if not cids:
            continue
        bm25_local = BM25Okapi([_bm25_tokens(chunk_text[c]) for c in cids])
        local_scores = bm25_local.get_scores(query_tokens)
        local_idx = sorted(range(len(cids)), key=lambda i: local_scores[i], reverse=True)[:k_each_node]
        for i in local_idx:
            cid = cids[i]
            final_cands.append({
                "chunk_id": cid,
                "node_id": nid,
                "node_name": node_name_by_id.get(nid, nid),
                "text": chunk_text[cid],
                "pages": chunk_pages.get(cid) or [],
                "section": chunk_sec.get(cid) or "",
            })

    if not final_cands:
        return []

    # Stage 4: cross-encoder re-ranking of the collected chunks.
    ce_scores = reranker.predict([[query, c["text"]] for c in final_cands])
    for c, s in zip(final_cands, ce_scores):
        c["score"] = float(s)
    final = sorted(final_cands, key=lambda x: x["score"], reverse=True)[:k_final_chunks]
    return final

# Smoke test: run one query through the retriever and show each hit,
# truncating long chunk texts to 600 characters.
q = "Summarize § 114 performance rights caveat. End with [pp. 67–88]."
hits = search_hier(q, k_nodes=40, k_final_nodes=6, k_each_node=12, k_final_chunks=5)
for i, h in enumerate(hits, 1):
    snip = h["text"]
    if len(snip) > 600:
        snip = snip[:600] + "…"
    header = f"\n#{i}  score={h['score']:.3f}  node={h['node_id']}  chunk={h['chunk_id']}  pages={h['pages']}"
    print(f"{header}\n{snip}")


  from tqdm.autonotebook import tqdm, trange



#1  score=0.507  node=SEC-00017  chunk=title17-h-342  pages=[]
§ 114 · Scope of exclusive rights in sound recordings 48

#2  score=-0.890  node=SEC-00019  chunk=title17-h-689  pages=[]
§ 116 · Negotiated licenses for public performances by means of coin-operated phonorecord players 53

#3  score=-1.469  node=SEC-00016  chunk=title17-h-328  pages=[]
§ 113 · Scope of exclusive rights in pictorial, graphic, and sculptural works 47

#4  score=-1.862  node=SEC-00015  chunk=title17-h-307  pages=[]
§ 112 · Limitations on exclusive rights: Ephemeral recordings 46

#5  score=-1.915  node=SEC-00013  chunk=title17-h-220  pages=[]
§ 110 · Limitations on exclusive rights: Exemption of certain performances and displays 43


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: this rebinds the name `tok`, which an earlier cell defines as a regex
# word-splitting function. Any code that calls `tok(...)` after this cell runs
# gets the Hugging Face tokenizer instead.
tok = AutoTokenizer.from_pretrained(str(MERGED_QWEN), local_files_only=True, trust_remote_code=True)
if tok.pad_token is None:
    # Fall back to the EOS token when the tokenizer defines no pad token.
    tok.pad_token = tok.eos_token

# Merged model loaded from local files only, in float32, placed entirely on CPU.
model = AutoModelForCausalLM.from_pretrained(
    str(MERGED_QWEN),
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
    trust_remote_code=True,
    local_files_only=True,
)
model.eval()

# System instruction prepended to every prompt built by build_prompt().
SYS_PROMPT = (
    "You answer strictly using the provided CONTEXTS. "
    "Cite page numbers if present. "
    "If the answer is not in the contexts, say you don't know."
)

def build_prompt(question: str, contexts: list[str]) -> str:
    """Assemble the plain-text prompt: system line, numbered contexts, then the user turn.

    Contexts are labelled ``[CTX 1]``, ``[CTX 2]``, ... in the given order and
    separated by blank lines. The prompt ends with ``Assistant:`` so the model
    continues from there.
    """
    numbered = []
    for idx, ctx in enumerate(contexts, start=1):
        numbered.append(f"[CTX {idx}]\n{ctx}")
    ctx_block = "\n\n".join(numbered)

    sections = [
        f"System: {SYS_PROMPT}",
        f"CONTEXTS:\n{ctx_block}",
        f"User: {question}\nAssistant:",
    ]
    return "\n\n".join(sections)

@torch.no_grad()
def answer_with_contexts(question: str, contexts: list[str], max_new_tokens=220) -> str:
    """Generate an answer to `question` using `contexts`, with greedy decoding.

    Args:
        question: the user question.
        contexts: context passages placed in the prompt, in order.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The generated continuation only, stripped of surrounding whitespace.

    Only the tokens produced after the prompt are decoded. Splitting the full
    decoded text on the first "Assistant:" is unreliable: when the question or
    a context itself contains that marker, the split happens inside the prompt
    and prompt text leaks into the answer.
    """
    prompt = build_prompt(question, contexts)
    inputs = tok(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tok.eos_token_id,
        eos_token_id=tok.eos_token_id,
    )
    # generate() on a causal LM returns prompt + continuation; drop the prompt part.
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import itertools

# Held-out SFT examples used as the evaluation set; fail fast when the file is absent.
TEST_JSONL = SFT_DIR.joinpath("test.jsonl")
assert TEST_JSONL.exists(), f"Missing {TEST_JSONL}"

def extract_q_gt(row: dict) -> tuple[str|None, str|None]:
    msgs = row.get("messages")
    if isinstance(msgs, list):
        q = None
        for m in msgs:
            if (m.get("role") or "").lower() == "user":
                q = (m.get("content") or "").strip()
        gt = (row.get("response") or "").strip()
        return (q, gt) if q else (None, None)
    q = (row.get("instruction") or row.get("question") or "").strip()
    gt = (row.get("response") or row.get("answer") or "").strip()
    return (q if q else None, gt if gt else None)

# Collect (question, ground_truth) pairs; lines that fail to parse are skipped.
rows = []
with TEST_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        try:
            obj = json.loads(line)
        except Exception:
            continue
        q, gt = extract_q_gt(obj)
        if not (q and gt):
            continue
        # Keep only questions longer than 12 characters.
        if len(q) <= 12:
            continue
        rows.append((q, gt))

# Evaluate on at most the first 12 usable rows.
sample = rows[:12]
N = len(sample)
print(f"Eval examples: {len(sample)}")

def _eval_one(question, ground_truth):
    """Retrieve contexts for `question`, generate an answer, and package one eval record."""
    retrieved = search_hier(question, k_nodes=40, k_final_nodes=6, k_each_node=12, k_final_chunks=5)
    ctx_texts = [hit["text"] for hit in retrieved]
    return {
        "question": question,
        "contexts": ctx_texts,
        "answer": answer_with_contexts(question, ctx_texts),
        "ground_truth": ground_truth,
    }

# One record per sampled example, in sample order.
eval_records = [_eval_one(q, gt) for q, gt in sample]

# Preview the first two records (truncated) as a sanity check.
for i, r in enumerate(eval_records[:2], 1):
    print(f"\n[{i}] Q: {r['question'][:120]}...")
    print(f"Answer: {r['answer'][:200]} ...")
    print(f"GT   : {r['ground_truth'][:200]} ...")
    print(f"#ctx : {len(r['contexts'])}")


Eval examples: 12


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'


[1] Q: Summarize the section:
Heading: Copyright Law United States Copyri > (E) Musical works database .-
Summarize as 12–15 de...
Answer: - The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings ...
GT   : - - The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings.
- - The collect ...
#ctx : 5

[2] Q: Summarize the section:
Heading: Copyright Law United States Copyri > (g) Encryption Research. -
Give a brief 2–3 sentenc...
Answer: The passage outlines provisions related to encryption research and its distribution under certain conditions. Noncommercial distributions of unpublished works by researchers are exempt if: The work wa ...
GT   : Encryption research refers to activities that identify and analyze flaws in encryp

In [None]:
import subprocess, sys

def _pip_install(pkg: str):
    """Install `pkg` with pip into the interpreter that runs this kernel.

    pip's stdout is printed as before. Because pip runs with ``-q``, a failure
    produces little or no stdout, so the exit code is checked and stderr is
    printed on failure instead of being discarded.

    Args:
        pkg: a pip requirement specifier, e.g. ``"ragas==0.1.9"``.

    Returns:
        True when pip exited with code 0, False otherwise.
    """
    proc = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", pkg],
        capture_output=True,
        text=True,
    )
    print(proc.stdout)
    if proc.returncode != 0:
        print(f"[warn] pip install {pkg!r} failed (exit code {proc.returncode}):\n{proc.stderr}")
    return proc.returncode == 0

# Resolve the LangChain wrapper class for a transformers pipeline.
# Already-installed packages are tried first; pip is invoked only when neither
# import works, so a correctly provisioned environment never touches the network.
try:
    from langchain_huggingface import HuggingFacePipeline as HF_PIPELINE_WRAPPER
except Exception:
    try:
        from langchain_community.llms import HuggingFacePipeline as HF_PIPELINE_WRAPPER
    except Exception:
        # Last resort: install, preferring the dedicated langchain-huggingface package.
        _pip_install("langchain-huggingface>=0.1.0")
        try:
            from langchain_huggingface import HuggingFacePipeline as HF_PIPELINE_WRAPPER
        except Exception:
            _pip_install("langchain-community==0.3.2")
            from langchain_community.llms import HuggingFacePipeline as HF_PIPELINE_WRAPPER

from transformers import pipeline

# Text-generation pipeline around the already-loaded model and tokenizer:
# greedy decoding, at most 256 new tokens, and only the continuation is
# returned (return_full_text=False).
gen_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tok,
    return_full_text=False,
    do_sample=False,
    max_new_tokens=256,
    eos_token_id=tok.eos_token_id,
    pad_token_id=tok.eos_token_id,
)

# LangChain-compatible LLM object wrapping the pipeline.
judge_llm = HF_PIPELINE_WRAPPER(pipeline=gen_pipe)
print("[OK] judge LLM pipeline ready (CPU)")





Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[OK] judge LLM pipeline ready (CPU)


In [None]:
from transformers.utils import logging as hf_logging
# Only errors are logged by transformers from here on.
hf_logging.set_verbosity_error()

# Pin the model's default generation settings to deterministic greedy decoding.
# The sampling-only knobs (temperature / top_p / top_k) are unset (None) rather
# than given numeric values: with do_sample=False they are unused, and leaving
# non-default values in place is what triggers the "generation flags are not
# valid and may be ignored" warning.
try:
    model.generation_config.do_sample = False
    model.generation_config.num_beams = 1
    model.generation_config.temperature = None
    model.generation_config.top_p = None
    model.generation_config.top_k = None
    print("[OK] generation_config pinned")
except Exception as e:
    print("[warn] couldn't pin generation_config:", e)


[OK] generation_config pinned


In [None]:
import os, sys, subprocess, json, re
# Opt out of ragas usage tracking before ragas is imported.
# RAGAS_DO_NOT_TRACK is the flag ragas documents for this; the original
# RAGAS_DISABLE_TELEMETRY assignment is kept for backward compatibility.
os.environ["RAGAS_DO_NOT_TRACK"] = "true"
os.environ["RAGAS_DISABLE_TELEMETRY"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def _pip_install(pkg: str):
    """Install `pkg` with pip into the interpreter that runs this kernel.

    pip's stdout is printed as before. Because pip runs with ``-q``, a failure
    produces little or no stdout, so the exit code is checked and stderr is
    printed on failure instead of being discarded.

    Args:
        pkg: a pip requirement specifier, e.g. ``"ragas==0.1.9"``.

    Returns:
        True when pip exited with code 0, False otherwise.
    """
    proc = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", pkg],
        capture_output=True,
        text=True,
    )
    print(proc.stdout)
    if proc.returncode != 0:
        print(f"[warn] pip install {pkg!r} failed (exit code {proc.returncode}):\n{proc.stderr}")
    return proc.returncode == 0

# --- deps
# Import ragas; if the import fails for any reason, install a pinned version
# and retry (a second failure propagates).
# NOTE(review): the fallback pins ragas 0.1.9, while evaluate() further down is
# called with `batch_size` and `show_progress` - confirm that version accepts
# those keywords.
try:
    import ragas
except Exception:
    _pip_install("ragas==0.1.9")
    import ragas

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import context_precision, context_recall

# ---- embeddings
# Choose an embedding backend:
#   1. a sentence-embedding model stored under ROOT/models, when that folder exists;
#   2. otherwise (or when loading it fails) a dependency-free hashed bag-of-words.
embeddings = None
LOCAL_EMB = ROOT / "models" / "all-MiniLM-L6-v2"
try:
    if LOCAL_EMB.exists():
        try:
            from langchain_community.embeddings import HuggingFaceEmbeddings
        except Exception:
            _pip_install("langchain-community==0.3.2")
            from langchain_community.embeddings import HuggingFaceEmbeddings
        embeddings = HuggingFaceEmbeddings(model_name=str(LOCAL_EMB))
        print("[info] using local embeddings:", LOCAL_EMB)
    else:
        # Jump to the hashed fallback in the except branch below.
        raise FileNotFoundError
except Exception:
    import zlib
    import numpy as np
    from ragas.embeddings.base import BaseRagasEmbeddings
    class LocalHashEmbeddings(BaseRagasEmbeddings):
        """Feature-hashing bag-of-words embeddings that need no model files.

        Each lower-cased letter/digit/underscore token increments one of `dim`
        buckets; the resulting count vector is L2-normalised (an all-zero
        vector is returned unchanged for text with no tokens).

        Buckets are chosen with CRC32 of the UTF-8 token rather than the
        built-in ``hash()``: string hashes are randomised per Python process,
        which would make the vectors, and every score derived from them,
        change between kernel restarts.
        """
        def __init__(self, dim: int = 512):
            self.dim = dim
            self._rx = re.compile(r"[A-Za-z0-9_]+")
        def _v(self, text: str):
            """Return the normalised hashed count vector for `text` as a list of floats."""
            v = np.zeros(self.dim, dtype=np.float32)
            for t in self._rx.findall((text or "").lower()):
                v[zlib.crc32(t.encode("utf-8")) % self.dim] += 1.0
            n = np.linalg.norm(v)
            if n > 0: v /= n
            return v.astype(np.float32).tolist()
        def embed_documents(self, texts): return [self._v(t) for t in texts]
        def embed_query(self, text):      return self._v(text)
        # The async variants simply delegate to the synchronous implementations.
        async def aembed_documents(self, texts): return self.embed_documents(texts)
        async def aembed_query(self, text):      return self.embed_query(text)
    embeddings = LocalHashEmbeddings(dim=512)
    print("[info] using offline hashed embeddings (no OpenAI)")

# Both inputs come from earlier cells; fail with a clear message when they were not run.
assert "eval_records" in globals() and len(eval_records) > 0, "Run the earlier cells that build eval_records."
assert "judge_llm" in globals(), "Run the earlier cell that builds judge_llm."
ds = Dataset.from_list(eval_records)

metrics = [context_precision, context_recall]

# RunConfig has lived in different modules; fall back to a minimal stand-in
# exposing the two attributes set below when neither import works.
try:
    from ragas.run_config import RunConfig
except Exception:
    try:
        from ragas.config import RunConfig
    except Exception:
        class RunConfig:
            def __init__(self, timeout=None, max_workers=None):
                self.timeout = timeout
                self.max_workers = max_workers

# Single worker and a generous timeout, configured for the CPU-hosted model.
rc = RunConfig(timeout=600, max_workers=1)

# context_precision and context_recall are judged by an LLM. The local
# judge_llm is passed explicitly: when `llm` is omitted ragas falls back to its
# default LLM, which contradicts the offline setup of this notebook.
result = evaluate(
    ds,
    metrics=metrics,
    llm=judge_llm,
    embeddings=embeddings,
    run_config=rc,
    batch_size=4,
    show_progress=False,
    raise_exceptions=False,
)

# Import pandas, installing it on demand if the import fails.
try:
    import pandas as pd
except Exception:
    _pip_install("pandas>=2.0.0")
    import pandas as pd

# Evaluation result converted to a DataFrame; extra columns are added to it below.
df = result.to_pandas()

import numpy as np
def _norm(v):
    """Return `v` as a float32 array scaled to unit L2 norm (unchanged if its norm is 0)."""
    arr = np.asarray(v, dtype=np.float32)
    length = np.linalg.norm(arr)
    if length == 0:
        return arr
    return arr / length

def _cos(a, b):
    """Cosine similarity of `a` and `b` as a Python float (0.0 when either is all zeros)."""
    unit_a = _norm(a)
    unit_b = _norm(b)
    return float(np.dot(unit_a, unit_b))

def _answer_context_cosine(rec):
    """Cosine similarity between a record's answer and its contexts joined by blank lines."""
    answer_vec = embeddings.embed_query(rec.get("answer") or "")
    context_vec = embeddings.embed_query("\n\n".join(rec.get("contexts") or []))
    return _cos(answer_vec, context_vec)

# NOTE(review): despite the column name, this compares each answer with its
# retrieved contexts, not with the ground truth - confirm that is intended.
ans_sim = [_answer_context_cosine(rec) for rec in eval_records]

df["answer_similarity"] = ans_sim

# Aggregate means
import math

agg = {}
for col in ["context_precision", "context_recall", "answer_similarity"]:
    if col not in df.columns:
        continue
    try:
        # Non-numeric cells are coerced to NaN and ignored by mean().
        agg[col] = float(pd.to_numeric(df[col], errors="coerce").mean())
    except Exception as e:
        # Keep going, but make the missing metric visible instead of dropping it silently.
        print(f"[warn] could not aggregate {col!r}: {e}")

print("\n=== RAG (embedding-only) ===")
for k, v in agg.items():
    print(f"{k:>18s}: {v:.4f}")

OUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_DIR / "ragas_details.csv", index=False, encoding="utf-8")

# A metric whose rows all failed has a NaN mean. json.dumps would write that as
# the bare token NaN, which is not valid JSON, so non-finite values are stored
# as null and allow_nan=False guarantees a strictly valid report.
report = {k: (v if math.isfinite(v) else None) for k, v in agg.items()}
(OUT_DIR / "ragas_report.json").write_text(
    json.dumps(report, indent=2, allow_nan=False), encoding="utf-8"
)
print("\n[OK] wrote", OUT_DIR / "ragas_report.json")
print("[OK] wrote", OUT_DIR / "ragas_details.csv")


[info] using offline hashed embeddings (no OpenAI)


Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt n_l_i_statement_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt context_precision_prompt failed to parse output: The output parser failed to 