### **Root + prompts check**

In [1]:
from pathlib import Path
import sys, os, platform

CWD  = Path.cwd().resolve()
ROOT = CWD if (CWD / "src").exists() else CWD.parent
if str(ROOT) not in sys.path: sys.path.append(str(ROOT))

print("ROOT:", ROOT)
print("Python:", platform.python_version())

ROOT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent
Python: 3.11.13


In [2]:
PROMPTS = ROOT / "configs" / "prompts"
PROMPTS.mkdir(parents=True, exist_ok=True)

need = {
    "system.txt": """<PASTE THE system.txt CONTENT FROM THE ANSWER>""",
    "style_rules.txt": """<PASTE THE style_rules.txt CONTENT FROM THE ANSWER>""",
    "answer_with_citations.txt": """<PASTE THE answer_with_citations.txt CONTENT FROM THE ANSWER>""",
}

for name, txt in need.items():
    fp = PROMPTS / name
    if not fp.exists() or fp.stat().st_size == 0:
        fp.write_text(txt.strip() + "\n", encoding="utf-8")
        print("[created]", fp)
    else:
        print("[ok]", fp)

[ok] D:\IIT BBS\Job Resources\Business Optima\pdf-agent\configs\prompts\system.txt
[ok] D:\IIT BBS\Job Resources\Business Optima\pdf-agent\configs\prompts\style_rules.txt
[ok] D:\IIT BBS\Job Resources\Business Optima\pdf-agent\configs\prompts\answer_with_citations.txt


### **Config sanity**

In [3]:
from src.agent.config import CFG
print("Base model dir     :", CFG.base_model_dir)
print("Adapter dir        :", CFG.adapter_dir)
print("Reranker dir       :", CFG.reranker_dir)
print("Graph dir          :", CFG.graph_dir)
print("Chunks dir         :", CFG.chunks_dir)
print("Sessions dir       :", CFG.sessions_dir)
print("SQLite path        :", CFG.sqlite_path)
print("Retrieval knobs    :", CFG.k_nodes, CFG.k_final_nodes, CFG.k_each_node, CFG.k_final_chunks)
print("Gen knobs          :", CFG.max_new_tokens, CFG.temperature)

Base model dir     : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\models\Qwen2.5-1.5B-Instruct
Adapter dir        : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\lora_hf\title17\adapter
Reranker dir       : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\reranker\title17
Graph dir          : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\graph\graph
Chunks dir         : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks
Sessions dir       : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\sessions
SQLite path        : D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\logs\agent.sqlite
Retrieval knobs    : 40 6 12 6
Gen knobs          : 320 0.1


### **Policy checks**

In [4]:
from src.agent.policy import guard_title17_scope

tests = [
    "write a python script to scrape facebook",
    "What does §107 say about fair use?",
    "explain patents in india",
    "In Title 17, what’s the compulsory license rule in §115?"
]
for t in tests:
    gd = guard_title17_scope(t)
    print(f"{t[:45]:45} -> allow={gd.allow} ({gd.reason})")

write a python script to scrape facebook      -> allow=False (out-of-scope)
What does §107 say about fair use?            -> allow=True (matched copyright lexicon)
explain patents in india                      -> allow=False (out-of-scope)
In Title 17, what’s the compulsory license ru -> allow=True (matched title17 pattern)


### **Retrieval sanity (BM25→CE)**

In [5]:
from src.agent.retriever import HierBM25CEReranker

retr = HierBM25CEReranker()
q = "Summarize § 114 performance rights caveat. End with [pp. 67–88]."
hits = retr.search(q, k_nodes=40, k_final_nodes=6, k_each_node=12, k_final_chunks=5)
print("hits:", len(hits))
for i, h in enumerate(hits, 1):
    snip = (h["text"][:300] + "…") if len(h["text"]) > 300 else h["text"]
    print(f"\n#{i} score={h['score']:.3f} node={h['node_id']} chunk={h['chunk_id']} pages={h['pages']}")
    print(snip)

hits: 5

#1 score=0.507 node=SEC-00017 chunk=title17-h-342 pages=[]
§ 114 · Scope of exclusive rights in sound recordings 48

#2 score=-0.890 node=SEC-00019 chunk=title17-h-689 pages=[]
§ 116 · Negotiated licenses for public performances by means of coin-operated phonorecord players 53

#3 score=-1.469 node=SEC-00016 chunk=title17-h-328 pages=[]
§ 113 · Scope of exclusive rights in pictorial, graphic, and sculptural works 47

#4 score=-1.862 node=SEC-00015 chunk=title17-h-307 pages=[]
§ 112 · Limitations on exclusive rights: Ephemeral recordings 46

#5 score=-1.915 node=SEC-00013 chunk=title17-h-220 pages=[]
§ 110 · Limitations on exclusive rights: Exemption of certain performances and displays 43


### **Core LLM streaming (closed-book+contexts)**

In [None]:
from src.agent.llms import load_core_llm, stream_generate
from src.agent.prompts import load_prompts, build_rag_prompt

core = load_core_llm()
pb = load_prompts()

contexts = hits[:3]
prompt = build_rag_prompt("What does §107 say about fair use? End with [pp. 40–41].", contexts, pb)

print("---- streaming ----\n")
for tok in stream_generate(core, prompt, max_new_tokens=220, temperature=0.0):
    print(tok, end="", flush=True)
print("\n\n[done]")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


---- streaming ----

 Answer:
A performance or display of a work under subsection (a)(1), (2), (3), (5), or (7) is fair use if - 

- The primary purpose or character of the use, including whether the use is of commercial advantage, and 
- The amount and significance of the portion used relative to the whole.

[done]


### **Reinit DB - logger bootstrap + event test**

In [7]:
from src.agent.config import CFG
from src.agent.logger import EventLogger
import os

if os.path.exists(CFG.sqlite_path):
    os.remove(CFG.sqlite_path)
elog = EventLogger()
elog.log_event("bootstrap", {"note": "fresh init"})
print("[db] reinitialized:", elog.path)

[db] reinitialized: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\outputs\logs\agent.sqlite


### **One streaming turn**

In [8]:
import uuid
from src.agent.orchestrator import Title17Agent

async def run_once(question: str):
    agent = Title17Agent()
    session_id = f"sess-{uuid.uuid4().hex[:8]}"
    print("[session]", session_id)
    print("[user]", question, "\n")
    final = []
    async for ev in agent.achat_stream(session_id, question):
        if ev["type"] == "token":
            print(ev["text"], end="", flush=True)
            final.append(ev["text"])
        elif ev["type"] == "final":
            print("\n\n[FINAL]\n", ev["text"])
            if ev.get("citations"):
                print("\n[CITATIONS]")
                for c in ev["citations"]:
                    print(c)
        elif ev["type"] == "error":
            print("\n[ERROR]", ev["text"])
    return session_id, "".join(final)

await run_once("What does §107 say about fair use? End with [pp. 40–41].")


  llm = ChatOllama(model=CFG.ollama_summarizer, temperature=0.2)
  return ConversationSummaryBufferMemory(


[session] sess-63dbd528
[user] What does §107 say about fair use? End with [pp. 40–41]. 

 Answer:
The fair use doctrine allows reproduction of copyrighted works under certain conditions, including transmission of instructional materials to students. Reproduction from certain classes, including law classes, is specifically covered (§104A(1)(B)). The purpose of reproduction must be factored into a subjective determination of whether fair use was made (§106(3)).

Citations:
- [pp. 31–33]

[FINAL]
 Answer:
The fair use doctrine allows reproduction of copyrighted works under certain conditions, including transmission of instructional materials to students. Reproduction from certain classes, including law classes, is specifically covered (§104A(1)(B)). The purpose of reproduction must be factored into a subjective determination of whether fair use was made (§106(3)).

Citations:
- [pp. 31–33]

[CITATIONS]
{'chunk_id': 'title17-h-193', 'node_id': 'SEC-00010', 'section': '', 'pages': [], 'sco

('sess-63dbd528',
 ' Answer:\nThe fair use doctrine allows reproduction of copyrighted works under certain conditions, including transmission of instructional materials to students. Reproduction from certain classes, including law classes, is specifically covered (§104A(1)(B)). The purpose of reproduction must be factored into a subjective determination of whether fair use was made (§106(3)).\n\nCitations:\n- [pp. 31–33]')

### **Sessions + DB contents**

In [9]:
import json, sqlite3
from src.agent.config import CFG

sess_files = sorted(CFG.sessions_dir.glob("sess-*.json"))
print("sessions:", [f.name for f in sess_files][-3:])
if sess_files:
    data = json.loads(sess_files[-1].read_text(encoding="utf-8"))
    print("\n[last session sample]")
    print(json.dumps(data, indent=2)[:1000])

con = sqlite3.connect(CFG.sqlite_path)
cur = con.cursor()
for table in ["conversations", "messages", "events"]:
    try:
        row = cur.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
        print(f"{table}: {row[0]} rows")
    except Exception as e:
        print(f"{table}: (not found) {e}")

print("\n[last 5 events]")
try:
    for r in cur.execute("SELECT ts,event,substr(payload,1,120) FROM events ORDER BY id DESC LIMIT 5"):
        print(r)
except Exception as e:
    print("(no events) ->", e)
con.close()


sessions: ['sess-63dbd528.json', 'sess-defdc095.json']

[last session sample]
{
  "session_id": "sess-defdc095",
  "created": 1756032209.200745,
  "messages": [
    {
      "ts": 1756032209.200745,
      "role": "user",
      "content": "What does \u00a7107 say about fair use? End with [pp. 40\u201341]."
    }
  ]
}
conversations: 1 rows
messages: 2 rows
events: 2 rows

[last 5 events]
(1756032824.9794493, 'retrieve_done', '{"session_id": "sess-63dbd528", "n_hits": 6}')
(1756032818.9491704, 'bootstrap', '{"note": "fresh init"}')


### **DB state inspection**

In [10]:
import json, sqlite3
from src.agent.config import CFG

con = sqlite3.connect(CFG.sqlite_path)
cur = con.cursor()
for table in ["conversations", "messages", "events"]:
    row = cur.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
    print(f"{table}: {row[0]} rows")

print("\n[last 5 events]")
for r in cur.execute("SELECT ts,event,substr(payload,1,160) FROM events ORDER BY id DESC LIMIT 5"):
    print(r)
con.close()

conversations: 1 rows
messages: 2 rows
events: 2 rows

[last 5 events]
(1756032824.9794493, 'retrieve_done', '{"session_id": "sess-63dbd528", "n_hits": 6}')
(1756032818.9491704, 'bootstrap', '{"note": "fresh init"}')


### **Tool smoke test (Pydantic v2 OK)**

In [11]:
from src.agent.tools import SummarizeTool
sumtool = SummarizeTool()
print(sumtool.invoke({"text": hits[0]["text"][:1200], "max_bullets": 4}))

• The exclusive rights granted to copyright owners under the Copyright Act of 1976 include reproduction, distribution, public performance, and adaptation of sound recordings (17 U.S.C. § 114).
• These exclusive rights are limited to the term of copyright, which is typically the life of the author plus 70 years for sound recordings (17 U.S.C. § 302).
• The scope of exclusive rights in sound recordings includes both master recordings and sound recordings themselves, including phonograms and other recordings (17 U.S.C. § 114).
• Copyright owners may enforce their exclusive rights through civil actions, including injunctions, damages, and attorney's fees (17 U.S.C. § 114).
