## **02 - Chunking**

In [9]:
import sys
from pathlib import Path
# Locate the project root: use the current working directory when it contains
# `src/`, otherwise fall back to its parent directory.
_cwd = Path.cwd().resolve()
ROOT = _cwd if (_cwd / "src").exists() else _cwd.parent
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.ingest.md_to_chunks import md_to_chunks
import json, statistics as stats
from rapidfuzz import fuzz
import itertools, json

In [7]:
# Document stem; change it to process another PDF later.
DOC = "title17"

# Input files under data/md and the output path for the generated chunks.
MD_PATH = ROOT.joinpath("data", "md", f"{DOC}.md")
PAGES_JSONL = ROOT.joinpath("data", "md", f"{DOC}.pages.jsonl")
CHUNKS = ROOT.joinpath("data", "chunks", "chunks.jsonl")

# Echo the resolved locations so missing inputs are spotted before chunking.
for label, value in (
    ("ROOT:", ROOT),
    ("MD_PATH exists?", MD_PATH.exists()),
    ("PAGES_JSONL exists?", PAGES_JSONL.exists()),
    ("CHUNKS ->", CHUNKS),
):
    print(label, value)

ROOT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent
MD_PATH exists? True
PAGES_JSONL exists? True
CHUNKS -> D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\chunks\chunks.jsonl


In [None]:
# Chunking options, gathered in one place so they are easy to scan and tweak.
chunking_kwargs = {
    "pages_jsonl": PAGES_JSONL,
    "max_chars": 1600,        # tighter granularity for legal text
    "overlap": 400,
    "drop_gibberish": True,
    "drop_toc": False,        # False keeps table-of-contents chunks in the output
    "min_align_score": 70,    # only assign a page when the fuzzy score is >= 70
}
n = md_to_chunks(MD_PATH, CHUNKS, **chunking_kwargs)
print("chunks:", n)

chunks: 2683


In [12]:
import json, itertools

# Peek at the first 15 chunk records. The context manager closes the file as
# soon as the preview is done; a handle left open can block re-writing
# chunks.jsonl on Windows.
with open(CHUNKS, "r", encoding="utf-8") as f:
    for line in itertools.islice(f, 15):
        print(json.loads(line))

{'id': 'title17-h-1', 'text': 'Copyright Law United States Copyri', 'metadata': {'doc_id': 'title17', 'block_type': 'heading', 'heading_path': ['Copyright Law United States Copyri'], 'page_start': 1, 'page_end': 1}}
{'id': 'title17-2', 'text': 'CopyrigChipytC LyawohCfoipyfo efhgiyUnywdyhSiysofhipyDhChiLyawpi', 'metadata': {'doc_id': 'title17', 'block_type': 'para', 'heading_path': ['Copyright Law United States Copyri'], 'page_start': None, 'page_end': None}}
{'id': 'title17-3', 'text': 'circular 9 2', 'metadata': {'doc_id': 'title17', 'block_type': 'para', 'heading_path': ['Copyright Law United States Copyri'], 'page_start': 484, 'page_end': 484}}
{'id': 'title17-h-4', 'text': 'Copyright Law of the United States', 'metadata': {'doc_id': 'title17', 'block_type': 'heading', 'heading_path': ['Copyright Law United States Copyri', 'Copyright Law of the United States'], 'page_start': 3, 'page_end': 3}}
{'id': 'title17-5', 'text': 'and Related Laws Contained in Title 17 of the United States C

In [13]:
# Collect the text length and the block type of every chunk record.
lengths = []
types = {}
with open(CHUNKS, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        block_type = rec["metadata"]["block_type"]
        lengths.append(len(rec["text"]))
        types[block_type] = types.get(block_type, 0) + 1

print("chunks:", len(lengths))
if lengths:
    ordered = sorted(lengths)
    # Nearest-rank 90th percentile: rank = ceil(0.9 * n), computed with integer
    # arithmetic so that small samples do not round down to a too-low element
    # (flooring would return the minimum for n == 2).
    p90_rank = (9 * len(ordered) + 9) // 10
    print("avg chars:", round(stats.mean(lengths), 1), "| p50:", int(stats.median(lengths)),
          "| p90:", ordered[p90_rank - 1])
else:
    # stats.mean / stats.median raise on empty data; report it instead of crashing.
    print("avg chars: n/a (no chunks)")
print("by type:", types)

chunks: 2683
avg chars: 470.3 | p50: 289 | p90: 1320
by type: {'heading': 440, 'para': 1262, 'list': 957, 'table': 24}


In [14]:
# Reuse the per-page JSONL path from the config cell so this cell follows DOC
# instead of hard-coding the "title17" file name.
PAGES = PAGES_JSONL
# Read one JSON record per non-blank line; the context manager closes the file.
with open(PAGES, "r", encoding="utf-8") as f:
    pages = [json.loads(x) for x in f if x.strip()]
# Map page number -> page text, truncated to the first 4000 characters
# (a missing or null "text" becomes an empty string).
pages_text = {p["page"]: (p.get("text") or "")[:4000] for p in pages}

def score_first400(txt):
    """Return the best fuzzy partial-match score of a chunk's opening text.

    The first 400 characters of ``txt`` (a falsy value is treated as an empty
    string) are compared with ``fuzz.partial_ratio`` against every page text in
    the module-level ``pages_text`` mapping. The highest score is returned; the
    result is never below ``0``, which is also the value when there are no pages.
    """
    head = txt[:400] if txt else ""
    page_scores = [fuzz.partial_ratio(head, page_body) for page_body in pages_text.values()]
    # Seeding with 0 mirrors a running maximum that starts at zero.
    return max([0, *page_scores])

# Score the first 200 chunks against the page texts. The context manager
# closes the chunk file once the sample has been read.
with open(CHUNKS, "r", encoding="utf-8") as f:
    scores = [
        score_first400(json.loads(line)["text"])
        for line in itertools.islice(f, 200)
    ]

if scores:
    # Sort once and read both order statistics from the same list.
    ranked = sorted(scores)
    print("align p50:", ranked[len(ranked)//2], "p10:", ranked[len(ranked)//10])
else:
    # Indexing an empty list would raise IndexError; report it instead.
    print("align: no chunks to score")


align p50: 98.0 p10: 93.47826086956522
