In [1]:
# 0) bootstrap
import os, sys
from pathlib import Path

ROOT = Path.cwd()
while not (ROOT / "pyproject.toml").exists() and ROOT != ROOT.parent:
    ROOT = ROOT.parent
os.chdir(ROOT)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [2]:
from pathlib import Path
import json, collections, itertools

doc_id = "NFS_2019"
chunks_path = Path("data/artifacts/NFS_2019/chunks") / f"{doc_id}.chunks.jsonl"

# peek a handful of rows to see the schema
with chunks_path.open("r", encoding="utf-8") as f:
    sample_lines = list(itertools.islice(f, 20))

rows = [json.loads(x) for x in sample_lines]
print("Sample rows:", len(rows))

# show top-level keys & metadata keys
top_keys = collections.Counter()
meta_keys = collections.Counter()
for r in rows:
    top_keys.update(r.keys())
    md = r.get("metadata") or r.get("meta") or {}
    if isinstance(md, dict):
        meta_keys.update(md.keys())

print("Top-level keys:", top_keys)
print("Metadata keys:", meta_keys)

# show a couple of metadata dicts to eyeball nesting
for r in rows[:3]:
    print("ID:", r.get("id"))
    print("metadata:", json.dumps((r.get("metadata") or r.get("meta") or {}), indent=2)[:800])
    print("---")

Sample rows: 20
Top-level keys: Counter({'id': 20, 'text': 20, 'metadata': 20})
Metadata keys: Counter({'doc_id': 20, 'block_type': 20, 'heading_path': 20, 'page_start': 20, 'page_end': 20})
ID: NFS_2019-1
metadata: {
  "doc_id": "NFS_2019",
  "block_type": "para",
  "heading_path": [],
  "page_start": 1,
  "page_end": 1
}
---
ID: NFS_2019-2
metadata: {
  "doc_id": "NFS_2019",
  "block_type": "para",
  "heading_path": [],
  "page_start": 1,
  "page_end": 1
}
---
ID: NFS_2019-h-3
metadata: {
  "doc_id": "NFS_2019",
  "block_type": "heading",
  "heading_path": [
    "FEE SCHEDULE"
  ],
  "page_start": 1,
  "page_end": 1
}
---


In [None]:
from pathlib import Path
from packages.calculators.miner import MinerConfig, MinerLLMConfig, MinerHeuristics, mine_from_chunks

doc_id = "NFS_2019"
cfg = MinerConfig(
    use_llm=True,
    llm=MinerLLMConfig(base_url="http://localhost:11434", model="llama3.2:latest", temperature=0.1, max_new_tokens=384, retries=1),
    heur=MinerHeuristics(),
    max_md_chars_per_page=6000,
    output_root="data/mined",
    verbose=True
)

chunks_path = Path("data/artifacts/NFS_2019/chunks") / f"{doc_id}.chunks.jsonl"
paths = mine_from_chunks(doc_id, artifacts_root=Path("data/artifacts"), chunks_path=chunks_path, pages=None, cfg=cfg)
paths

[miner] doc=NFS_2019 total_rows=13848 pages_in_scope=[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]... total_pages=499


In [4]:
# # 1) choose doc + pages to sample
# doc_id = "NFS_2019"
# sample_pages = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]   # <- change to any pages you want to inspect

# from packages.calculators.miner import MinerConfig, MinerLLMConfig, MinerHeuristics, mine_from_chunks

# # you can toggle use_llm quickly here
# cfg = MinerConfig(
#     use_llm=True,                                   # set False to see pure-heuristic output
#     llm=MinerLLMConfig(
#         base_url="http://localhost:11434",          # your eval_ollama or chat port
#         model="llama3.2:latest",
#         temperature=0.1,
#         max_new_tokens=384,
#         retries=1,
#     ),
#     heur=MinerHeuristics(),
#     max_md_chars_per_page=5000,
#     output_root="data/mined",
# )

# paths = mine_from_chunks(doc_id, Path("data/artifacts"), pages=sample_pages, cfg=cfg)
# paths

In [3]:
# 2) quick peek at each file
import json
for name, p in paths.items():
    print("===", name, p)
    if p.exists():
        lines = p.read_text(encoding="utf-8").splitlines()[:5]
        for ln in lines:
            print("  ", ln[:200])
    print()

=== formulas data\mined\NFS_2019\formulas.jsonl

=== notations data\mined\NFS_2019\notations.jsonl

=== abbreviations data\mined\NFS_2019\abbreviations.jsonl

=== tables data\mined\NFS_2019\tables.jsonl



In [4]:
# 3) load into DataFrames for eyeballing
import pandas as pd, json

dfs = {}
for name, p in paths.items():
    if p.exists():
        rows = [json.loads(x) for x in p.read_text(encoding="utf-8").splitlines()]
        dfs[name] = pd.DataFrame(rows)
        print(name, len(rows))
        display(dfs[name].head())


formulas 0


notations 0


abbreviations 0


tables 0


In [6]:
from pathlib import Path
import json, collections

chunks_path = Path("data/artifacts/NFS_2019/chunks/NFS_2019.chunks.jsonl")
rows = [json.loads(x) for x in chunks_path.read_text(encoding="utf-8").splitlines()]
counter = collections.Counter(int((r.get("metadata") or {}).get("page", -1)) for r in rows)
print("Pages present:", sorted([p for p in counter if p >= 0])[:50], "... total:", len(counter))
print("Counts sample:", counter.most_common(10))


Pages present: [] ... total: 1
Counts sample: [(-1, 13848)]
