In [9]:
# Bootstrap & imports
import os, sys
from pathlib import Path

def find_project_root(start: Path, marker: str = "pyproject.toml") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory where the upward search begins.
        marker: File name whose presence identifies the project root.

    Returns:
        The first directory (``start`` itself, then each parent in turn) that
        contains ``marker``. If no directory up to the filesystem root has it,
        ``start`` is returned unchanged so the caller never ends up working
        from the filesystem root by accident.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return start


ROOT = find_project_root(Path.cwd())
if not (ROOT / "pyproject.toml").exists():
    # Nothing was found on the way up: stay where we are and say so, rather
    # than silently chdir-ing to the filesystem root.
    print("Warning: no pyproject.toml found at or above", ROOT, file=sys.stderr)

# Relative paths in later cells (configs/, data/) are resolved against ROOT.
os.chdir(ROOT)
# Put ROOT on sys.path so that `packages.*` can be imported.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [10]:
from packages.core_config.config import load_yaml
from packages.ingest.profiler import profile_document
from packages.ingest.pdf2md_pipeline import PDF2MDConfig, run_pdf_to_markdown
from packages.ingest.tables_pipeline import TablesConfig, extract_tables_from_pdf
from packages.ingest.math_pipeline import MathConfig, extract_formulas_from_pages_jsonl
from packages.ingest.chunks import ChunkingConfig, md_to_chunks
from packages.ingest.hierarchy import HierarchyConfig, build_hierarchy
from packages.retriever.indexer import Indexer

In [3]:
# Cell 2 — Configs & paths
# Load the provider and pipeline YAML files into a single config object.
cfg = load_yaml("configs/providers.yaml", "configs/pipelines/generic_legal.yaml")

# Source document; its file stem is reused as the document id for all artifacts.
pdf_path = Path("data/raw/NFS_2019.pdf")
assert pdf_path.exists(), f"Missing PDF: {pdf_path}"
doc_id = pdf_path.stem

# Artifact layout: data/artifacts/<doc_id>/ with md/ and chunks/ below it.
# The graph output directory is the artifact root itself.
art_root = Path("data/artifacts") / doc_id
md_dir, chunks_dir, graph_dir = art_root / "md", art_root / "chunks", art_root
for d in (md_dir, chunks_dir, graph_dir):
    d.mkdir(parents=True, exist_ok=True)

# Optional dry run on a subset of pages before the full run:
# e.g. 10 to test quickly; None processes the full document.
DRY_RUN_PAGES: int | None = None

In [4]:
# Cell 3 — Profile & PDF→MD (pre-TATR path, OCR fallback)
prof = profile_document(pdf_path)
print(f"Profile: {prof.mode} (texty pages ~ {prof.pct_texty_pages:.0%})")

# A fully scanned document goes straight to OCR-only markdown;
# otherwise Marker/Docling get the first attempt.
force_ocr = prof.mode == "scanned"

# The configured OCR DPI is also used as the sampling DPI, but only on a dry run.
ocr_dpi = int(cfg.get("ingest.ocr.dpi", 300))
if DRY_RUN_PAGES:
    sample_dpi = ocr_dpi
else:
    sample_dpi = None

pdf_cfg = PDF2MDConfig(
    # OCR settings
    ocr_enable=bool(cfg.get("ingest.ocr.enable", True)),
    ocr_dpi=ocr_dpi,
    ocr_lang=str(cfg.get("ingest.ocr.lang", "eng")),
    min_chars_no_ocr=int(cfg.get("ingest.ocr.min_chars_no_ocr", 60)),
    # Markdown conversion settings
    preserve_footnotes=bool(cfg.get("ingest.pdf2md.preserve_footnotes", True)),
    keep_figure_captions=bool(cfg.get("ingest.pdf2md.keep_figure_captions", True)),
    force_ocr_only=force_ocr,
    # Dry-run sampling (inactive when DRY_RUN_PAGES is None)
    sample_n_pages=DRY_RUN_PAGES,
    sample_random_seed=123,
    sample_ocr_dpi=sample_dpi,
)

outs = run_pdf_to_markdown(
    doc_id=doc_id,
    pdf_path=pdf_path,
    out_dir=md_dir,
    cfg=pdf_cfg,
)
# Report the locations of the three files produced by the conversion.
print("MD:", outs.markdown_path)
print("TOC:", outs.toc_json_path)
print("Pages:", outs.pages_jsonl_path)


Profile: scanned (texty pages ~ 0%)
MD: data\artifacts\NFS_2019\md\NFS_2019.md
TOC: data\artifacts\NFS_2019\md\NFS_2019.toc.json
Pages: data\artifacts\NFS_2019\md\NFS_2019.pages.jsonl


In [5]:
# Cell 4 — Tables (words/Paddle only; TATR disabled)
# The words-grid fallback is the default (robust, no extra models).
# To try PaddleOCR (must be installed), enable ingest.tables.paddle.enable in the config.
# NOTE(review): the recorded output of this cell shows a table-transformer
# checkpoint being downloaded, so "TATR disabled" may be stale — confirm.
tables_options = dict(
    ocr_dpi=int(cfg.get("ingest.ocr.dpi", 300)),
    export_markdown=bool(cfg.get("ingest.tables.export_markdown", True)),
    max_cols=int(cfg.get("ingest.tables.words.max_cols", 6)),
    merge_wrap_lines=bool(cfg.get("ingest.tables.words.merge_wrap_lines", True)),
    min_row_height=float(cfg.get("ingest.tables.words.min_row_height", 6.0)),
    # Keep False when Paddle is not installed.
    use_paddle=bool(cfg.get("ingest.tables.paddle.enable", False)),
)
tcfg = TablesConfig(**tables_options)

saved = extract_tables_from_pdf(
    doc_id=doc_id,
    pdf_path=pdf_path,
    out_dir=art_root,
    cfg=tcfg,
    # Restrict extraction to the pages the PDF→MD step processed (honours a dry run).
    page_subset=outs.processed_pages,
)
print("Tables saved:", saved, "→", art_root / "tables")

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
You are using a model of type table-transformer to instantiate a model of type detr. This is not supported for all configurations of models and can yield errors.


preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

You are using a model of type table-transformer to instantiate a model of type detr. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

Tables saved: 0 → data\artifacts\NFS_2019\tables


In [6]:
# Cell 5 — Math (formula-ish text lines)
# Detection thresholds come from the pipeline config, with notebook defaults.
min_digits = int(cfg.get("ingest.math.min_digits", 2))
require_operator = bool(cfg.get("ingest.math.require_operator", True))
mcfg = MathConfig(min_digits=min_digits, require_operator=require_operator)

# Scan the per-page JSONL written by the PDF→MD step; results go under art_root.
n_formulas = extract_formulas_from_pages_jsonl(
    doc_id=doc_id,
    pages_jsonl=outs.pages_jsonl_path,
    out_dir=art_root,
    cfg=mcfg,
)
print("Formulas detected:", n_formulas, "→", art_root / "math")


Formulas detected: 6336 → data\artifacts\NFS_2019\math


In [7]:
# Cell 6 — Chunking & Hierarchy

# --- Chunking: markdown → JSONL chunks ---
chunk_cfg = ChunkingConfig(
    max_chars=int(cfg.get("chunking.max_chars", 1200)),
    overlap=int(cfg.get("chunking.overlap", 200)),
    drop_gibberish=bool(cfg.get("chunking.drop_gibberish", True)),
    drop_toc=bool(cfg.get("chunking.drop_toc", True)),
    min_align_score=int(cfg.get("chunking.min_align_score", 70)),
)
chunks_out = chunks_dir / (doc_id + ".chunks.jsonl")
n_chunks = md_to_chunks(
    doc_id=doc_id,
    md_path=outs.markdown_path,
    out_path=chunks_out,
    cfg=chunk_cfg,
    pages_jsonl=outs.pages_jsonl_path,
)
print(f"Chunks written: {n_chunks} → {chunks_out}")

# --- Hierarchy: built from everything in chunks_dir, written under graph_dir ---
hcfg = HierarchyConfig(
    min_node_chars=int(cfg.get("hierarchy.min_node_chars", 400)),
    use_section_regex=bool(cfg.get("hierarchy.use_section_regex", True)),
    fold_tiny_into_misc=bool(cfg.get("hierarchy.fold_tiny_into_misc", True)),
)
build_hierarchy(chunks_dir=chunks_dir, out_dir=graph_dir, cfg=hcfg)
print("Hierarchy written to:", (graph_dir / "graph"))


Chunks written: 13848 → data\artifacts\NFS_2019\chunks\NFS_2019.chunks.jsonl
Hierarchy written to: data\artifacts\NFS_2019\graph


In [13]:
# --- Notebook hotfix: chunk Chroma .add() without touching files ---
from packages.providers.vector import chromadb as _c

def _chunked_add(self, collection_name, ids, docs, metas, embs):
    coll = self.get_or_create(collection_name)
    batch = int(getattr(self, "max_add_batch", 2000))
    n = len(ids)
    for i in range(0, n, batch):
        j = min(i + batch, n)
        coll.add(
            ids=ids[i:j],
            documents=docs[i:j],
            metadatas=metas[i:j],
            embeddings=embs[i:j],
        )

# Monkey patch: every ChromaVectorStore now inserts through _chunked_add.
_c.ChromaVectorStore.add = _chunked_add

# Batch size for the patched add(), taken from config (default 2000).
vs_max = int(cfg.get("index.vector.max_add_batch", 2000))

# NOTE(review): vp, persist, embed_m and bge_prompt are not defined in any
# cell visible in this notebook; on a fresh kernel this raises NameError.
# Confirm where they are meant to come from.
indexer_options = {
    "vector_provider": vp,
    "persist_path": persist,
    "embed_model_or_path": embed_m,
    "bge_use_prompt": bge_prompt,
}
indexer = Indexer(**indexer_options)

# The limit is attached to the store instance once the Indexer exists,
# because _chunked_add reads it from `self`.
indexer.vs.max_add_batch = vs_max

# Build the index from the chunk file. collection_name is left as None here;
# reset=True is passed on every run.
collection = indexer.build(
    chunks_path=chunks_out,
    collection_name=None,
    reset=True,
)
print("Collection:", collection)


Collection: NFS_2019


In [14]:
# Cell 8 — Quick artifact heartbeat
from glob import glob, iglob

def count_lines(p: Path) -> int:
    """Return the number of text lines in the file at ``p``.

    The file is read as UTF-8 with undecodable bytes replaced, so one bad
    byte does not turn a populated file into a reported count of 0.

    Args:
        p: Path of the file to count.

    Returns:
        The line count, or 0 when the file cannot be opened or read
        (for example because it is missing or unreadable).
    """
    try:
        with p.open("r", encoding="utf-8", errors="replace") as f:
            return sum(1 for _ in f)
    except OSError:
        # Best-effort heartbeat: an unreadable or missing file counts as empty.
        return 0

# Existence and size summary for every artifact produced by the cells above.
tables_dir = art_root / "tables"
math_dir = art_root / "math"
hierarchy_json = graph_dir / "graph" / "hierarchy.json"

print("\n=== Artifact heartbeat ===")
print("MD exists:", outs.markdown_path.exists())
print("TOC:", outs.toc_json_path.exists())
print("Pages:", outs.pages_jsonl_path.exists(), "lines:", count_lines(outs.pages_jsonl_path))
# Only entries whose name matches '*.*' are counted in the two directories.
print("Tables dir:", tables_dir.exists(), "files:", sum(1 for _ in iglob(str(tables_dir / '*.*'))))
print("Math dir:", math_dir.exists(), "files:", sum(1 for _ in iglob(str(math_dir / '*.*'))))
print("Chunks:", chunks_out.exists(), "lines:", count_lines(chunks_out))
print("Graph:", hierarchy_json.exists())



=== Artifact heartbeat ===
MD exists: True
TOC: True
Pages: True lines: 514
Tables dir: True files: 0
Math dir: True files: 1
Chunks: True lines: 13848
Graph: True
