In [1]:
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import json
import fitz  # PyMuPDF

In [2]:
# import text, image, and table extractor functions

from parser_extractors import (
    extract_text_blocks,
    extract_image_blocks,
    extract_tables_for_page,
)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [5]:
def extract_pdf(
    pdf_path: str,
    out_dir: str,
    *,
    export_images: bool = True,
    raster_fallback: bool = True,
    raster_dpi: int = 220,
    pages: Optional[List[int]] = None,  # 1-based page numbers; None = all
) -> Path:
    """
    Extract TEXT, IMAGES, TABLES for all (or selected) pages into one JSON.

    Everything is written below ``<out_dir>/<pdf stem>/``: the payload
    ``<stem>.raw.json``, a ``tables/`` directory, and an ``images/``
    directory (created only when ``export_images`` or ``raster_fallback``
    is true).

    Args:
        pdf_path: Path of the PDF to process.
        out_dir: Root output directory; a sub-directory named after the PDF
            stem is created inside it.
        export_images: Together with ``raster_fallback`` decides whether the
            ``images/`` directory is created. NOTE(review): it is not
            forwarded to ``extract_image_blocks`` here -- confirm whether the
            extractor is supposed to honour it.
        raster_fallback: Forwarded to ``extract_image_blocks``.
        raster_dpi: Forwarded to ``extract_image_blocks``.
        pages: 1-based page numbers to process, or ``None`` for every page.
            Numbers outside ``1..page_count`` are dropped silently.

    Returns:
        Path to the JSON payload.

    Raises:
        ValueError: If the PDF has no pages, or if ``pages`` contains no
            page number that exists in the document.
    """
    pdf_path = str(pdf_path)
    stem = Path(pdf_path).stem
    workdir = Path(out_dir) / stem
    img_dir = workdir / "images"
    tables_dir = workdir / "tables"

    # create dirs (images dir optional)
    workdir.mkdir(parents=True, exist_ok=True)
    tables_dir.mkdir(parents=True, exist_ok=True)
    if export_images or raster_fallback:
        img_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(pdf_path)
    # Everything that touches `doc` lives inside try/finally so the document
    # handle is released exactly once, even when an extractor raises midway.
    try:
        if doc.page_count == 0:
            raise ValueError("Empty PDF.")

        # page list (1-based)
        if pages is None:
            page_indices = list(range(1, doc.page_count + 1))
        else:
            # keep only page numbers that exist in this document
            page_indices = [p for p in pages if 1 <= p <= doc.page_count]
            if not page_indices:
                raise ValueError("No valid pages to process.")

        all_texts: List[Dict[str, Any]] = []
        all_pictures: List[Dict[str, Any]] = []
        all_tables: List[Dict[str, Any]] = []
        per_page_stats: List[Dict[str, Any]] = []

        for pno in page_indices:
            page = doc.load_page(pno - 1)  # load_page is 0-based
            pdict = page.get_text("dict")

            # --- TEXT ---
            texts = extract_text_blocks(page, pno)
            all_texts.extend(texts)

            # --- TABLES ---
            # The table stats are returned but not used in this function.
            tables, _tstats = extract_tables_for_page(
                pdf_path=pdf_path,
                pno=pno,
                workdir=tables_dir,
                stem=stem,
                page_texts=texts,  # the page's texts are passed so captions can be looked up
                page=page,
                flavors=("lattice", "stream"),
                save_csv=True,
            )
            all_tables.extend(tables)

            # --- IMAGES ---
            pictures, istats = extract_image_blocks(
                doc,
                page,
                pdict,
                pno=pno,
                img_dir=img_dir,
                stem=stem,
                raster_fallback=raster_fallback,
                raster_dpi=raster_dpi,
            )
            all_pictures.extend(pictures)

            # quick console summary per page
            print(
                f"[p{pno}] text={len(texts)} | figures={istats['figures']} "
                f"(with_caption={istats['with_caption']}) | "
                f"xobjs={istats['xobjects_found']}(exp={istats['xobjects_exported']}) "
                f"| tables={len(tables)}"
            )

            per_page_stats.append({
                "page": pno,
                "text_blocks": len(texts),
                **istats,
                "tables_found": len(tables),
            })

        # --- Build one JSON payload (while the document is still open) ---
        meta = doc.metadata or {}
        # Authors may be separated by ';' or ','; normalise to ',' before splitting.
        raw_authors = (meta.get("author") or "").replace(";", ",")
        payload = {
            "title": meta.get("title") or None,
            "authors": [a.strip() for a in raw_authors.split(",") if a.strip()],
            "source_path": pdf_path,
            "page_count": doc.page_count,
            "processed_pages": page_indices,
            "counts": {
                "texts": len(all_texts),
                "pictures": len(all_pictures),
                "tables": len(all_tables),
            },
            "stats_per_page": per_page_stats,
            "texts": all_texts,
            "pictures": all_pictures,
            "tables": all_tables,
        }
    finally:
        doc.close()

    out_json = workdir / f"{stem}.raw.json"
    out_json.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] wrote {out_json}")
    return out_json


In [6]:
# Single-document run: both paths are relative to the notebook's working directory.
pdf = "BLIP.pdf"
out = "output"
# Results land in <out>/<pdf stem>/; as the cell's last expression, the
# returned JSON path is displayed as the cell output.
extract_pdf(pdf, out, export_images=True)

Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value


[p1] text=22 | figures=0 (with_caption=0) | xobjs=3(exp=3) | tables=1


Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value


[p2] text=34 | figures=0 (with_caption=0) | xobjs=1(exp=1) | tables=1
[p3] text=16 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p4] text=40 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p5] text=34 | figures=0 (with_caption=0) | xobjs=3(exp=3) | tables=1
[p6] text=26 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p7] text=46 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p8] text=26 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p9] text=25 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p10] text=25 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1
[p11] text=20 | figures=0 (with_caption=0) | xobjs=0(exp=0) | tables=1


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


[p12] text=31 | figures=0 (with_caption=0) | xobjs=6(exp=6) | tables=1
[done] wrote output/BLIP/BLIP.raw.json


PosixPath('output/BLIP/BLIP.raw.json')

In [None]:
# Batch mode (disabled; uncomment to run): parses every PDF in ../data/pdf and stores one JSON per document under ../data/parsed_documents

# pdf_folder = Path("../data/pdf")     
# out_dir = "../data/parsed_documents"

# for pdf_path in pdf_folder.glob("*.pdf"):
#     print(f"\nProcessing {pdf_path.name} ...")
#     try:
#         extract_pdf(str(pdf_path), out_dir, export_images=True)
#     except Exception as e:
#         print(f"Failed on {pdf_path.name}: {e}")