In [1]:
# ================================
# Cell 1 — Bootstrap repo
# ================================
import os, sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a pyproject.toml.
_start_dir = Path.cwd()
ROOT = next(
    (d for d in (_start_dir, *_start_dir.parents) if (d / "pyproject.toml").exists()),
    None,
)
if ROOT is None:
    # No marker found anywhere above us. Stay where we are instead of silently
    # chdir-ing to the filesystem/drive root, which would break every relative
    # path used by the following cells.
    ROOT = _start_dir
    print("Warning: no pyproject.toml found above", _start_dir, "- using the current directory")

# Make relative paths and project imports resolve from the root.
os.chdir(ROOT)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [2]:
# ================================
# Cell 2 — Config
# ================================
from pathlib import Path

# Source document; its stem doubles as the document id in artifact paths.
pdf_path = Path("data/raw/NFS_2019.pdf")
assert pdf_path.exists(), f"Missing PDF: {pdf_path}"
doc_id = pdf_path.stem

# Pages to process: 1-based, both ends inclusive.
FIRST_PAGE, LAST_PAGE = 1, 20

# Render resolution. Lower values are faster on CPU; raise it if needed.
DPI = 180

# Per-document artifact directory, created if it does not exist yet.
out_dir = Path("data/artifacts") / doc_id / "vlm_md"
out_dir.mkdir(parents=True, exist_ok=True)

# Ollama generate endpoint and the vision model tag to request.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llava:7b"

# Instruction text sent with every page image (strict output schema).
PROMPT = """You are an OCR-to-Markdown converter.

Convert the scanned page image into STRICT Markdown with these rules:
- Output ONLY Markdown. No explanations, no code fences.
- Use '## ' for headings.
- Represent tables using GitHub pipe tables:
  | Col A | Col B |
  | ---   | ---   |
  | ...   | ...   |
- Inline math as $...$ and display math as $$...$$ (LaTeX).
- For notations / abbreviations / definitions, emit blocks:

:::notation
TERM: explanation
ABBR: expansion
:::

- Preserve bullet/numbered lists when obvious.
- Do not hallucinate content; if something is unclear, leave blank cells.
- Keep original reading order (left → right, top → bottom).

Page:
"""

# Echo the effective configuration so the run is self-describing.
print(f"PDF: {pdf_path}")
print(f"Output dir: {out_dir}")
print(f"Ollama model: {OLLAMA_MODEL}")

PDF: data\raw\NFS_2019.pdf
Output dir: data\artifacts\NFS_2019\vlm_md
Ollama model: llava:7b


In [3]:
# ================================
# Cell 3 — Helpers
# ================================
import io, base64, json, time, requests
import fitz  # PyMuPDF
from PIL import Image
from typing import Dict, Any

def ollama_up(url: str = "http://localhost:11434") -> bool:
    """Return True when a server at ``url`` answers ``GET /api/tags`` successfully.

    Args:
        url: Base URL of the server, without any ``/api/...`` path. A trailing
            slash is tolerated.

    Returns:
        True if the request completes with a non-error HTTP status. False on a
        connection failure, a timeout (2 seconds), an invalid URL, or a
        4xx/5xx response.
    """
    try:
        resp = requests.get(f"{url.rstrip('/')}/api/tags", timeout=2)
    except requests.RequestException:
        return False
    # Getting *a* reply is not enough: an unrelated service listening on the
    # same port would also answer, typically with an error status.
    return resp.ok

def render_page_image(doc: fitz.Document, page_no: int, dpi: int = 180) -> Image.Image:
    """Render a 1-based page of ``doc`` to a PIL RGB image.

    Args:
        doc: Open PyMuPDF document.
        page_no: Page number, 1-based.
        dpi: Resolution passed to ``Page.get_pixmap``.

    Returns:
        The rendered page as a PIL image in RGB mode.

    Raises:
        ValueError: If ``page_no`` is outside ``1..doc.page_count``.
    """
    # Without this check page_no=0 would become index -1, which PyMuPDF
    # resolves to the LAST page, silently rendering the wrong page.
    if not 1 <= page_no <= doc.page_count:
        raise ValueError(
            f"page_no must be between 1 and {doc.page_count}, got {page_no}"
        )
    page = doc.load_page(page_no - 1)
    pix = page.get_pixmap(dpi=dpi)
    # Round-trip through PNG bytes to hand the pixmap over to PIL.
    return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

def image_to_base64(img: Image.Image) -> str:
    """Serialize ``img`` as PNG and return those bytes as a base64 text string."""
    with io.BytesIO() as stream:
        img.save(stream, format="PNG")
        png_bytes = stream.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")

def _strip_leading_label(text: str) -> str:
    """Remove a leading 'assistant:' / 'system:' tag or a bare 'markdown' label line."""
    head, sep, rest = text.partition("\n")
    stripped_head = head.strip()
    lowered = stripped_head.lower()
    for tag in ("assistant:", "system:"):
        if lowered.startswith(tag):
            # Keep whatever follows the tag on the same line: it may be real content.
            remainder = stripped_head[len(tag):].strip()
            return (remainder + sep + rest).strip() if remainder else rest.lstrip()
    # Only a line that is nothing but the label is dropped, so a sentence that
    # merely begins with the word "Markdown" survives.
    if lowered in ("markdown", "markdown:"):
        return rest.lstrip()
    return text


def _unwrap_code_fence(text: str) -> str:
    """Remove one ``` fence (with optional language tag) that wraps the whole text."""
    if not (text.startswith("```") and text.endswith("```")):
        return text
    if len(text) < 6:
        # Opening and closing markers overlap: the text is backticks only.
        return ""
    inner = text[3:-3]
    if "```" in inner:
        # Several fences: they delimit separate blocks inside the page rather
        # than wrapping it, so stripping the outer backticks would corrupt them.
        return text
    first, sep, rest = inner.partition("\n")
    # The remainder of the opening line is either empty or a language tag
    # such as "markdown" / "md"; anything else is treated as content and kept.
    if sep and (not first.strip() or first.strip().isalnum()):
        inner = rest
    return inner.strip()


def sanitize_markdown(md: str) -> str:
    """Clean up raw model output so that only the Markdown payload remains.

    Steps, in order:
      1. Treat ``None`` as empty and trim surrounding whitespace.
      2. Drop a leading ``assistant:`` / ``system:`` role tag (content after
         the tag on the same line is kept) or a bare ``markdown`` label line.
      3. Unwrap a single code fence enclosing the whole text, including its
         language tag. Text containing several fenced blocks is left intact.
      4. Repeat step 2, for a label that was inside the fence.

    Args:
        md: Raw text returned by the model; may be ``None`` or empty.

    Returns:
        The cleaned Markdown string (possibly empty).
    """
    text = (md or "").strip()
    text = _strip_leading_label(text)
    text = _unwrap_code_fence(text)
    return _strip_leading_label(text)

def md_quality(md: str) -> Dict[str, Any]:
    """Compute simple content flags for one page of Markdown.

    Args:
        md: Markdown text of the page; ``None`` is treated as empty.

    Returns:
        A dict with these keys:
          - ``chars``: length of the text.
          - ``has_table``: a line containing ``|`` is directly followed by a
            pipe-table delimiter row (e.g. ``| --- | --- |``), anywhere in the text.
          - ``has_inline_math``: a ``$...$`` pair on one line whose content does
            not start or end with whitespace and whose closing ``$`` is not
            followed by a digit. Lone currency amounts do not count, and the
            ``$$`` of display math does not count either.
          - ``has_display_math``: a ``$$...$$`` pair (may span lines).
          - ``has_notation``: the text contains ``:::notation`` (case-insensitive).
    """
    import re

    md = md or ""
    lines = md.splitlines()

    # Delimiter row of a pipe table: cells made of dashes with optional colons.
    # A plain horizontal rule ("---") has the same shape, so a pipe is also required.
    delimiter_row = re.compile(r"^\s*\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?\s*$")
    has_table = any(
        "|" in header and "|" in rule and delimiter_row.match(rule) is not None
        for header, rule in zip(lines, lines[1:])
    )

    inline_math = re.compile(r"(?<![\\$])\$(?=[^\s$])[^$\n]*?(?<=[^\s\\$])\$(?![\d$])")
    display_math = re.compile(r"\$\$.+?\$\$", flags=re.DOTALL)

    return {
        "chars": len(md),
        "has_table": bool(has_table),
        "has_inline_math": inline_math.search(md) is not None,
        "has_display_math": display_math.search(md) is not None,
        "has_notation": ":::notation" in md.lower(),
    }

def vlm_ollama_page_to_md(img: Image.Image,
                          model: str = OLLAMA_MODEL,
                          url: str = OLLAMA_URL,
                          prompt: str = PROMPT,
                          timeout: int = 600,
                          retries: int = 2) -> str:
    """Send one page image to Ollama's generate endpoint and return cleaned Markdown.

    Args:
        img: Page image; it is PNG-encoded and sent base64-encoded.
        model: Ollama model tag.
        url: Full URL of the generate endpoint.
        prompt: Instruction text sent with the image.
        timeout: Per-request timeout in seconds, passed to ``requests.post``.
        retries: Number of additional attempts after the first one. Values
            below 0 are treated as 0, so at least one attempt is always made.

    Returns:
        The ``response`` field of the JSON reply, passed through
        ``sanitize_markdown``.

    Raises:
        RuntimeError: If no attempt succeeded. The last underlying error is
            attached as ``__cause__``.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [image_to_base64(img)],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_ctx": 4096
        }
    }
    attempts = max(0, retries) + 1
    last_err = None
    for attempt in range(attempts):
        try:
            resp = requests.post(url, json=payload, timeout=timeout)
            resp.raise_for_status()
            text = resp.json().get("response", "")
            return sanitize_markdown(text)
        except Exception as e:
            last_err = e
            # HTTP 4xx means the request itself is wrong (bad model tag, bad
            # payload): repeating it cannot help and each try may be slow.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status is not None and 400 <= status < 500:
                break
            # Linear back-off between attempts; no pointless sleep after the last.
            if attempt < attempts - 1:
                time.sleep(1.0 * (attempt + 1))
    raise RuntimeError(
        f"Ollama call failed after {attempt + 1} tries: {last_err}"
    ) from last_err


In [None]:
# ================================
# Cell 4 — Run pages → page_X_ollama.md
# ================================
# Probe the same server that OLLAMA_URL targets (drop the "/api/..." path),
# so the check stays valid if the endpoint is moved off localhost.
ollama_base = OLLAMA_URL.split("/api/", 1)[0]
if not ollama_up(ollama_base):
    raise SystemExit(
        f"Ollama is not running on {ollama_base}.\n"
        "Start it with:\n"
        "  ollama serve\n"
        "Ensure the model is pulled:\n"
        f"  ollama pull {OLLAMA_MODEL}\n"
    )

# `doc` is intentionally left open: it is a notebook-level variable that
# later cells may still reference.
doc = fitz.open(str(pdf_path))
manifest_fp = out_dir / "page_manifest.jsonl"

MIN_GOOD_CHARS = 40   # pages shorter than this are likely empty/failed transcriptions

total = 0             # pages whose Markdown file was written
good = 0              # written pages with at least MIN_GOOD_CHARS characters
failed_pages = []     # pages for which the Ollama call raised

# The manifest is rewritten from scratch on every run: one JSON line per written page.
with manifest_fp.open("w", encoding="utf-8") as mf:
    # Clamp the start to page 1; a smaller value would address pages from the
    # end of the document instead of failing.
    for pno in range(max(1, FIRST_PAGE), min(LAST_PAGE, len(doc)) + 1):
        img = render_page_image(doc, pno, dpi=DPI)

        # Optional: also save the PNG for inspection/debug
        png_path = out_dir / f"page_{pno}.png"
        img.save(png_path)

        try:
            md = vlm_ollama_page_to_md(img)
        except Exception as e:
            # Best effort: report, remember the page, and move on to the next one.
            print(f"[Ollama] page {pno} failed:", e)
            failed_pages.append(pno)
            continue

        md_stats = md_quality(md)
        out_md = out_dir / f"page_{pno}_ollama.md"
        out_md.write_text(md, encoding="utf-8")

        mf.write(json.dumps({
            "page": pno,
            "md_path": str(out_md),
            **md_stats
        }) + "\n")
        # A single page can take minutes; flush so finished pages are on disk
        # even if the kernel is killed mid-run.
        mf.flush()

        total += 1
        good += 1 if md_stats["chars"] >= MIN_GOOD_CHARS else 0

        flag = []
        if md_stats["has_table"]: flag.append("table")
        if md_stats["has_inline_math"] or md_stats["has_display_math"]: flag.append("math")
        if md_stats["has_notation"]: flag.append("notation")
        flag_str = ", ".join(flag) if flag else "-"
        print(f"[OK] p{pno}: {md_stats['chars']} chars | {flag_str} → {out_md}")

print(f"Done. Wrote {total} page MDs ({good} with >= {MIN_GOOD_CHARS} chars) → {out_dir}")
if failed_pages:
    print(f"Failed pages ({len(failed_pages)}): {failed_pages}")


[OK] p1: 160 chars | table, math, notation → data\artifacts\NFS_2019\vlm_md\page_1_ollama.md
[OK] p2: 1224 chars | notation → data\artifacts\NFS_2019\vlm_md\page_2_ollama.md
[OK] p3: 814 chars | notation → data\artifacts\NFS_2019\vlm_md\page_3_ollama.md
[OK] p4: 911 chars | - → data\artifacts\NFS_2019\vlm_md\page_4_ollama.md
[OK] p5: 3624 chars | notation → data\artifacts\NFS_2019\vlm_md\page_5_ollama.md
[OK] p6: 1507 chars | - → data\artifacts\NFS_2019\vlm_md\page_6_ollama.md
[OK] p7: 120 chars | notation → data\artifacts\NFS_2019\vlm_md\page_7_ollama.md
[OK] p8: 119 chars | - → data\artifacts\NFS_2019\vlm_md\page_8_ollama.md
[OK] p9: 1376 chars | notation → data\artifacts\NFS_2019\vlm_md\page_9_ollama.md
[OK] p10: 146 chars | - → data\artifacts\NFS_2019\vlm_md\page_10_ollama.md
[Ollama] page 11 failed: Ollama call failed after 3 tries: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=600)


In [None]:
# ================================
# Cell 5 — Stitch & Summary
# ================================
import json

# Concatenate the per-page Markdown files in page order, writing a horizontal
# rule after each page. Pages without a file (failed, or past the end of the
# PDF) are skipped by the exists() check, so this cell only needs the config
# values and can run without the open PDF handle from the processing cell.
stitched = out_dir / f"{doc_id}.vlm.ollama.md"
with stitched.open("w", encoding="utf-8") as fout:
    for pno in range(FIRST_PAGE, LAST_PAGE + 1):
        fp = out_dir / f"page_{pno}_ollama.md"
        if fp.exists():
            fout.write(fp.read_text(encoding="utf-8").rstrip())
            fout.write("\n\n---\n\n")

print("Stitched file:", stitched)

# Quick summary from the manifest: one JSON record per line, one line per page.
manifest_fp = out_dir / "page_manifest.jsonl"
stats = {"pages": 0, "tables": 0, "notations": 0, "math_inline": 0, "math_display": 0}
if manifest_fp.exists():
    with manifest_fp.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                # Skip a truncated/corrupt line (e.g. from an interrupted run)
                # without hiding unrelated errors.
                continue
            stats["pages"] += 1
            stats["tables"] += 1 if rec.get("has_table") else 0
            stats["notations"] += 1 if rec.get("has_notation") else 0
            stats["math_inline"] += 1 if rec.get("has_inline_math") else 0
            stats["math_display"] += 1 if rec.get("has_display_math") else 0

print("Summary:", stats)
