In [3]:
# ==== COLAB CLEAN + LOCKED SETUP (run FIRST) ====
import pathlib, textwrap, pkgutil, os

# 0) Remove conflicting versions if they slipped in
!pip uninstall -y pillow pandas >/dev/null 2>&1

# 1) Create a constraints file with the versions that work with Docling + Colab
CONSTRAINTS = "/content/constraints.txt"
pathlib.Path(CONSTRAINTS).write_text(textwrap.dedent("""
pillow==11.3.0
pandas==2.2.2
jedi>=0.18
""").strip()+"\n")

# 2) Make pip ALWAYS use these pins for the rest of the session
%env PIP_CONSTRAINT=/content/constraints.txt

# 3) Install the pins first
!pip install -qU pillow==11.3.0 pandas==2.2.2 "jedi>=0.18"

# 4) Install your stack under the constraints (so nothing upgrades Pillow/pandas)
!pip install -qU --no-warn-conflicts \
  -c /content/constraints.txt \
  docling==2.57.0 \
  "docling-core[chunking]" \
  transformers \
  llama-index-core \
  llama-index-readers-docling \
  llama-index-node-parser-docling \
  llama-index-embeddings-huggingface \
  llama-index-vector-stores-milvus \
  sentence-transformers \
  tabulate \
  pylatexenc latex2mathml  # NEW (LaTeX): light utils for TeX cleaning/optional MathML

# 5) Sanity check
import PIL, pandas, pkgutil
print("Pillow:", PIL.__version__)
print("pandas:", pandas.__version__)
print("jedi present:", "jedi" in {m.name for m in pkgutil.iter_modules()})


env: PIP_CONSTRAINT=/content/constraints.txt
Pillow: 11.3.0
pandas: 2.2.2
jedi present: True


In [4]:
# Colab cell: installs
%pip install -qU pip

# Core parsing + chunking
%pip install -qU docling docling-core transformers

# RAG stack
%pip install -qU llama-index-core \
                 llama-index-readers-docling \
                 llama-index-node-parser-docling \
                 llama-index-embeddings-huggingface \
                 llama-index-vector-stores-milvus

# Utils
%pip install -qU pillow pandas tabulate sentence-transformers pylatexenc latex2mathml  # NEW (LaTeX)
# (Optional) Other vector stores:
# %pip install -qU qdrant-client chromadb weaviate-client


In [5]:
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

input_path = Path("/IVMSP-08-ImgMatch-Notes-2.pdf")  # <- your file

pdf_opts = PdfPipelineOptions()
pdf_opts.images_scale = 2.0
pdf_opts.generate_page_images = True
pdf_opts.generate_picture_images = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)
})

conv_res = converter.convert(input_path)
dl_doc = conv_res.document
doc_stem = conv_res.input.file.stem

print("Parsed pages:", len(dl_doc.pages))
print("Tables:", len(dl_doc.tables), "| Pictures:", len(dl_doc.pictures))


[32m[INFO] 2025-10-20 10:37:26,308 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-20 10:37:26,318 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-20 10:37:27,795 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2025-10-20 10:37:28,017 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-20 10:37:28,019 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-20 10:37:28,321 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-20 10:37:28,322 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls

Parsed pages: 5
Tables: 1 | Pictures: 12


In [6]:
import os, json
from PIL import Image
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
import pandas as pd

# Create asset folders
assets_base = Path("/content/repo/assets")
pages_dir    = assets_base / "pages"
figs_dir     = assets_base / "figures"
tables_dir   = assets_base / "tables"
formulas_dir = assets_base / "formulas"
text_dir     = Path("/content/repo/text/markdown")
json_dir     = Path("/content/repo/text/json")
latex_dir    = Path("/content/repo/text/latex")  # NEW (LaTeX)

for d in [pages_dir, figs_dir, tables_dir, formulas_dir, text_dir, json_dir, latex_dir]:
    d.mkdir(parents=True, exist_ok=True)

# 2A) Save page images
for page_no, page in dl_doc.pages.items():
    out = pages_dir / f"{doc_stem}_page_{page_no}.png"
    page.image.pil_image.save(out, "PNG")

# 2B) Export figures and table preview images
fig_ix, tbl_ix = 0, 0
for element, _lvl in dl_doc.iterate_items():
    if isinstance(element, PictureItem):
        fig_ix += 1
        out = figs_dir / f"{doc_stem}_fig_{fig_ix}.png"
        element.get_image(dl_doc).save(out, "PNG")
    if isinstance(element, TableItem):
        tbl_ix += 1
        out = tables_dir / f"{doc_stem}_table_{tbl_ix}.png"
        element.get_image(dl_doc).save(out, "PNG")

# 2C) Export tables as CSV + HTML
for k, t in enumerate(dl_doc.tables, start=1):
    df = t.export_to_dataframe()
    df.to_csv(tables_dir / f"{doc_stem}_table_{k}.csv", index=False)
    (tables_dir / f"{doc_stem}_table_{k}.html").write_text(t.export_to_html(doc=dl_doc))

# 2D) Human-friendly exports with referenced images
md_ref = text_dir / f"{doc_stem}_with_image_refs.md"
html_ref = text_dir / f"{doc_stem}_with_image_refs.html"
dl_doc.save_as_markdown(md_ref, image_mode=ImageRefMode.REFERENCED)
dl_doc.save_as_html(html_ref, image_mode=ImageRefMode.REFERENCED)

# 2E) Save native Docling JSON
(json_dir / f"{doc_stem}.json").write_text(dl_doc.model_dump_json())

# 2F) Prepare LaTeX outputs (files created in Cell 5)
latex_jsonl_path = latex_dir / f"{doc_stem}_formulas.jsonl"   # NEW (LaTeX)
latex_md_path    = latex_dir / f"{doc_stem}_formulas.md"      # NEW (LaTeX)

print("Assets exported under:", assets_base)
print("LaTeX paths prepared:", latex_jsonl_path, latex_md_path)




Assets exported under: /content/repo/assets
LaTeX paths prepared: /content/repo/text/latex/IVMSP-08-ImgMatch-Notes-2_formulas.jsonl /content/repo/text/latex/IVMSP-08-ImgMatch-Notes-2_formulas.md


In [7]:
# ===== Cell 5 (Hybrid Math Detector): true formulas + mathy text + picture fallback =====
from typing import Iterable, Tuple, Optional, List, Dict
from PIL import Image
from pathlib import Path
from docling_core.types.doc import DocItemLabel, TextItem, PictureItem
try:
    from docling_core.types.doc import FormulaItem
except Exception:
    FormulaItem = tuple()  # sentinel if class not available

import io, hashlib, json, re

# ---- tuning knobs ----
STRICT_SKIP_NO_BBOX   = True     # don't crop full pages (prevents slide duplicates)
FALLBACK_LAST_PAGES   = 6        # slides often put formulas at the end
FALLBACK_MIN_W, FALLBACK_MIN_H = 80, 40  # ignore tiny logos/icons
TEXT_MATH_MIN_LEN     = 3        # don't flag ultra-short tokens as formula
# Recognize TeX-ish and math-y content in text lines
MATHY_REGEX = re.compile(
    r"(\$.*?\$|\\\[.*?\\\]|\\\(.+?\\\)|"        # TeX delimiters
    r"\\frac|\\sum|\\int|\\lim|\\log|\\sin|\\cos|\\tan|\\alpha|\\beta|\\gamma|\\infty|\\cdot|\\times|\\sqrt|"
    r"[∑∏∫∞≈≃≅≡≤≥≪≫±×·÷∂∇√⊂⊆⊃⊇∈∉∩∪∧∨¬⇒⇔→←↔≈≃≅≡≤≥⊥∥]"
    r"|[=^_]{1,}.*[0-9A-Za-z])"
)

def _bbox_box(item) -> Optional[tuple]:
    bbox = getattr(item, "bbox", None)
    if bbox is None:
        return None
    try:
        return tuple(bbox.to_box())
    except Exception:
        try:
            return tuple(bbox)
        except Exception:
            return None

def _page_no_of_item(doc, item) -> Optional[int]:
    for attr in ("page_no", "page_number", "page_idx", "page_index"):
        v = getattr(item, attr, None)
        if v is not None and v in doc.pages:
            return v
    page_obj = getattr(item, "page", None)
    if page_obj is not None:
        v = getattr(page_obj, "page_no", None)
        if v is not None and v in doc.pages:
            return v
    # bbox containment fallback
    box = _bbox_box(item)
    if box is not None:
        x0, y0, x1, y1 = box
        for pn, page in doc.pages.items():
            W, H = page.image.pil_image.size
            if 0 <= x0 < x1 <= W and 0 <= y0 < y1 <= H:
                return pn
    return None

def _png_md5(img: Image.Image) -> str:
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return hashlib.md5(buf.getvalue()).hexdigest()

def _wrap_tex(tex: str) -> Optional[str]:
    tex = (tex or "").strip()
    if not tex:
        return None
    if tex.startswith("$$") or tex.startswith("\\[") or (tex.startswith("$") and tex.endswith("$")):
        return tex
    return f"$$\n{tex}\n$$"

def _iter_true_formula_items(doc):
    for item, _lvl in doc.iterate_items():
        if (FormulaItem and isinstance(item, FormulaItem)) or \
           (isinstance(item, TextItem) and item.label == DocItemLabel.FORMULA):
            yield item

def _iter_mathy_text_items(doc):
    """Pick up equations typeset as text (common in some PDFs/slide exports)."""
    for item, _lvl in doc.iterate_items():
        if not isinstance(item, TextItem):
            continue
        s = (getattr(item, "text", None) or "").strip()
        if len(s) < TEXT_MATH_MIN_LEN:
            continue
        if MATHY_REGEX.search(s):
            yield item

def _iter_picture_fallback(doc, tail_pages):
    for item, _lvl in doc.iterate_items():
        if not isinstance(item, PictureItem):
            continue
        pn = _page_no_of_item(doc, item)
        if pn not in tail_pages:
            continue
        box = _bbox_box(item)
        if pn is None or box is None:
            continue
        x0, y0, x1, y1 = box
        if (x1 - x0) < FALLBACK_MIN_W or (y1 - y0) < FALLBACK_MIN_H:
            continue
        yield item

def _crop_by_item(doc, item) -> Optional[Tuple[Image.Image, int, Optional[tuple]]]:
    pn = _page_no_of_item(doc, item)
    box = _bbox_box(item)
    if STRICT_SKIP_NO_BBOX and (pn is None or box is None):
        return None
    page_img = doc.pages[pn].image.pil_image if pn is not None else next(iter(doc.pages.values())).image.pil_image
    crop = page_img if box is None else page_img.crop(box)
    return crop, pn, box

# Accumulators
records: List[Dict] = []
seen_md5 = set()

# 1) True formula items
for item in _iter_true_formula_items(dl_doc):
    out = _crop_by_item(dl_doc, item)
    if out is None:
        continue
    crop, pn, box = out
    md5 = _png_md5(crop)
    if md5 in seen_md5:
        continue
    seen_md5.add(md5)
    idx = len(seen_md5)

    # Try to read any latex-ish field provided by docling
    raw = None
    for cand in ("latex", "tex", "text", "content"):
        raw = getattr(item, cand, None)
        if raw:
            break
    tex = _wrap_tex(raw)

    path = formulas_dir / f"{doc_stem}_formula_{idx}.png"
    crop.save(path, "PNG")
    records.append({
        "id": f"{doc_stem}_f{idx}",
        "page": pn,
        "bbox": list(box) if box else None,
        "tex": tex,
        "mathml": None,   # (optional OCR step can fill later)
        "image_path": str(path),
        "context_prev": getattr(getattr(item, "prev", None), "text", None),
        "context_next": getattr(getattr(item, "next", None), "text", None),
        "source": "formula_item",
    })

# 2) Mathy TextItems (heuristic)
for item in _iter_mathy_text_items(dl_doc):
    out = _crop_by_item(dl_doc, item)
    if out is None:
        continue
    crop, pn, box = out
    md5 = _png_md5(crop)
    if md5 in seen_md5:
        continue
    seen_md5.add(md5)
    idx = len(seen_md5)

    s = (getattr(item, "text", None) or "").strip()
    tex = _wrap_tex(s)

    path = formulas_dir / f"{doc_stem}_formula_{idx}.png"
    crop.save(path, "PNG")
    records.append({
        "id": f"{doc_stem}_f{idx}",
        "page": pn,
        "bbox": list(box) if box else None,
        "tex": tex,       # comes from text; good enough for retrieval/rendering
        "mathml": None,
        "image_path": str(path),
        "context_prev": getattr(getattr(item, "prev", None), "text", None),
        "context_next": getattr(getattr(item, "next", None), "text", None),
        "source": "text_heuristic",
    })

# 3) Picture fallback on tail pages (for rasterized equations at end of slides)
if not records:
    pages_sorted = sorted(dl_doc.pages.keys())
    tail_pages = pages_sorted[-FALLBACK_LAST_PAGES:] if len(pages_sorted) >= FALLBACK_LAST_PAGES else pages_sorted
    for item in _iter_picture_fallback(dl_doc, tail_pages):
        out = _crop_by_item(dl_doc, item)
        if out is None:
            continue
        crop, pn, box = out
        md5 = _png_md5(crop)
        if md5 in seen_md5:
            continue
        seen_md5.add(md5)
        idx = len(seen_md5)
        path = formulas_dir / f"{doc_stem}_formula_{idx}.png"
        crop.save(path, "PNG")
        records.append({
            "id": f"{doc_stem}_f{idx}",
            "page": pn,
            "bbox": list(box) if box else None,
            "tex": None,     # no TeX; can be filled by OCR in Cell 5.1
            "mathml": None,
            "image_path": str(path),
            "context_prev": None,
            "context_next": None,
            "source": "picture_fallback",
        })

# Write JSONL + friendly MD index
latex_jsonl = latex_dir / f"{doc_stem}_formulas.jsonl"
latex_md    = latex_dir / f"{doc_stem}_formulas.md"
with open(latex_jsonl, "w", encoding="utf-8") as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

md_lines = [f"# Formulas / candidates from `{doc_stem}`\n"]
for j, rec in enumerate(records, start=1):
    md_lines.append(f"## {j}. Page {rec['page']} — {rec['source']}")
    md_lines.append(rec["tex"] if rec["tex"] else "_(no TeX — picture)_")
    md_lines.append(f"\n![crop]({rec['image_path']})\n")
Path(latex_md).write_text("\n".join(md_lines), encoding="utf-8")

print(f"Saved {len(records)} unique formula/candidate crops.")
print("LaTeX JSONL:", latex_jsonl)
print("LaTeX MD:", latex_md)


Saved 0 unique formula/candidate crops.
LaTeX JSONL: /content/repo/text/latex/IVMSP-08-ImgMatch-Notes-2_formulas.jsonl
LaTeX MD: /content/repo/text/latex/IVMSP-08-ImgMatch-Notes-2_formulas.md


In [None]:
# ===== Cell 5.1 (Optional): Try OCR (im2LaTeX) on picture_fallback crops to fill 'tex' =====
import json, sys, traceback
from pathlib import Path

latex_jsonl = latex_dir / f"{doc_stem}_formulas.jsonl"
tmp_jsonl   = latex_dir / f"{doc_stem}_formulas_tmp.jsonl"

def _try_import_im2latex():
    try:
        # Two popular packages; try either
        import pix2tex  # noqa
        return "pix2tex"
    except Exception:
        pass
    try:
        import latex_ocr  # noqa
        return "latex_ocr"
    except Exception:
        pass
    return None

backend = _try_import_im2latex()
if backend is None:
    # Try to install a backend quickly
    try:
        # pix2tex is common; if install fails, we give up gracefully
        %pip install -qU pix2tex
        import pix2tex  # noqa
        backend = "pix2tex"
    except Exception:
        print("⚠️ Could not import/install an im2LaTeX backend. Skipping OCR.")
        backend = None

def _ocr_pix2tex(img_path: str) -> Optional[str]:
    try:
        from pix2tex.cli import LatexOCR
        from PIL import Image
        model = LatexOCR()
        pred = model(Image.open(img_path))
        tex = str(pred).strip()
        return tex if tex else None
    except Exception:
        traceback.print_exc(file=sys.stdout)
        return None

def _ocr_one(image_path: str) -> Optional[str]:
    if backend == "pix2tex":
        return _ocr_pix2tex(image_path)
    # Add other backends here if you prefer
    return None

updated = 0
with open(latex_jsonl, "r", encoding="utf-8") as fin, open(tmp_jsonl, "w", encoding="utf-8") as fout:
    for line in fin:
        rec = json.loads(line)
        if rec.get("tex") or rec.get("source") != "picture_fallback" or backend is None:
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            continue
        guess = _ocr_one(rec["image_path"])
        if guess:
            rec["tex"] = f"$$\n{guess}\n$$"
            updated += 1
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

Path(latex_jsonl).unlink()
Path(tmp_jsonl).rename(latex_jsonl)
print(f"OCR filled TeX for {updated} picture crops.")


OCR filled TeX for 0 picture crops.


In [None]:
from docling.chunking import HybridChunker
from transformers import AutoTokenizer

EMB_MODEL_NAME = "BAAI/bge-m3"  # strong, multilingual text retrieval
tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL_NAME)

chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=512,
    merge_peers=True
)

chunks = list(chunker.chunk(dl_doc))
print("Chunk count:", len(chunks))
print(chunks[0].text[:300])


Chunk count: 2
- Randomization adds statistical independence and decorrelates trees:
- Averaging the votes of N statistically independent and identically distributed (i.i.d.) learners with accuracy > 50 % arbitrarily reduces the variance:
<!-- formula-not-decoded -->
- Note, however, that the i.i.d. assumption is 


In [None]:
# Install Milvus Lite extra for local DB URIs
!pip install -qU -c /content/constraints.txt "pymilvus[milvus_lite]>=2.4.7"

from pymilvus import connections
connections.connect(alias="default", uri="/content/milvus_docling.db")
print("Milvus Lite local DB: OK")


  from pkg_resources import DistributionNotFound, get_distribution


Milvus Lite local DB: OK


In [None]:
# ===== Fixed Cell 9: build text index, then insert formula docs (no DoclingNodeParser) =====
from llama_index.core import VectorStoreIndex, StorageContext, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.readers.docling import DoclingReader
from llama_index.node_parser.docling import DoclingNodeParser
import json

json_path = (json_dir / f"{doc_stem}.json")

# 5A) Text embedding model
text_embed = HuggingFaceEmbedding(model_name=EMB_MODEL_NAME)
dim = len(text_embed.get_text_embedding("hello"))

# 5B) Milvus Lite (local file -> Lite)
milvus_uri = "/content/milvus_docling.db"
text_store = MilvusVectorStore(uri=milvus_uri, dim=dim, collection_name="text", overwrite=True)
storage_ctx = StorageContext.from_defaults(vector_store=text_store)

# 5C) Docling-aware parsing → Nodes with page/bbox metadata (apply ONLY to Docling docs)
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

docling_documents = reader.load_data(json_path)  # LlamaIndex Documents from Docling JSON

# Build index from Docling docs WITH the DoclingNodeParser
text_index = VectorStoreIndex.from_documents(
    documents=docling_documents,
    transformations=[node_parser],
    storage_context=storage_ctx,
    embed_model=text_embed,
)

# 5D) Load TeX formula docs produced in Cell 5 and insert them WITHOUT DoclingNodeParser
formula_docs = []
with open(latex_jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        tex = (rec.get("tex") or "").strip()
        ctx_prev = (rec.get("context_prev") or "").strip()
        ctx_next = (rec.get("context_next") or "").strip()
        text_blob = "\n".join([s for s in [ctx_prev, tex, ctx_next] if s])

        formula_docs.append(
            Document(
                text=text_blob if text_blob else tex,
                metadata={
                    "type": "formula",
                    "is_formula": True,
                    "page_number": rec.get("page"),
                    "bbox": rec.get("bbox"),
                    "image_path": rec.get("image_path"),
                    "id": rec.get("id"),
                    "file_name": f"{doc_stem}.pdf",
                },
            )
        )
# Insert plain formula docs directly (no transformations)
if formula_docs:
    # Reuse the SAME storage_ctx (same Milvus collection) and same embed model.
    # No DoclingNodeParser here.
    _ = VectorStoreIndex.from_documents(
        documents=formula_docs,
        storage_context=storage_ctx,
        embed_model=text_embed,
        # no transformations -> they're plain text docs
    )

text_retriever = text_index.as_retriever(similarity_top_k=5)
print(f"Indexed {len(docling_documents)} Docling docs and inserted {len(formula_docs)} TeX formula docs.")

# Load per-path metadata for captions
meta_by_path = {}
with open(latex_jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        meta_by_path[rec["image_path"]] = {
            "tex": (rec.get("tex") or "").strip().replace("\n"," "),
            "source": rec.get("source",""),
        }

image_docs = []
for p in sorted((assets_base / "figures").glob("*.png")):
    image_docs.append(image_doc(p, f"Figure from {doc_stem}", {"type": "figure"}))
for p in sorted((assets_base / "formulas").glob("*.png")):
    meta = meta_by_path.get(str(p), {"tex":"", "source":""})
    tex_snip = meta["tex"][:256]
    src = meta["source"]
    cap = f"Formula crop ({src}) from {doc_stem}" + (f". TeX: {tex_snip}" if tex_snip else "")
    image_docs.append(image_doc(p, cap, {"type": "formula", "is_formula": True, "source": src}))


Indexed 1 Docling docs and inserted 0 TeX formula docs.


NameError: name 'image_doc' is not defined

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

IMG_EMB_MODEL = "clip-ViT-B-32"  # sentence-transformers alias
img_embed = HuggingFaceEmbedding(model_name=IMG_EMB_MODEL)
img_dim = len(img_embed.get_text_embedding("x"))  # CLIP uses same dim for text encoder

image_store = MilvusVectorStore(uri=milvus_uri, dim=img_dim, collection_name="images", overwrite=True)
image_storage = StorageContext.from_defaults(vector_store=image_store)

def image_doc(path: Path, caption: str, meta: dict):
    return Document(text=caption, metadata={"image_path": str(path), **meta})

# Load TeX per-id for formula captions
import json
tex_by_path = {}
with open(latex_jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        p = rec.get("image_path")
        t = (rec.get("tex") or "").strip().replace("\n", " ")
        tex_by_path[p] = t

image_docs = []
for p in sorted((assets_base / "figures").glob("*.png")):
    image_docs.append(image_doc(p, f"Figure from {doc_stem}", {"type": "figure"}))
for p in sorted((assets_base / "formulas").glob("*.png")):
    tex = tex_by_path.get(str(p), "")
    cap = f"Formula crop from {doc_stem}. TeX: {tex[:256]}" if tex else f"Formula crop from {doc_stem}"
    image_docs.append(image_doc(p, cap, {"type": "formula", "is_formula": True}))

image_index = VectorStoreIndex.from_documents(image_docs, storage_context=image_storage, embed_model=img_embed)
image_retriever = image_index.as_retriever(similarity_top_k=5)
print(f"Image docs -> figures: {len([d for d in image_docs if d.metadata.get('type')=='figure'])}, formulas: {len([d for d in image_docs if d.metadata.get('type')=='formula'])}")


Image docs -> figures: 4, formulas: 0


In [None]:
import re

def _is_mathy(q: str) -> bool:
    # Simple heuristic: LaTeX delimiters or common commands
    return bool(re.search(r"(\$|\\frac|\\sum|\\int|\\alpha|\\beta|\\gamma|\\infty|\\begin\{equation\})", q))

def fused_retrieve(query: str, k: int = 8):
    text_hits  = text_retriever.retrieve(query)
    image_hits = image_retriever.retrieve(query)

    # naive interleave
    fused = []
    for a, b in zip(text_hits, image_hits):
        fused.extend([a, b])

    fused = fused[:k]

    # If mathy, bubble up formula items
    if _is_mathy(query):
        def score_key(h):
            meta = getattr(h, "metadata", {}) or {}
            return (1 if meta.get("is_formula") else 0, )  # formulas first
        fused = sorted(fused, key=score_key, reverse=True)

    return fused, text_hits, image_hits

query = r"Where is the main theorem defined, and what does Figure 2 illustrate? Show me the formula for $\alpha=\frac{x}{y}$."
fused, text_hits, image_hits = fused_retrieve(query)

print("Top text nodes:")
for h in text_hits[:3]:
    meta = getattr(h, "metadata", {}) or {}
    print("-", h.get_content()[:140].replace("\n"," "),
          "| meta:", {k:meta.get(k) for k in ["page_number","bbox","type","file_name","is_formula"]})

print("\nTop images:")
for h in image_hits[:3]:
    meta = getattr(h, "metadata", {}) or {}
    print("-", meta.get("type"), meta.get("image_path"), "| is_formula:", meta.get("is_formula"))


Top text nodes:
- <!-- formula-not-decoded --> | meta: {'page_number': None, 'bbox': None, 'type': None, 'file_name': None, 'is_formula': None}
- <!-- formula-not-decoded --> | meta: {'page_number': None, 'bbox': None, 'type': None, 'file_name': None, 'is_formula': None}
- <!-- formula-not-decoded --> | meta: {'page_number': None, 'bbox': None, 'type': None, 'file_name': None, 'is_formula': None}

Top images:
- figure /content/repo/assets/figures/QoSiC_4_AnalyticalModeling_fig_3.png | is_formula: None
- figure /content/repo/assets/figures/QoSiC_4_AnalyticalModeling_fig_4.png | is_formula: None
- figure /content/repo/assets/figures/QoSiC_4_AnalyticalModeling_fig_2.png | is_formula: None
