In [1]:
"""draw_text_bboxes
====================
Utilities for visually inspecting PDFs and extracting text with **layout
awareness** — even when pages mix single‑column, multi‑column, and wide tables.

Functions
---------
* **`draw_text_bboxes_png`** – word rectangles (embedded text) → PNG.
* **`draw_text_bboxes_ocr_png`** – word rectangles (OCR) → PNG.
* **`extract_text_layout`** – column‑aware plain‑text reconstruction.
* **`visualize_text_order_png`** – **NEW**: draws a rectangle around **each
  logical line** in the reading order found by `extract_text_layout`, so you
  can *see* the sequence the algorithm will read.

Quick example
~~~~~~~~~~~~~
```python
from draw_text_bboxes import (
    extract_text_layout,
    visualize_text_order_png,
)

# Try a page with mixed layouts
layout_text = extract_text_layout("report.pdf", col_gap_pts=140.0)[0]
print(layout_text)

# Visual debugging – green rectangles around consecutive lines
visualize_text_order_png(
    "report.pdf",
    pages=[0],
    col_gap_pts=140.0,
    annotate=True,    # small line numbers at each rectangle's top-left corner
)
```

Installation
~~~~~~~~~~~~
```bash
pip install pymupdf pillow pytesseract packaging
# plus Tesseract‑OCR ≥ 4.0 if you need the OCR helper
```
"""
from __future__ import annotations

from pathlib import Path
from typing import List, Tuple, Optional, Dict

import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import pytesseract
from pytesseract import Output
from packaging.version import parse as _v
from statistics import median

# Public API: the names exported by a star-import of this module.
# Helpers prefixed with an underscore are intentionally left out.
__all__ = [
    "draw_text_bboxes_png",
    "draw_text_bboxes_ocr_png",
    "extract_text_layout",
    "visualize_text_order_png",
]

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _is_visible(text: str) -> bool:
    return any(not ch.isspace() for ch in text)


def _safe_conf(val) -> float:
    try:
        return float(val)
    except (ValueError, TypeError):
        return -1.0

# ---------------------------------------------------------------------------
# 1. Embedded‑text visualiser
# ---------------------------------------------------------------------------

def draw_text_bboxes_png(
    pdf_path: str | Path,
    output_dir: str | Path | None = None,
    pages: Optional[List[int]] = None,
    rect_color: Tuple[float, float, float] = (1.0, 0.0, 0.0),
    rect_width: float = 0.5,
    zoom: float = 2.0,
    ignore_whitespace: bool = True,
    min_rect_size: float = 0.0,
) -> List[Tuple[int, List[fitz.Rect], Path]]:
    """Outline every embedded-text *word* found by PyMuPDF and render pages to PNG.

    Parameters
    ----------
    pdf_path
        PDF file to open.
    output_dir
        Directory for the PNG files (created if missing).  When falsy, the
        default is ``pdf_path.with_stem(<stem> + "_bbox_images")``; note that
        ``with_stem`` keeps the original suffix, so the directory name ends
        in the PDF's extension.
    pages
        Page indices to process; ``None`` means every page of the document.
    rect_color, rect_width
        Stroke colour and width passed to ``page.draw_rect``.
    zoom
        Scale factor used for both axes of the render matrix.
    ignore_whitespace
        Skip words whose text has no visible (non-whitespace) character.
    min_rect_size
        Skip rectangles whose smaller side is below this value.

    Returns
    -------
    list of ``(page_index, word_rects, png_path)`` tuples, one per page.

    The rectangles are drawn on the in-memory pages only; this function never
    saves the PDF itself.
    """
    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when a page
    # index is invalid or rendering/saving fails part-way through.
    try:
        output_dir = Path(output_dir or pdf_path.with_stem(pdf_path.stem + "_bbox_images"))
        output_dir.mkdir(parents=True, exist_ok=True)

        pages = list(range(len(doc))) if pages is None else pages
        matrix = fitz.Matrix(zoom, zoom)
        ret: List[Tuple[int, List[fitz.Rect], Path]] = []

        for p in pages:
            page = doc[p]
            rects: List[fitz.Rect] = []
            for x0, y0, x1, y1, txt, *_ in page.get_text("words"):
                if ignore_whitespace and not _is_visible(txt):
                    continue
                r = fitz.Rect(x0, y0, x1, y1)
                if min(r.width, r.height) < min_rect_size:
                    continue
                page.draw_rect(r, color=rect_color, width=rect_width)
                rects.append(r)
            png = output_dir / f"{pdf_path.stem}_page_{p}.png"
            page.get_pixmap(matrix=matrix, alpha=False).save(png)
            ret.append((p, rects, png))
    finally:
        doc.close()
    return ret

# ---------------------------------------------------------------------------
# 2. OCR visualiser
# ---------------------------------------------------------------------------

def _check_tess(min_v: str = "3.05") -> None:
    """Raise ``RuntimeError`` unless a Tesseract binary of at least *min_v* is usable.

    Parameters
    ----------
    min_v
        Minimum acceptable Tesseract version, as a version string.

    Raises
    ------
    RuntimeError
        If pytesseract cannot find or identify the Tesseract binary, or if
        the reported version is lower than *min_v*.
    """
    try:
        ver = pytesseract.get_tesseract_version()
    except (SystemExit, pytesseract.TesseractNotFoundError) as e:
        # pytesseract reports a missing binary with TesseractNotFoundError and
        # an unparsable/unsupported version with SystemExit; surface both as
        # the RuntimeError this helper promises.
        raise RuntimeError(f"Tesseract missing or too old — {e}") from None
    # Re-parse through str() so the comparison works whichever version class
    # the installed pytesseract returns.
    if _v(str(ver)) < _v(min_v):
        raise RuntimeError(f"Need Tesseract ≥ {min_v}, got {ver}")


def draw_text_bboxes_ocr_png(
    pdf_path: str | Path,
    output_dir: str | Path | None = None,
    pages: Optional[List[int]] = None,
    dpi: int = 300,
    rect_color: Tuple[int, int, int] = (255, 0, 0),
    rect_width: int = 2,
    min_conf: int = 50,
    min_rect_size: int = 0,
    tesseract_lang: str = "eng",
    tess_config: str | None = None,
) -> List[Tuple[int, List[fitz.Rect], Path]]:
    """Outline OCR-detected words (for scanned PDFs) and save each page as PNG.

    Each page is rasterised at *dpi*, passed to Tesseract through
    ``pytesseract.image_to_data``, and the accepted boxes are drawn on the
    raster image before it is written to disk.

    Parameters
    ----------
    pdf_path
        PDF file to open.
    output_dir
        Directory for the PNG files (created if missing).  When falsy, the
        default is ``pdf_path.with_stem(<stem> + "_ocr_bbox_images")``; note
        that ``with_stem`` keeps the original suffix.
    pages
        Page indices to process; ``None`` means every page of the document.
    dpi
        Render resolution; the page is scaled by ``dpi / 72``.
    rect_color, rect_width
        Outline colour and width for ``ImageDraw.rectangle``.
    min_conf
        Entries whose confidence is below this value are skipped; values that
        cannot be parsed count as -1.0.
    min_rect_size
        Skip boxes whose smaller side (in pixels) is below this value.
    tesseract_lang, tess_config
        Forwarded to ``pytesseract.image_to_data`` as ``lang`` and ``config``.

    Returns
    -------
    list of ``(page_index, rects, png_path)`` tuples.  The rectangles are
    divided by the render scale, i.e. converted back from pixels.

    Raises
    ------
    RuntimeError
        Propagated from ``_check_tess`` when Tesseract is unusable.
    """
    _check_tess()
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir or pdf_path.with_stem(pdf_path.stem + "_ocr_bbox_images"))
    output_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when a page
    # index is invalid or OCR/saving fails part-way through.
    try:
        pages = list(range(len(doc))) if pages is None else pages
        scale = dpi / 72
        ret: List[Tuple[int, List[fitz.Rect], Path]] = []

        for p in pages:
            page = doc[p]
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            data = pytesseract.image_to_data(
                img,
                lang=tesseract_lang,
                config=tess_config or "",
                output_type=Output.DICT,
            )
            drw = ImageDraw.Draw(img)
            rects: List[fitz.Rect] = []
            for i in range(len(data["level"])):
                if _safe_conf(data["conf"][i]) < min_conf or not _is_visible(data["text"][i]):
                    continue
                x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
                if min(w, h) < min_rect_size:
                    continue
                drw.rectangle([x, y, x + w, y + h], outline=rect_color, width=rect_width)
                # Store the box in page units rather than raster pixels.
                rects.append(fitz.Rect(x / scale, y / scale, (x + w) / scale, (y + h) / scale))
            png = output_dir / f"{pdf_path.stem}_page_{p}.png"
            img.save(png)
            ret.append((p, rects, png))
    finally:
        doc.close()
    return ret

# ---------------------------------------------------------------------------
# 3. Column clustering & text extraction
# ---------------------------------------------------------------------------

def _cluster_columns(words: List[Tuple[float, float, float, float, str]], gap_pts: float) -> Dict[int, List[Tuple]]:
    if not words:
        return {}
    words_sorted = sorted(words, key=lambda w: 0.5 * (w[0] + w[2]))
    clusters: Dict[int, List[Tuple]] = {0: [words_sorted[0]]}
    centers = [0.5 * (words_sorted[0][0] + words_sorted[0][2])]
    for w in words_sorted[1:]:
        x_mid = 0.5 * (w[0] + w[2])
        idx, mind = min(enumerate((abs(x_mid - c) for c in centers)), key=lambda t: t[1])
        if mind <= gap_pts:
            clusters[idx].append(w)
            centers[idx] = (centers[idx] * (len(clusters[idx]) - 1) + x_mid) / len(clusters[idx])
        else:
            clusters[len(clusters)] = [w]
            centers.append(x_mid)
    return clusters


def extract_text_layout(
    pdf_path: str | Path,
    pages: Optional[List[int]] = None,
    col_gap_pts: float = 72.0,
    y_tol: float = 2.0,
) -> List[str]:
    """Rebuild plain text page by page, reading column after column.

    For each page the visible words are clustered into columns with
    ``_cluster_columns``; columns are visited in order of the median
    x-midpoint of their words.  Inside a column, words are sorted by
    ``(y0, x0)`` and grouped into lines: a line is anchored on the ``y0`` of
    its first word, and following words join it while their ``y0`` stays
    within *y_tol* of that anchor.  Words of a line are ordered by ``x0`` and
    joined with single spaces; lines are joined with newlines.

    Parameters
    ----------
    pdf_path
        PDF file to open.
    pages
        Page indices to process; ``None`` means every page of the document.
    col_gap_pts
        Maximum distance between a word's x-midpoint and a column centre for
        the word to join that column.
    y_tol
        Vertical tolerance used when grouping words into a line.

    Returns
    -------
    One string per requested page, in the order given; a page without
    visible words yields ``""``.
    """
    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when a page
    # index is invalid or extraction fails part-way through.
    try:
        pages = list(range(len(doc))) if pages is None else pages
        result_pages: List[str] = []

        for p in pages:
            words = [w for w in doc[p].get_text("words") if _is_visible(w[4])]
            if not words:
                result_pages.append("")
                continue
            cols = _cluster_columns(words, gap_pts=col_gap_pts)
            col_order = sorted(cols.keys(), key=lambda i: median(0.5 * (w[0] + w[2]) for w in cols[i]))
            page_lines: List[str] = []
            for ci in col_order:
                col_words = sorted(cols[ci], key=lambda w: (w[1], w[0]))
                current_y = None  # y0 of the first word of the line being built
                current_line: List[Tuple[float, str]] = []
                lines: List[List[Tuple[float, str]]] = []
                for x0, y0, _x1, _y1, txt, *_ in col_words:
                    if current_y is None or abs(y0 - current_y) <= y_tol:
                        current_line.append((x0, txt))
                        if current_y is None:
                            current_y = y0
                    else:
                        # Vertical jump beyond the tolerance: close the line.
                        lines.append(current_line)
                        current_line = [(x0, txt)]
                        current_y = y0
                if current_line:
                    lines.append(current_line)
                for line in lines:
                    # Words were ordered by (y0, x0); re-order by x0 alone so
                    # small y jitter inside a line cannot scramble the text.
                    line.sort(key=lambda t: t[0])
                    page_lines.append(" ".join(word for _, word in line))
            result_pages.append("\n".join(page_lines))
    finally:
        doc.close()
    return result_pages

# ---------------------------------------------------------------------------
# 4. NEW visualiser of reading order lines
# ---------------------------------------------------------------------------

def visualize_text_order_png(
    pdf_path: str | Path,
    output_dir: str | Path | None = None,
    pages: Optional[List[int]] = None,
    col_gap_pts: float = 72.0,
    y_tol: float = 2.0,
    zoom: float = 2.0,
    line_color: Tuple[float, float, float] = (0.0, 0.8, 0.0),
    line_width: float = 0.5,
    annotate: bool = False,
    font_size: float = 6.0,
) -> List[Tuple[int, List[fitz.Rect], Path]]:
    """Draw a rectangle around **each consecutive line** in reading order.

    Useful to visually verify that `extract_text_layout` walks the page in the
    intended sequence: words are clustered into columns and grouped into
    lines with the same column/line rules, and every line's bounding box is
    drawn in that order.  Set `annotate=True` to write a running line number
    (starting at 1 on each page) at the top-left corner of each rectangle.

    Parameters
    ----------
    pdf_path
        PDF file to open.
    output_dir
        Directory for the PNG files (created if missing).  When falsy, the
        default is ``pdf_path.with_stem(<stem> + "_order_images")``; note
        that ``with_stem`` keeps the original suffix.
    pages
        Page indices to process; ``None`` means every page of the document.
    col_gap_pts, y_tol
        Column-clustering gap and line-grouping tolerance.
    zoom
        Scale factor used for both axes of the render matrix.
    line_color, line_width
        Stroke colour and width for the line rectangles.
    annotate, font_size
        Whether to write line numbers, and their font size.

    Returns
    -------
    list of ``(page_index, line_rects, png_path)`` tuples.  Pages without any
    visible word are skipped: no PNG is written and no tuple is returned for
    them.
    """
    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when a page
    # index is invalid or rendering/saving fails part-way through.
    try:
        output_dir = Path(output_dir or pdf_path.with_stem(pdf_path.stem + "_order_images"))
        output_dir.mkdir(parents=True, exist_ok=True)

        pages = list(range(len(doc))) if pages is None else pages
        matrix = fitz.Matrix(zoom, zoom)
        ret: List[Tuple[int, List[fitz.Rect], Path]] = []

        for p in pages:
            page = doc[p]
            words = [w for w in page.get_text("words") if _is_visible(w[4])]
            if not words:
                continue
            cols = _cluster_columns(words, gap_pts=col_gap_pts)
            col_order = sorted(cols.keys(), key=lambda i: median(0.5 * (w[0] + w[2]) for w in cols[i]))
            line_rects: List[fitz.Rect] = []
            for ci in col_order:
                col_words = sorted(cols[ci], key=lambda w: (w[1], w[0]))

                # Pass 1: group the column's word boxes into lines.  A line is
                # anchored on the y0 of its first word.
                line_groups: List[List[Tuple[float, float, float, float]]] = []
                current_boxes: List[Tuple[float, float, float, float]] = []
                curr_y = None
                for x0, y0, x1, y1, _txt, *_ in col_words:
                    if curr_y is None or abs(y0 - curr_y) <= y_tol:
                        current_boxes.append((x0, y0, x1, y1))
                        if curr_y is None:
                            curr_y = y0
                    else:
                        line_groups.append(current_boxes)
                        current_boxes = [(x0, y0, x1, y1)]
                        curr_y = y0
                if current_boxes:
                    line_groups.append(current_boxes)

                # Pass 2: draw each line's bounding box (and optional number)
                # in reading order.
                for boxes in line_groups:
                    x0_line = min(b[0] for b in boxes)
                    y0_line = min(b[1] for b in boxes)
                    x1_line = max(b[2] for b in boxes)
                    y1_line = max(b[3] for b in boxes)
                    r = fitz.Rect(x0_line, y0_line, x1_line, y1_line)
                    page.draw_rect(r, color=line_color, width=line_width)
                    if annotate:
                        # Line numbers are 1-based and continue across columns.
                        page.insert_text(
                            (x0_line, y0_line),
                            str(len(line_rects) + 1),
                            fontsize=font_size,
                            color=line_color,
                        )
                    line_rects.append(r)
            out_png = output_dir / f"{pdf_path.stem}_page_{p}.png"
            page.get_pixmap(matrix=matrix, alpha=False).save(out_png)
            ret.append((p, line_rects, out_png))
    finally:
        doc.close()
    return ret


In [2]:
# Outline the embedded-text words of a sample PDF; PNGs are written to the "test" directory.
# NOTE(review): absolute, machine-specific path — replace with a configurable location before sharing.
draw_text_bboxes_png(r"C:\Users\pmarq\Downloads\FSD_CH0047533549_SWC_CH_de.pdf", "test")

[(0,
  [Rect(38.266998291015625, 81.96099853515625, 142.70700073242188, 105.62100219726562),
   Rect(148.26699829101562, 81.96099853515625, 188.24700927734375, 105.62100219726562),
   Rect(193.8070068359375, 81.96099853515625, 238.24700927734375, 105.62100219726562),
   Rect(243.8070068359375, 81.96099853515625, 276.0469970703125, 105.62100219726562),
   Rect(38.266998291015625, 105.96099853515625, 63.827003479003906, 129.62100219726562),
   Rect(69.38700103759766, 105.96099853515625, 109.38700866699219, 129.62100219726562),
   Rect(38.266998291015625, 135.04200744628906, 69.93699645996094, 146.87200927734375),
   Rect(72.71699523925781, 135.04200744628906, 74.93699645996094, 146.87200927734375),
   Rect(77.71699523925781, 135.04200744628906, 142.14700317382812, 146.87200927734375),
   Rect(144.927001953125, 135.04200744628906, 147.14700317382812, 146.87200927734375),
   Rect(149.927001953125, 135.04200744628906, 173.81700134277344, 146.87200927734375),
   Rect(176.5970001220703, 135.0

In [4]:
# Same sample PDF through the OCR path; PNGs are written to the "test" directory.
# NOTE(review): this reuses the output directory and file names of the embedded-text
# cell, so its PNGs are overwritten. Absolute, machine-specific path — make configurable.
draw_text_bboxes_ocr_png(r"C:\Users\pmarq\Downloads\FSD_CH0047533549_SWC_CH_de.pdf", "test")

[(0,
  [Rect(418.08, 39.12, 559.92, 60.72),
   Rect(418.32, 73.92, 443.28, 79.44),
   Rect(445.67999999999995, 73.92, 451.67999999999995, 79.44),
   Rect(39.36, 85.91999999999999, 142.07999999999998, 100.8),
   Rect(149.04, 85.44, 187.67999999999998, 103.44),
   Rect(194.88, 85.44, 236.88, 100.8),
   Rect(245.51999999999998, 86.64, 275.52, 100.55999999999999),
   Rect(39.839999999999996, 110.63999999999999, 63.599999999999994, 124.55999999999999),
   Rect(70.8, 110.39999999999999, 108.47999999999999, 124.8),
   Rect(39.12, 135.6, 69.6, 146.64),
   Rect(73.19999999999999, 137.04, 74.39999999999999, 146.88),
   Rect(145.44, 137.04, 146.64, 146.88),
   Rect(150.72, 137.04, 173.51999999999998, 144.72),
   Rect(177.11999999999998, 137.28, 198.48, 144.72),
   Rect(201.83999999999997, 141.11999999999998, 204.48, 142.32),
   Rect(208.55999999999997, 137.04, 294.96, 146.64),
   Rect(38.879999999999995, 183.35999999999999, 119.52, 192.0),
   Rect(38.879999999999995, 197.04, 51.599999999999994, 2

In [15]:
# Column-aware text reconstruction of the sample PDF with a 140-unit column gap.
# NOTE(review): absolute, machine-specific path — replace with a configurable location before sharing.
extract_text_layout(r"C:\Users\pmarq\Downloads\FSD_CH0047533549_SWC_CH_de.pdf", col_gap_pts = 140)

["Swisscanto (CH) Gold ETF\nEA USD\nÜbrige | Ausschüttend | März 2025 - Marketingmaterial\nFondsbeschreibung\nDer an der SIX Swiss Exchange kotierte Fonds investiert aus-\nschliesslich in Gold und ist stets zu 100% mit dem physischen\nEdelmetall hinterlegt. Die Wertgegenstände werden ausschliess-\nlich in der Schweiz gelagert. Es besteht kein Schuldnerrisiko, da\nes sich um ein Sondervermögen gemäss KAG handelt. Der Inves-\ntor hat jederzeit die Möglichkeit, seine Anteile zu veräussern oder\ndie Sachauszahlung in physischem Gold à Standardbarren von ca.\n12.5 kg zu verlangen. Der Anleger kann aufgrund der verschiede-\nnen Währungsklassen die Investition in CHF, USD, EUR oder GBP\ntätigen.\nDie Anteilsklasse wurde per 11.03.2025 von ZKB Gold ETF AA\nUSD auf Swisscanto (CH) Gold ETF EA USD umbenannt.\nVorteile von Edelmetallen\nEdelmetalle bieten einen wirksamen Inflationsschutz und Schutz\nin Krisenzeiten. Aufgrund der geringen Korrelation zu traditionel-\nlen Anlagen wie Obligationen o

In [3]:
# Draw the per-line reading-order rectangles for the sample PDF into the "testss" directory,
# using the same 140-unit column gap as the text-extraction cell.
# NOTE(review): absolute, machine-specific path — replace with a configurable location before sharing.
visualize_text_order_png(r"C:\Users\pmarq\Downloads\FSD_CH0047533549_SWC_CH_de.pdf", "testss", col_gap_pts = 140)

[(0,
  [Rect(38.266998291015625, 81.96099853515625, 276.0469970703125, 105.62100219726562),
   Rect(38.266998291015625, 105.96099853515625, 109.38700866699219, 129.62100219726562),
   Rect(38.266998291015625, 135.04200744628906, 295.5370178222656, 146.87200927734375),
   Rect(38.266998291015625, 181.77000427246094, 120.09500122070312, 192.41700744628906),
   Rect(38.266998291015625, 194.82400512695312, 267.78497314453125, 205.41700744628906),
   Rect(38.266998291015625, 207.82400512695312, 274.8049621582031, 218.41700744628906),
   Rect(38.266998291015625, 220.82400512695312, 283.78704833984375, 231.41700744628906),
   Rect(38.266998291015625, 233.82400512695312, 280.7989807128906, 244.41700744628906),
   Rect(38.266998291015625, 246.82400512695312, 286.3070068359375, 257.4169921875),
   Rect(38.266998291015625, 259.8240051269531, 289.2860107421875, 270.4169921875),
   Rect(38.266998291015625, 272.8240051269531, 289.81695556640625, 283.4169921875),
   Rect(38.266998291015625, 285.82400