### Imports

In [None]:
import argparse, io, re, sys, tempfile, os, subprocess
from pathlib import Path
from typing import List, Tuple

import fitz  # PyMuPDF
import pdfplumber
from PIL import Image
import pytesseract

### Constants

In [None]:
# ---------- utilities ----------
# Characters accepted as an unordered-list bullet at the start of a line.
BULLET_CHARS = "•·●◦▪–-—*"
# Matches a leading list marker: one bullet character, or a number followed
# by "." or ")", then at least one whitespace character. Group 1 is the marker.
# BULLET_CHARS must be escaped: inserted raw into the character class, the
# sequence "–-—" is read as a range (en dash..em dash), so the plain ASCII
# hyphen would never match and "- item" lines would not count as list items.
LIST_RE = re.compile(rf"^\s*([{re.escape(BULLET_CHARS)}]|\d+[\.)])\s+")
# Matches a whole line (4+ chars) made only of ASCII capitals, digits, spaces
# and common punctuation; used as an "all caps heading" test.
ALLCAPS_RE = re.compile(r"^[A-Z0-9 &().,:;'/\-]{4,}$")

def is_pdf(path: Path) -> bool:
    """Return True when *path* ends in a ``.pdf`` extension, ignoring case."""
    extension = path.suffix
    return ".pdf" == extension.lower()

### Helper methods

In [None]:
def load_pdf_pages_as_images(pdf_path: Path, dpi: int = 300) -> List[Image.Image]:
    """
    Rasterize every page of a PDF into an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        dpi: Target rendering resolution; converted to a zoom factor of
            ``dpi / 72`` on both axes.

    Returns:
        One ``PIL.Image.Image`` (mode "RGB") per page, in document order.
    """
    # The zoom matrix is the same for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    imgs = []
    # Use the document as a context manager so the file handle is released
    # even if rendering a page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Round-trip through PNG bytes; convert() forces the decode, so the
            # image no longer depends on the pixmap or the open document.
            imgs.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
    return imgs

In [None]:
def try_extract_pdf_text(pdf_path: Path) -> str:
    """
    Read the embedded text layer of a PDF with pdfplumber.

    Each page's text is stripped, pages are joined with a blank line, and the
    final result is stripped again. Pages for which ``extract_text`` returns
    nothing contribute an empty string, so a PDF without a text layer yields
    "" and the caller can fall back to OCR.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [
            (page.extract_text(x_tolerance=1.5, y_tolerance=2.0) or "").strip()
            for page in pdf.pages
        ]
    return "\n\n".join(page_texts).strip()

In [None]:
def ocr_image_to_lines(img: Image.Image, lang="eng") -> List[Tuple[str, int]]:
    """
    Run Tesseract on an image and return its text line by line.

    Args:
        img: Image to OCR.
        lang: Tesseract language code passed through to pytesseract.

    Returns:
        List of ``(line_text, avg_conf)`` tuples built from Tesseract's TSV
        output, where ``avg_conf`` is the mean word confidence of the line
        truncated to an int. Lines without any non-blank word are omitted.
    """
    tsv = pytesseract.image_to_data(
        img,
        lang=lang,
        config="--oem 3 --psm 3",
        output_type=pytesseract.Output.DATAFRAME,
    )
    lines: List[Tuple[str, int]] = []
    if tsv is None or tsv.empty:
        return lines

    # Rows with conf == -1 are structural (page/block/paragraph/line) entries,
    # not recognised words.
    words = tsv[tsv.conf != -1]

    # line_num restarts inside every paragraph, so par_num must be part of the
    # key; otherwise same-numbered lines from different paragraphs of a block
    # are merged into one line.
    line_keys = ["page_num", "block_num", "par_num", "line_num"]
    for _, df_line in words.groupby(line_keys):
        # The text column is parsed by pandas and may hold non-str values
        # (e.g. a numeric dtype); coerce to str so the join cannot raise.
        tokens = [str(w).strip() for w in df_line.text.dropna()]
        line_text = " ".join(t for t in tokens if t)
        if not line_text:
            continue
        avg_conf = int(df_line.conf.astype(float).mean())
        lines.append((line_text, avg_conf))
    return lines

### Heuristics

In [None]:
def heuristics_to_markdown(lines: List[str]) -> str:
    """
    Convert plain text lines into rough Markdown.

    Each line is stripped and then classified:
    - Blank line: kept as an empty line.
    - List item (matches LIST_RE): bullet markers are replaced by "- ";
      numbered markers ("1.", "2)") are left untouched.
    - Heading: a line that follows a blank line (or starts the input), is at
      most 60 characters long, and is either ALLCAPS or title-cased; emitted
      as "# <line>".
    - Anything else: emitted unchanged as paragraph text.

    Runs of three or more newlines in the result are collapsed to two and the
    final string is stripped.
    """

    def _normalize_marker(match):
        # Numbered markers keep their original text; bullets become "- ".
        marker = match.group(1)
        return match.group(0) if marker[0].isdigit() else "- "

    rendered = []
    after_blank = True  # the start of input counts as "after a blank line"
    for raw_line in lines:
        text = raw_line.strip()

        if text == "":
            rendered.append("")
            after_blank = True
            continue

        if LIST_RE.match(text):
            rendered.append(LIST_RE.sub(_normalize_marker, text))
        elif after_blank and len(text) <= 60 and (ALLCAPS_RE.match(text) or text.istitle()):
            rendered.append(f"# {text}")
        else:
            rendered.append(text)
        after_blank = False

    joined = "\n".join(rendered)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()

In [None]:
def llm_polish_markdown(md_text: str) -> str:
    """
    Hook for an optional LLM clean-up pass over the generated Markdown.

    Intended use: send md_text to a provider of your choice with instructions
    such as 'Clean minor OCR errors, fix broken lists/headings, keep content
    faithful, return Markdown only.'

    Not implemented yet: the input is returned unchanged.
    """
    polished = md_text  # no provider wired in; pass through untouched
    return polished

In [None]:
def translate(text: str, target_lang: str) -> str:
    """
    Hook for translating the final Markdown into target_lang.

    Deliberately a no-op for reproducibility: text is returned unchanged and
    target_lang is ignored until an offline translator (e.g. argos-translate)
    or a trusted API is plugged in.
    """
    translated = text  # pass-through placeholder
    return translated

### Main processing function

In [None]:
def process_image(img: Image.Image, ocr_lang: str) -> str:
    """
    OCR a single image and format the recognised lines as rough Markdown.

    The per-line confidence values returned by ocr_image_to_lines are
    discarded; only the text is passed on to heuristics_to_markdown.
    """
    ocr_lines = ocr_image_to_lines(img, lang=ocr_lang)
    return heuristics_to_markdown([text for text, _conf in ocr_lines])

def process_pdf(path: Path, ocr_lang: str) -> str:
    """
    Convert a PDF to rough Markdown, preferring its embedded text layer.

    If the text layer holds more than 20 whitespace-separated words, it is
    formatted directly with heuristics_to_markdown. Otherwise every page is
    rasterized and OCR'd, and the per-page Markdown is joined with "---"
    separators, each page prefixed by an HTML comment carrying its number.
    """
    embedded_text = try_extract_pdf_text(path)
    has_text_layer = bool(embedded_text) and len(embedded_text.split()) > 20
    if has_text_layer:
        # A usable text layer exists, so skip OCR and apply light formatting.
        return heuristics_to_markdown(
            [line.rstrip() for line in embedded_text.splitlines()]
        )

    # No usable text layer: rasterize and OCR each page. The page marker is
    # there to help a later LLM polish step.
    sections = [
        f"<!-- Page {page_no} -->\n{process_image(page_img, ocr_lang)}"
        for page_no, page_img in enumerate(load_pdf_pages_as_images(path), start=1)
    ]
    return "\n\n---\n\n".join(sections)

In [None]:
# ap = argparse.ArgumentParser(description="Convert image/PDF to Markdown.")
# ap.add_argument("input", type=str, help="Path to image or PDF")
# ap.add_argument("--lang", default="eng", help="Tesseract OCR language code (default: eng)")
# ap.add_argument("--polish", action="store_true", help="Send through LLM to clean formatting")
# ap.add_argument("--translate", type=str, default=None, metavar="LANG",
#                 help="Translate final Markdown to target language code (e.g., 'de', 'fr').")
# args = ap.parse_args()

# Driver: convert one fixed PDF using German OCR and print the Markdown.
path = Path("input", "Lettre.pdf")
if not path.exists():
    # Report the missing input on stderr and stop with a non-zero exit status.
    sys.stderr.write(f"File not found: {path}\n")
    sys.exit(1)

md = process_pdf(path, "deu")

# Optional post-processing steps, disabled while the CLI arguments are unused.
# if args.polish:
#     md = llm_polish_markdown(md)

# if args.translate:
#     md = translate(md, args.translate)

print(md)