
# DeepSeek-OCR: PPT/PDF → Markdown with ASCII diagrams

End-to-end: **PPT/PPTX/PDF → images → DeepSeek-OCR → `slides.md`**  
Plus **ASCII fallbacks** so figures are always visible inside Markdown (no Mermaid).



## Quick start
1) Set `INPUT_FILE` to your deck (`.ppt`, `.pptx`, or `.pdf`).  
2) Run the cells in order: **Config** → **Setup** → **Utilities** → **Load model** → **Prompt** → **Convert & OCR**.  
3) Grab `output/slides.md` and any cropped images in `output/figures/`.


In [None]:

# === Config ===
# Deck to convert. .ppt/.pptx inputs are converted to PDF first; a .pdf is used directly.
INPUT_FILE = "/content/input.pdf"   # .ppt/.pptx/.pdf
# Root folder for all outputs: slide images, figure crops and the final slides.md.
OUTPUT_DIR = "output"

# Use the model only for OCR/text; we still ask it to try ASCII diagrams, but
# literal pixel→ASCII fallbacks below guarantee diagrams show up.
RECONSTRUCT_ASCII = True             # Adds the "reconstruct diagrams as plain ASCII" rule to the prompt
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"  # Identifier passed to from_pretrained()

# Precision / memory
# USE_BF16_FLASH_ATTN is only consulted when USE_4BIT is False:
#   True  -> flash_attention_2 + bfloat16, False -> eager attention + float16.
USE_BF16_FLASH_ATTN = True           # Good perf on recent NVIDIA GPUs
USE_4BIT = False                     # Set True if you need low VRAM (bitsandbytes); loads with load_in_4bit + device_map="auto"

# ASCII fallbacks (literal pixel→ASCII) — accurate, always visible in MD
ASCII_FOR_FIGURES = True             # ASCII for each model-cropped figure
ASCII_FOR_FULL_SLIDE = False         # Also ASCII for whole slide image
ASCII_FIGURE_WIDTH = 100             # Output width in characters; 100–160 = more detail
ASCII_SLIDE_WIDTH  = 140             # Output width in characters for the full-slide rendering
# Brightness ramp: the first character is used for the darkest pixels (value 0) and the
# last one for the brightest (value 255). With this default a white slide background is
# drawn as '@' and black ink as spaces; reverse the string to get the opposite look.
ASCII_CHARSET = " .:-=+*#%@"         # dark→light ramp (pure ASCII)
ASCII_Y_ASPECT = 0.5                 # Row-count multiplier; < 1 because characters are taller than wide


In [None]:

# === Setup: system & Python deps ===
# Notes:
# - DeepSeek-OCR expects an NVIDIA GPU with CUDA + Flash-Attention 2.
# - We install pdf2image (needs Poppler). On Linux (Colab), we apt-get poppler.
# - For PPT/PPTX, we try headless LibreOffice to convert to PDF.

import shutil, subprocess, sys

def _run(cmd):
    """Echo *cmd* and execute it, reducing any failure to a printed warning.

    Best-effort helper for the optional system setup steps: a missing binary
    or a non-zero exit status is reported as ``[warn] ...`` and never raised.

    Args:
        cmd: Command as a list of strings (program followed by its arguments).
    """
    command_line = " ".join(cmd)
    print(">", command_line)
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(cmd, check=True)
    except Exception as err:
        print("[warn]", err)

# Linux (Colab-like) helpers
if shutil.which("apt-get"):
    _run(["apt-get", "update", "-y"])   # no sudo inside many notebooks
    _run(["apt-get", "install", "-y", "poppler-utils", "libreoffice"])  # for pdf2image + PPT→PDF

# Python deps
!pip -q install --upgrade pip
!pip -q install torch torchvision --index-url https://download.pytorch.org/whl/cu121 || true
!pip -q install transformers==4.46.3 tokenizers==0.20.3 einops addict easydict pillow tqdm pdf2image python-pptx
!pip -q install flash-attn==2.7.3 --no-build-isolation || true
!pip -q install bitsandbytes==0.43.3 || true


In [None]:

# === Utilities: PPT/PPTX → PDF, PDF → images, image → ASCII ===
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
import shutil, subprocess

def ensure_dir(p: Path) -> Path:
    """Create directory *p* (including missing parents) if needed and return *p*.

    An already existing directory is left untouched, so the call is safe to
    repeat. The same ``Path`` object is returned to allow inline use.
    """
    p.mkdir(exist_ok=True, parents=True)
    return p

def ppt_to_pdf(ppt_path: Path, out_dir: Path) -> Path:
    """Convert a PPT/PPTX deck to PDF using headless LibreOffice.

    Args:
        ppt_path: Presentation file to convert.
        out_dir: Directory that receives the PDF (created if missing).

    Returns:
        Path of the generated PDF, expected at ``out_dir/<ppt stem>.pdf``
        (an upper-case ``.PDF`` extension is accepted as well).

    Raises:
        RuntimeError: Neither ``soffice`` nor ``libreoffice`` is on PATH.
        subprocess.CalledProcessError: The converter exited with an error.
        FileNotFoundError: The converter ran but no PDF was found.
    """
    ensure_dir(out_dir)

    office_bin = shutil.which("soffice") or shutil.which("libreoffice")
    if not office_bin:
        raise RuntimeError("LibreOffice not found; supply a PDF directly or install LibreOffice.")

    command = [
        office_bin,
        "--headless",
        "--convert-to", "pdf",
        "--outdir", str(out_dir),
        str(ppt_path),
    ]
    print("Running:", " ".join(command))
    subprocess.run(command, check=True)

    # Look for the output next to the requested name, lower-case extension first.
    for extension in (".pdf", ".PDF"):
        candidate = out_dir / (ppt_path.stem + extension)
        if candidate.exists():
            return candidate
    raise FileNotFoundError("PPT/PPTX → PDF failed.")

def pdf_to_images(pdf_path: Path, dpi: int = 300, img_dir: Path = Path("slides")) -> list[Path]:
    """Render each page of *pdf_path* to a PNG file inside *img_dir*.

    Args:
        pdf_path: PDF to rasterize.
        dpi: Rendering resolution handed to ``convert_from_path``.
        img_dir: Destination directory (created if missing).

    Returns:
        The written files in page order, named ``slide_001.png``,
        ``slide_002.png``, ... (1-based, zero-padded to three digits).
    """
    ensure_dir(img_dir)
    written = []
    for number, page_image in enumerate(convert_from_path(str(pdf_path), dpi=dpi), start=1):
        target = img_dir / f"slide_{number:03d}.png"
        page_image.save(target, "PNG")
        written.append(target)
    return written

def image_to_ascii(img_path, width=120, charset=" .:-=+*#%@", y_aspect=0.5):
    """Render an image file as ASCII art by mapping pixel brightness to characters.

    The image is converted to 8-bit grayscale, resized to ``width`` columns
    and each pixel value ``v`` (0-255) is replaced by
    ``charset[int(v * (len(charset) - 1) / 255)]``. Consequently the first
    character of *charset* stands for the darkest pixels and the last one
    for the brightest.

    Args:
        img_path: Path (or file object) accepted by ``PIL.Image.open``.
        width: Number of output columns; values below 1 are clamped to 1.
        charset: Brightness ramp, one character per level. Must not be empty.
        y_aspect: Factor applied to the row count to compensate for
            characters being taller than wide.

    Returns:
        The art as a single string with rows separated by ``"\\n"``.

    Raises:
        ValueError: If *charset* is empty.
    """
    if not charset:
        raise ValueError("charset must contain at least one character")

    cols = max(1, int(width))

    # Open through a context manager so the file handle is released right
    # away; convert() returns an independent in-memory image.
    with Image.open(img_path) as source:
        gray = source.convert("L")

    rows = max(1, int(gray.height * (cols / gray.width) * y_aspect))
    pixels = np.asarray(gray.resize((cols, rows)), dtype=np.float64)

    # Vectorized lookup: scale 0..255 to 0..len(charset)-1 (truncating, like
    # int()) and index the ramp with the whole array at once.
    scale = (len(charset) - 1) / 255.0
    levels = (pixels * scale).astype(np.intp)
    ramp = np.array(list(charset))
    return "\n".join("".join(line.tolist()) for line in ramp[levels])


In [None]:

# === Load DeepSeek-OCR ===
import torch
from transformers import AutoTokenizer, AutoModel

# DeepSeek-OCR is run on a CUDA GPU in this notebook, so check for one up
# front instead of after the weights have already been loaded.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA GPU required for DeepSeek-OCR.")

# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository; only point MODEL_NAME at sources you trust.
# The tokenizer does not depend on the precision mode, so it is loaded once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

if USE_4BIT:
    # Low-VRAM path: 4-bit weights, with device placement left to device_map="auto".
    model = AutoModel.from_pretrained(
        MODEL_NAME, trust_remote_code=True, use_safetensors=True,
        load_in_4bit=True, device_map="auto"
    )
else:
    # Full-precision load, then move to the GPU in half precision:
    # bfloat16 with Flash-Attention 2, otherwise float16 with eager attention.
    model = AutoModel.from_pretrained(
        MODEL_NAME, trust_remote_code=True, use_safetensors=True,
        _attn_implementation=("flash_attention_2" if USE_BF16_FLASH_ATTN else "eager")
    )
    model = model.eval().cuda().to(torch.bfloat16 if USE_BF16_FLASH_ATTN else torch.float16)


In [None]:

# === Prompt (asks model to write clean Markdown + ASCII diagrams) ===
# Prompt template sent with every slide image. `{n}` is a literal placeholder
# that is substituted with the slide number via str.replace before inference.
BASE_PROMPT = "".join([
    "<image>\n",
    "<|grounding|>",
    "Convert this slide into clean Markdown.\n",
    "- Keep heading hierarchy and bullet/numbered lists\n",
    "- Convert tables to Markdown tables\n",
    # The diagram-reconstruction rule is included only when RECONSTRUCT_ASCII is truthy.
    *(
        [
            "- Reconstruct ALL diagrams as plain ASCII (no Unicode). Use + - | / \\ ( ) for shapes, 'o' or '*' for points, and '->' for arrows.\n",
            "  Put diagrams inside fenced code blocks like ```text ... ```.\n",
        ]
        if RECONSTRUCT_ASCII
        else []
    ),
    "- Embed any extracted figure crops using Markdown image syntax\n",
    "- Include the slide number as a level-2 heading like `## Slide {n}`\n",
    "- Output only valid Markdown",
])


In [None]:

# === Convert & OCR ===
from pathlib import Path
import re, shutil

# Matches a level-2 "## Slide <number>" heading anywhere in a slide's Markdown.
_SLIDE_HEADING_RE = re.compile(r"^##\s*Slide\s+\d+", re.IGNORECASE | re.MULTILINE)


def _crop_sort_key(path):
    """Sort key that orders numeric file stems by value (2.jpg before 10.jpg)."""
    stem = path.stem
    return (0, int(stem), "") if stem.isdecimal() else (1, 0, stem)


def _finalize_slide_chunk(chunk, slide_no):
    """Return *chunk* preceded by its TOC anchor, adding a heading if it has none.

    The anchor id comes from the slide's position in the deck rather than from
    whatever heading the model produced, so every TOC link has a target and the
    model's heading text is never rewritten.
    """
    parts = [f"<a id='slide-{slide_no}'></a>"]
    if not _SLIDE_HEADING_RE.search(chunk):
        parts.append(f"## Slide {slide_no}")
    parts.append(chunk)
    return "\n\n".join(parts)


in_path = Path(INPUT_FILE).expanduser().resolve()
if not in_path.is_file():
    # Fail early with a clear message instead of deep inside the converters.
    raise FileNotFoundError(f"Input file not found: {in_path}")

root_out = Path(OUTPUT_DIR)
slides_dir = root_out / "slides"
fig_out_dir = root_out / "figures"
ensure_dir(root_out)
ensure_dir(slides_dir)
ensure_dir(fig_out_dir)

# 1) Normalize to PDF
suffix = in_path.suffix.lower()
if suffix in {".ppt", ".pptx"}:
    pdf_path = ppt_to_pdf(in_path, root_out)
elif suffix == ".pdf":
    pdf_path = in_path
else:
    raise ValueError("Please provide a .ppt/.pptx/.pdf")

# 2) PDF → slide images (300 dpi for diagram fidelity)
images = pdf_to_images(pdf_path, dpi=300, img_dir=slides_dir)

# 3) OCR each slide
markdown_chunks = []
for idx, img in enumerate(images, 1):
    slide_out = slides_dir / f"slide_{idx:03d}"
    local_img_dir = ensure_dir(slide_out / "images")
    result_file = slide_out / "result.mmd"

    # Remove leftovers from an earlier run (possibly of a different deck) so
    # that only the files written by this inference are picked up below.
    result_file.unlink(missing_ok=True)
    for stale in local_img_dir.glob("*.jpg"):
        stale.unlink()

    prompt = BASE_PROMPT.replace("{n}", str(idx))

    print(f"[Slide {idx}] OCR →", img)
    # The return value is not used below; results are read back from slide_out.
    res = model.infer(
        tokenizer,
        prompt=prompt,
        image_file=str(img),
        output_path=str(slide_out),
        base_size=1024,            # DeepSeek-OCR "Gundam" preset
        image_size=640,
        crop_mode=True,
        test_compress=True,
        save_results=True
    )

    # No per-slide markdown was produced: emit a placeholder and move on.
    if not result_file.exists():
        markdown_chunks.append(f"## Slide {idx}\n\n*(No text recognized)*")
        continue

    md = result_file.read_text(encoding="utf-8")

    # Copy cropped figures to the global folder and rewrite their links so
    # they resolve relative to slides.md.
    crops = sorted(local_img_dir.glob("*.jpg"), key=_crop_sort_key)
    for crop in crops:
        dst = fig_out_dir / f"slide_{idx:03d}_{crop.name}"
        shutil.copy2(crop, dst)
        md = md.replace(f"images/{crop.name}", f"figures/{dst.name}")

    # ASCII fallback for each crop (literal pixel→ASCII)
    if ASCII_FOR_FIGURES:
        ascii_blocks = []
        for crop in crops:
            try:
                art = image_to_ascii(crop, width=ASCII_FIGURE_WIDTH, charset=ASCII_CHARSET, y_aspect=ASCII_Y_ASPECT)
                ascii_blocks.append(f"**ASCII fallback for figure `{crop.name}`:**\n\n```text\n{art}\n```")
            except Exception as e:
                # Best effort: one unreadable crop must not abort the whole deck.
                ascii_blocks.append(f"_ASCII conversion failed for {crop.name}: {e}_")
        if ascii_blocks:
            md += "\n\n" + "\n\n".join(ascii_blocks)

    # Optional ASCII of the full slide
    if ASCII_FOR_FULL_SLIDE:
        try:
            full = image_to_ascii(img, width=ASCII_SLIDE_WIDTH, charset=ASCII_CHARSET, y_aspect=ASCII_Y_ASPECT)
            md += f"\n\n**ASCII fallback (full slide):**\n\n```text\n{full}\n```"
        except Exception as e:
            md += f"\n\n_ASCII conversion failed for full slide: {e}_"

    markdown_chunks.append(md.strip())

# 4) Assemble master Markdown + anchors
toc = ["# Deck OCR", "## Table of Contents"]
toc.extend(f"- [Slide {i}](#slide-{i})" for i in range(1, len(markdown_chunks) + 1))
toc_md = "\n".join(toc)

# Every slide gets an explicit <a id='slide-N'> target matching its TOC entry.
body = [_finalize_slide_chunk(chunk, i) for i, chunk in enumerate(markdown_chunks, 1)]

master = toc_md + "\n\n" + "\n\n---\n\n".join(body) + "\n"
out_md = root_out / "slides.md"
out_md.write_text(master, encoding="utf-8")
print("✅ Wrote:", out_md.resolve())
print("Figures dir:", fig_out_dir.resolve())
