In [142]:
import os
import re
import base64, mimetypes
from pathlib import Path
from mistralai import Mistral
from openai import OpenAI

# 1) Load environment 
from dotenv import load_dotenv
load_dotenv()


True

In [143]:
# Build the API clients, failing fast when a key is absent from the environment

def _require_env(var_name: str) -> str:
    """Return the value of environment variable `var_name`; raise if unset or empty."""
    value = os.getenv(var_name)
    if value:
        return value
    raise RuntimeError(f"Missing {var_name} in environment.")


mistral_api_key = _require_env("MISTRAL_API_KEY")
mistral_client = Mistral(api_key=mistral_api_key)

openai_api_key = _require_env("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)

In [150]:
# User-editable inputs and outputs

# Input: rename to the PDF that should be run through OCR.
pdf_path = "../data/YOUR_PDF_NAME.pdf"

# Label used to tag the extracted images (ex. 'bridgeport_s1_milling'); rename it.
img_tag = 'YOUR_IMAGE_LABEL'

# Outputs: markdown goes under output_dir, extracted images under image_folder.
output_dir = Path("../data/ocr_md")
md_out = output_dir / "YOUR_FINAL_MARKDOWN.md"  # rename to the final markdown file name
image_folder = Path("../data/ocr_md/Images")

# parents=True also creates output_dir, since it is an ancestor of image_folder.
image_folder.mkdir(parents=True, exist_ok=True)

In [151]:
# Encode the PDF as a Base64 data URL, then run Mistral OCR on it

def encode_pdf_to_data_url(p: str) -> str:
    """Read the file at `p` and return its bytes as a Base64 `data:application/pdf` URL."""
    with open(p, "rb") as pdf_file:
        raw = pdf_file.read()
    encoded = base64.b64encode(raw).decode("utf-8")
    return "data:application/pdf;base64," + encoded

pdf_data_url = encode_pdf_to_data_url(pdf_path)

# The PDF travels inline: the data URL is passed in the "document_url" field.
ocr_document = {"type": "document_url", "document_url": pdf_data_url}

ocr_response = mistral_client.ocr.process(
    model="mistral-ocr-latest",
    document=ocr_document,
    # Later cells read embedded image payloads off each page's images.
    include_image_base64=True,
)

In [152]:
#Helpers for saving images & rewriting markdown

def safe_write_bytes(path: Path, data: bytes):
    """Write `data` to `path`, creating any missing parent directories first.

    An existing file at `path` is overwritten.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    path.write_bytes(data)

def b64_to_bytes(b64: str) -> bytes:
    """Decode a Base64 string, or the Base64 payload of a data URI, into bytes.

    Args:
        b64: Either bare Base64 text or a data URI such as
            ``data:image/png;base64,<payload>``.

    Returns:
        The decoded bytes.

    Raises:
        binascii.Error: If the payload is not valid Base64 (from ``base64.b64decode``).
    """
    # Strip the "data:<mime>;base64," header of ANY data URI, not only image ones;
    # the header is never part of the payload and would corrupt the decode.
    if b64.startswith("data:"):
        _header, comma, payload = b64.partition(",")
        if comma:  # no comma -> leave the input untouched
            b64 = payload
    return base64.b64decode(b64)

In [153]:
# Save explicit page images (if present)

def save_page_images(pages, img_tag: str) -> int:
    """Write every image attached to the OCR pages into the module-level `image_folder`.

    Files are named ``{tag}_img_{n}.png`` where ``tag`` is `img_tag` lower-cased
    with spaces replaced by underscores and ``n`` counts saved images from 0
    across all pages. The ``.png`` extension is always used, whatever mime type
    the image reports.

    Args:
        pages: Iterable of page objects; each may carry an ``images`` list.
            ``None`` is treated as no pages.
        img_tag: Label used as the file-name prefix.

    Returns:
        The number of image files written.
    """
    tag = img_tag.lower().replace(" ", "_")
    count = 0

    for page_no, page in enumerate(pages or []):
        imgs = getattr(page, "images", None)
        if not isinstance(imgs, list):
            continue

        for img in imgs:
            # The Base64 payload may live under any of these attribute names.
            payload = (
                getattr(img, "image_base64", None)
                or getattr(img, "base64", None)
                or getattr(img, "content", None)
            )
            if not payload:
                # Fall back to an inline data URI, if the image carries one.
                url = getattr(img, "image_url", None)
                if isinstance(url, str) and url.startswith("data:image/"):
                    payload = url

            if not payload:
                # Skipped images do not consume an index, so say so explicitly:
                # later numbering shifts relative to the order of images on the page.
                print(f"[Image skipped on page {page_no}] no base64 payload or data URI")
                continue

            name = f"{tag}_img_{count}.png"
            safe_write_bytes(image_folder / name, b64_to_bytes(payload))
            count += 1

    return count


In [154]:
#Rewriting links in markdown file

def rewrite_rel_links(
    md: str,
    image_folder,
    img_tag: str,
    openai_client=None,
    start_index: int = 0,
    start_figure: int = 1,
    caption_images: bool = False,
):
    """Point relative markdown image links at sequentially named files in `image_folder`.

    Every relative image link (``![alt](file.ext)``; http(s) and data URIs are
    left alone) is replaced, in order of appearance, by
    ``![{name}]({image_folder}/{name})`` with ``name = {tag}_img_{n}.png``.
    ``tag`` is `img_tag` lower-cased with spaces replaced by underscores, the
    same normalisation `save_page_images` applies when writing the files, so
    the links and the files agree.

    Args:
        md: Markdown text to rewrite.
        image_folder: Folder (str or Path) used as the link prefix and, when
            captioning, as the location of the image files.
        img_tag: Label used as the file-name prefix.
        openai_client: Client exposing ``responses.create``; required for captions.
        start_index: Index given to the first rewritten image.
        start_figure: Number given to the first generated caption.
        caption_images: When true (and a client is given), append
            ``*Figure N. <caption>*`` on the line after each rewritten link.

    Returns:
        ``(new_md, next_index, next_figure)`` so numbering can continue on the
        next call.
    """
    REL_IMAGE_LINK_RE = re.compile(
        r'''(!\[)([^\]]*)(\]\()         # 1: "![", 2: alt, 3: "]("
            (?!(?:https?:|data:))       # not URLs/data URIs
            (?:\.{0,2}/)*               # optional ./ or ../
            ([^\s)]+\.(?:png|jpg|jpeg|webp|gif|tif|tiff|bmp|svg))  # 4: filename
            (\))                        # 5: ")"
        ''',
        flags=re.IGNORECASE | re.VERBOSE
    )

    def _image_to_data_url(p: Path) -> str:
        """Return the file at `p` as a Base64 data URL (mime guessed from the name)."""
        mime = mimetypes.guess_type(str(p))[0] or "image/png"
        with open(p, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        return f"data:{mime};base64,{b64}"

    idx = start_index
    fig = start_figure
    # Must match the file names produced by save_page_images (lower-case, no spaces).
    tag = img_tag.lower().replace(" ", "_")
    folder = Path(image_folder)
    # Markdown links need forward slashes, also on Windows.
    link_prefix = folder.as_posix()

    def _repl(m):
        nonlocal idx, fig
        pre, _old_alt, mid, _old_target, suf = m.groups()
        new_name = f"{tag}_img_{idx}.png"
        idx += 1

        # Rewrite alt text + link
        out = f"{pre}{new_name}{mid}{link_prefix}/{new_name}{suf}"

        # Optional: dense caption with the actual image content
        if caption_images and openai_client:
            try:
                data_url = _image_to_data_url(folder / new_name)
                prompt = (
                    "Write a concise, factual, technical caption describing this image from a manufacturing perspective. "
                    "Focus on visible components, labels, layout, and context. Avoid speculation and extra detail."
                )
                resp = openai_client.responses.create(
                    model="gpt-4o-mini",
                    input=[{
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                            {"type": "input_image", "image_url": data_url}
                        ],
                    }]
                )
                # Collapse whitespace: a line break inside *...* would break the emphasis.
                caption = " ".join((resp.output_text or "").split())
                if caption:
                    out += f"\n*Figure {fig}. {caption}*"
                    fig += 1
            except Exception as e:
                # Captioning is best-effort: a missing file or API error must not
                # abort the rewrite of the remaining links.
                print(f"[Caption skipped for {new_name}] {e}")

        return out

    new_md = REL_IMAGE_LINK_RE.sub(_repl, md)
    return new_md, idx, fig


In [None]:
# Save everything: write the page images, rewrite each page's links, join into one markdown file

pages = getattr(ocr_response, "pages", []) or []
images_saved = save_page_images(pages, img_tag)

md_parts = []
img_idx = 0        # running image index, carried across pages
fig_idx = 1        # running figure number, carried across pages

for p in pages:
    md = getattr(p, "markdown", "") or ""
    if not md:
        continue
    md, img_idx, fig_idx = rewrite_rel_links(
        md, image_folder, img_tag,
        openai_client=openai_client,      # or None to skip captions
        start_index=img_idx,
        start_figure=fig_idx,
        caption_images=True               # set False to skip captioning
    )
    md_parts.append(md)

# Links and files are numbered independently (both in order of appearance);
# different totals mean some links may point at the wrong or a missing file.
if img_idx != images_saved:
    print(
        f"WARNING: {img_idx} image link(s) rewritten but {images_saved} image file(s) saved; "
        "link targets may not match the saved files."
    )

output_dir.mkdir(parents=True, exist_ok=True)
# Pages are separated by a horizontal rule in the combined document.
md_out.write_text("\n\n---\n\n".join(md_parts), encoding="utf-8")

print(f"OCR text saved to {md_out}")
print(f"Images saved to {image_folder} (count: {images_saved})")

OCR text saved to ../data/ocr_md/Bridgeport Milling OCR w captions.md
Images saved to ../data/ocr_md/Images (count: 82)
