<a href="https://colab.research.google.com/github/royam0820/LLM_OCR/blob/main/amr_2_mistral_ocr_pdfs_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Make sure you have the dependencies installed
!pip install -q mistralai

In [2]:
import os
from google.colab import userdata
from mistralai import Mistral

# Fetch the API key securely
api_key = userdata.get('MISTRAL_API_KEY')

# Set it in the environment without exposing it
if api_key:
    os.environ["MISTRAL_API_KEY"] = api_key
else:
    raise ValueError("MISTRAL_API_KEY not found. Make sure it is set in Colab.")

# Initialize Mistral client
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
print("Mistral API Key loaded successfully.")


Mistral API Key loaded successfully.


In [3]:
import json
import base64
import shutil
from pathlib import Path
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse

In [4]:
# Path configuration
INPUT_DIR = Path("pdfs_to_process")   # Folder where the user places the PDFs to be processed
DONE_DIR = Path("pdfs-done")            # Folder where processed PDFs will be moved
OUTPUT_ROOT_DIR = Path("ocr_output")    # Root folder for conversion results

# Ensure directories exist
INPUT_DIR.mkdir(exist_ok=True)
DONE_DIR.mkdir(exist_ok=True)
OUTPUT_ROOT_DIR.mkdir(exist_ok=True)

In [5]:
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    This converts base64 encoded images directly in the markdown...
    And replaces them with links to external images, so the markdown is more readable and organized.
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})")
    return markdown_str

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Part of the response from the Mistral API, which is an OCRResponse object...
    And returns a single string with the combined markdown of all the pages of the PDF.
    """
    markdowns: list[str] = []
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)

In [6]:
def process_pdf(pdf_path: Path):
    # Process all PDFs in INPUT_DIR
    # - Important to be careful with the number of PDFs, as the Mistral API has a usage limit
    #   and it could cause errors by exceeding the limit.

    # PDF base name
    pdf_base = pdf_path.stem
    print(f"Processing {pdf_path.name} ...")

    # Output folders
    output_dir = OUTPUT_ROOT_DIR / pdf_base
    output_dir.mkdir(exist_ok=True)
    images_dir = output_dir / "images"
    images_dir.mkdir(exist_ok=True)

    # PDF -> OCR
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_path.name,
            "content": pdf_bytes,
        },
        purpose="ocr"
    )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    ocr_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True
    )

    # Save OCR in JSON
    # (in case something fails it could be reused, but it is not used in the rest of the code)
    ocr_json_path = output_dir / "ocr_response.json"
    with open(ocr_json_path, "w", encoding="utf-8") as json_file:
        json.dump(ocr_response.dict(), json_file, indent=4, ensure_ascii=False)
    print(f"OCR response saved in {ocr_json_path}")

    # OCR -> Markdown prepared for Obsidian
    # - That is, from base64 encoded images, it converts them to links to
    #   external images and generates the images as such, in a subfolder.

    global_counter = 1
    updated_markdown_pages = []

    for page in ocr_response.pages:
        updated_markdown = page.markdown
        for image_obj in page.images:

            # base64 to image
            base64_str = image_obj.image_base64
            if base64_str.startswith("data:"):
                base64_str = base64_str.split(",", 1)[1]
            image_bytes = base64.b64decode(base64_str)

            # image extensions
            ext = Path(image_obj.id).suffix if Path(image_obj.id).suffix else ".png"
            new_image_name = f"{pdf_base}_img_{global_counter}{ext}"
            global_counter += 1

            # save in subfolder
            image_output_path = images_dir / new_image_name
            with open(image_output_path, "wb") as f:
                f.write(image_bytes)

            # Update markdown with wikilink: ![[nombre_imagen]]
            updated_markdown = updated_markdown.replace(
                f"![{image_obj.id}]({image_obj.id})",
                f"![[{new_image_name}]]"
            )
        updated_markdown_pages.append(updated_markdown)

    final_markdown = "\n\n".join(updated_markdown_pages)
    output_markdown_path = output_dir / "output.md"
    with open(output_markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write(final_markdown)
    print(f"Markdown generated in {output_markdown_path}")

In [7]:
# Process all PDFs in INPUT_DIR
# - Important to be careful with the number of PDFs, as the Mistral API has a usage limit
#   and it could cause errors by exceeding the limit.

pdf_files = list(INPUT_DIR.glob("*.pdf"))      # Get all PDFs in pdfs_to_process. So make sure to place the PDFs there.
if not pdf_files:
    print("No PDFs to process.")
    exit()

for pdf_file in pdf_files:
    try:
        process_pdf(pdf_file)
        shutil.move(str(pdf_file), DONE_DIR / pdf_file.name)
        print(f"{pdf_file.name} moved to {DONE_DIR}")
    except Exception as e:
        print(f"Error processing {pdf_file.name}: {e}")


Processing 2101.03961v3.pdf ...


<ipython-input-6-2a372ab4ec39>:40: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump(ocr_response.dict(), json_file, indent=4, ensure_ascii=False)


OCR response saved in ocr_output/2101.03961v3/ocr_response.json
Markdown generated in ocr_output/2101.03961v3/output.md
2101.03961v3.pdf moved to pdfs-done
Processing 2402.03300v3.pdf ...
OCR response saved in ocr_output/2402.03300v3/ocr_response.json
Markdown generated in ocr_output/2402.03300v3/output.md
2402.03300v3.pdf moved to pdfs-done
Processing 1804.07821v1.pdf ...
OCR response saved in ocr_output/1804.07821v1/ocr_response.json
Markdown generated in ocr_output/1804.07821v1/output.md
1804.07821v1.pdf moved to pdfs-done


In [9]:
import shutil
import os
from google.colab import files

# Define the folder to zip
folder_to_zip = '/content/ocr_output'

# Define the output zip file name
output_filename = 'ocr_output.zip'

# Create the zip file, specifying the full output path
shutil.make_archive(os.path.join(os.getcwd(), output_filename[:-4]), 'zip', folder_to_zip)
# The above change ensures the archive is created with the specified name in the current working directory

# Download the zip file using the full path
files.download(os.path.join(os.getcwd(), output_filename))
# The above change provides the full path to the download function.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>