In [1]:
import os
import json
import shutil
import mimetypes
import base64
import time
from mistralai import Mistral
from dotenv import load_dotenv

# Load environment variables from a .env file, then read the API key.
# A missing MISTRAL_API_KEY raises KeyError here, before any work is done.
load_dotenv()
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

# Define the working directories (relative to the current working directory)

# Folder where we drop the files to process
INPUT_DIR = "input"
# Folder where the OCR results (JSON) are saved
OUTPUT_JSON_DIR = "output/json"
# Folder where we move processed files
PROCESSED_DIR = "processed"

# Create the folders if they don't exist. INPUT_DIR is created as well so
# that a first run reports "no files" instead of crashing in os.listdir()
# with FileNotFoundError.
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_JSON_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Get all regular files in the input folder (sub-directories are skipped).
# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing order reproducible from one run to the next.
files = sorted(
    f for f in os.listdir(INPUT_DIR) if os.path.isfile(os.path.join(INPUT_DIR, f))
)

# Run OCR on every file found in INPUT_DIR: PDFs are uploaded and processed
# through a signed URL, images are sent inline as a base64 data URL. Each
# result is written as JSON to OUTPUT_JSON_DIR and the source file is then
# moved to PROCESSED_DIR. A failure on one file is reported and does not stop
# the remaining files.
if not files:
    print("Aucun fichier trouvé dans 'input/'")
else:
    for filename in files:
        file_path = os.path.join(INPUT_DIR, filename)
        base_name, ext = os.path.splitext(filename)
        # Normalise the extension so ".PDF" / ".JPG" are recognised too
        ext = ext.lower()

        try:
            # --- PDF file ---
            if ext == ".pdf":
                print(f"Traitement du PDF : {filename}")
                with open(file_path, "rb") as f:
                    uploaded_pdf = client.files.upload(
                        file={"file_name": filename, "content": f},
                        purpose="ocr",
                    )
                signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

                # Run OCR from the signed URL of the uploaded file
                ocr_response = client.ocr.process(
                    model="mistral-ocr-latest",
                    document={"type": "document_url", "document_url": signed_url.url},
                    include_image_base64=True,
                )

            # --- Image file (PNG, JPG, JPEG) ---
            elif ext in (".png", ".jpg", ".jpeg"):
                print(f"Traitement de l’image : {filename}")
                mime_type, _ = mimetypes.guess_type(file_path)
                if mime_type is None:
                    # guess_type() may return None; fall back on the extension
                    # rather than emitting a malformed "data:None;..." URL.
                    mime_type = "image/png" if ext == ".png" else "image/jpeg"
                with open(file_path, "rb") as f:
                    image_data = f.read()
                base64_encoded = base64.b64encode(image_data).decode("utf-8")
                image_url = f"data:{mime_type};base64,{base64_encoded}"

                # Run OCR on the inline base64-encoded image
                ocr_response = client.ocr.process(
                    model="mistral-ocr-latest",
                    document={"type": "image_url", "image_url": image_url},
                    include_image_base64=True,
                )

            else:
                # Unsupported file format: report it and leave the file in place
                print(f"Format non pris en charge : {filename}")
                continue

            # --- Save the OCR result as JSON ---
            # NOTE: the output name is built from the stem only, so two inputs
            # sharing a stem (e.g. "scan.pdf" and "scan.png") write to the same
            # JSON file and the later one overwrites the earlier one.
            json_path = os.path.join(OUTPUT_JSON_DIR, base_name + ".json")
            with open(json_path, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps accented characters readable in the
                # UTF-8 file instead of escaping them as \uXXXX sequences.
                json.dump(ocr_response.model_dump(), f, indent=2, ensure_ascii=False)

            # --- Archive the original file in 'processed/' ---
            # Done only after the JSON was written, so a file whose processing
            # failed stays in the input folder.
            shutil.move(file_path, os.path.join(PROCESSED_DIR, filename))
            print(f"Fichier traité et archivé : {filename}")

            # Short pause between two files (presumably to avoid hammering
            # the API -- confirm whether it is still needed)
            time.sleep(0.5)

        except Exception as e:
            # Per-file boundary: report the error and carry on with the next file
            print(f"Erreur lors du traitement de {filename} : {e}")

🚫 Aucun fichier trouvé dans 'input/'
