In [23]:
import os
import json
import base64
from mistralai import Mistral


In [24]:
# Location of the JSON configuration file read by load_config();
# '~' is expanded to the current user's home directory.
CONFIG_PATH = os.path.expanduser(
    '~/pet-projects/jupyter-notebooks/config.json'
)

In [9]:

def load_config(config_path=None):
    """Load configuration from a JSON file (config.json by default).

    Args:
        config_path (str | None): Path of the JSON file to read. When omitted,
            the module-level ``CONFIG_PATH`` is used, so existing
            ``load_config()`` calls behave as before.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        Exception: If the file does not exist or does not contain valid JSON.
            The underlying ``FileNotFoundError`` / ``json.JSONDecodeError`` is
            attached as ``__cause__`` so the root cause stays in the traceback.
    """
    path = CONFIG_PATH if config_path is None else config_path
    try:
        # JSON text is UTF-8; do not rely on the platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as config_file:
            return json.load(config_file)
    except FileNotFoundError as err:
        raise Exception(f"config.json file not found: {path}") from err
    except json.JSONDecodeError as err:
        # `err` carries the line/column of the syntax problem.
        raise Exception(f"Error parsing config.json: {err}") from err

In [None]:
# Read the Mistral API key from the JSON configuration file
# (raises KeyError if the 'mistral_api_key' entry is absent).
api_key = load_config()["mistral_api_key"]

In [None]:
# Client for the Mistral API, authenticated with the key read from the config file.
client = Mistral(api_key=api_key)

# Submit the PDF at the given URL to the OCR endpoint.
# include_image_base64=True requests the image data inline in the response
# (presumably surfaced as each image's `image_base64` field -- TODO confirm
# against the SDK docs).
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": "https://arxiv.org/pdf/2502.12115"
    },
    include_image_base64=True
)

# Save the OCR response to a JSON file.
# NOTE: this is a relative path, so the file is written to the kernel's
# current working directory.
output_file = "ocr_response.json"

# Convert OCRResponse object to a serializable dictionary
ocr_dict = ocr_response.model_dump() if hasattr(ocr_response, 'model_dump') else vars(ocr_response)

# Serialize completely in memory *before* opening the file: opening with "w"
# truncates immediately, so a serialization error raised while streaming with
# json.dump() would leave a truncated or partial ocr_response.json behind.
ocr_json = json.dumps(ocr_dict, indent=4)
with open(output_file, "w", encoding="utf-8") as f:
    f.write(ocr_json)

print(f"OCR response saved to {output_file}")


In [25]:
def _decode_image_payload(encoded):
    """Decode a base64 image string, dropping a ``data:...;base64,`` style prefix if present."""
    if ',' in encoded:
        encoded = encoded.split(',', 1)[1]
    return base64.b64decode(encoded)


def _unique_image_name(image_id, taken):
    """Derive a safe file name from an image ``id`` that is not yet in ``taken``."""
    # Keep only the final path component so an id read from the JSON file can
    # never make us write outside the output directory.
    name = os.path.basename(str(image_id).replace('\\', '/'))
    if name in ('', '.', '..'):
        name = 'img'
    stem, ext = os.path.splitext(name)
    if not ext:
        ext = '.jpeg'  # same default extension the images were always saved with

    candidate = stem + ext
    suffix = 1
    # Ids are assumed to be unique within a document (TODO confirm); if one
    # repeats, pick a fresh name instead of overwriting the earlier image.
    while candidate in taken:
        candidate = f"{stem}-{suffix}{ext}"
        suffix += 1
    taken.add(candidate)
    return candidate


def json_to_markdown(json_file="ocr_response.json", markdown_file="mistral_response.md"):
    """
    Convert a saved OCR response (JSON) into a Markdown file plus image files.

    Every entry of the top-level ``pages`` list becomes a ``## Page N`` section
    (``N`` is the page's ``index`` + 1) followed by the page's ``markdown``
    text. Each image that carries both an ``id`` and non-empty ``image_base64``
    data is decoded and written next to the Markdown file, named after its
    ``id``. A Markdown reference to the image is appended only when the page
    text does not already link to that file.

    Args:
        json_file (str): Path to the JSON file containing Mistral AI response
        markdown_file (str): Path to the output Markdown file. Images are
            written to the same directory; a bare file name means the current
            working directory.

    Returns:
        str | None: Path to the saved Markdown file, or ``None`` if processing
        failed (the error is printed, not raised). A failure on a single image
        is printed and that image is skipped.
    """
    try:
        # Read the JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Images live next to the Markdown file. dirname() is '' for a bare
        # file name (including the default argument) and os.makedirs('')
        # raises, so fall back to the current directory in that case.
        image_dir = os.path.dirname(markdown_file) or os.curdir
        os.makedirs(image_dir, exist_ok=True)

        parts = []
        used_names = set()  # image file names already written for this document

        # Process pages from the OCR response
        if 'pages' in data:
            for page in data['pages']:
                # Add page index as header
                parts.append(f"## Page {page['index'] + 1}\n\n")

                # Add the markdown content
                page_text = ""
                if 'markdown' in page:
                    page_text = page['markdown']
                    parts.append(page_text + "\n\n")

                # Process and save images
                for img in page.get('images') or []:
                    if 'id' not in img or not img.get('image_base64'):
                        continue  # nothing to save for this entry

                    try:
                        img_data = _decode_image_payload(img['image_base64'])
                        img_filename = _unique_image_name(img['id'], used_names)
                        img_path = os.path.join(image_dir, img_filename)
                        with open(img_path, 'wb') as img_file:
                            img_file.write(img_data)
                        print(f"Saved image: {img_path}")
                    except Exception as e:
                        print(f"Error saving image {img['id']}: {e}")
                        continue

                    # Link the image only after it was written successfully,
                    # and only if the page text does not reference it already
                    # (avoids dangling links and duplicated images).
                    if f"]({img_filename})" not in page_text:
                        parts.append(f"![{img_filename}]({img_filename})\n\n")

        # Save to Markdown
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

        print(f"Content successfully saved to {markdown_file}")
        return markdown_file

    except Exception as e:
        print(f"Error processing file: {e}")
        return None

In [None]:
# Example usage in a notebook: turn the saved OCR response into a Markdown
# document. Both paths are under the user's home directory.
json_file = os.path.expanduser(
    '~/pet-projects/jupyter-notebooks/ocr_response.json'
)
markdown_file = os.path.expanduser(
    '~/pet-projects/jupyter-notebooks/mistral_output.md'
)
json_to_markdown(json_file, markdown_file)