# Retrieve Markdown with Mistral OCR

In [None]:
# Working with Mistral AI's OCR to extract structured data from a PDF document
import os
from dotenv import load_dotenv
from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model
from pydantic import BaseModel
# Document Annotation response format
from pydantic import BaseModel, Field

class Document(BaseModel):
    language: str = Field(
        description="The primary language detected in the document, e.g., 'en' for English or 'de' for German."
    )
    chapter_titles: list[str] = Field(
        description="List of major section or chapter titles extracted from the document, preserving their original order."
    )
    tables: list[str] 
    number_of_tables: int = Field(
        description="The total number of tables present in the document."
    )
    tables_header_names: list[str] = Field(
        description="List of header names (column titles) found across all tables in the document."
    )
    tables_row_names: list[str] = Field(
        description="List of row labels or identifiers (first-column values) found across all tables in the document."
    )

  
load_dotenv()
api_key = os.getenv("MISTRAL_API_KEY")
client = Mistral(api_key=api_key)

# Open the local PDF
document_path = "SouheilBichiou_Resume.pdf"
try:
    with open(document_path, "rb") as f:
        file_bytes = f.read()
except FileNotFoundError:
    print(f"Error: The file '{document_path}' was not found.")
    exit()

# Upload the PDF
uploaded_file = client.files.upload(
    file={
        "file_name": document_path,
        "content": file_bytes
    },
    purpose="ocr"
)

# Get a signed URL
file_signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
file_url = file_signed_url.url

# Perform OCR using the document URL
response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": file_url
    },
    document_annotation_format=response_format_from_pydantic_model(Document),
    include_image_base64=True
)

response

OCRResponse(pages=[OCRPageObject(index=0, markdown="# Souheil Bichiou \n\n## Computer Science Engineer\n\n- souheil.bichiou@ensi-uma.tn $\\square$ Souheil Bichiou $\\bigcirc$ BSouheil $\\bigcirc$ Tunisia\n\n\n## EDUCATION\n\nNational School of Computer Science\n09/2022 - 06/2025\nPreparatory Institute for Engineering Studies of Nabeul\n09/2020 - 06/2022\n\n## PROFESSIONAL EXPERIENCE\n\nData Software Engineer, cloudsquid $\\square$\nBerlin, Germany\nEvaluated and optimized cloudsquid data extraction pipeline :\n\n- Created an evaluation platform and obtained 0.937 precision for collected and created data.\n- Developed a solution to visualize bounding boxes of each extracted data.\n- integrated the solution into cloudsquid platform improving data traceability by $\\mathbf{9 5 \\%}$.\n\nData Scientist intern, Elyadata $\\square$\nTunisia\nFine-tuned T5 model for Arabic language tasks, focusing on summarization and question-\nanswering:\n\n- Enhanced the model's performance using LoRA PEFT

In [None]:
print(response.document_annotation)

{
  "language": "English",
  "chapter_titles": [
    "EDUCATION",
    "PROFESSIONAL EXPERIENCE",
    "PROJECTS",
    "SKILLS",
    "CERTIFICATES",
    "LANGUAGES"
  ],
  "tables": [],
  "number_of_tables": 0,
  "tables_header_names": [],
  "tables_row_names": []
}


In [None]:
# Working with Mistral AI's OCR to extract structured data from an image file
from pydantic import BaseModel, Field
from mistralai.extra import response_format_from_pydantic_model
def perform_ocr(client, file_path: str, model: str = "mistral-ocr-latest"):
    """
    Uploads an image to the API, gets a signed URL, and performs OCR using the specified model.
    
    Args:
        client: Initialized API client.
        file_path: Local path to the image file.
        model: OCR model to use (default: "mistral-ocr-latest").
    
    Returns:
        OCR response object.
    """
    # Document Annotation response format
    class Document(BaseModel):
        language: str = Field(
            description="The primary language detected in the document, e.g., 'en' for English or 'de' for German."
        )
        chapter_titles: list[str] = Field(
            description="List of major section or chapter titles extracted from the document, preserving their original order."
        )
        tables: list[str] 
        number_of_tables: int = Field(
            description="The total number of tables present in the document."
        )
        tables_header_names: list[str] = Field(
            description="List of header names (column titles) found across all tables in the document."
        )
        tables_row_names: list[str] = Field(
            description="List of row labels or identifiers (first-column values) found across all tables in the document."
        )

    # Upload the image
    uploaded_file = client.files.upload(
        file={
            "file_name": file_path.split("/")[-1],
            "content": open(file_path, "rb")
        },
        purpose="ocr"
    )

    # Get signed URL
    file_signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
    file_url = file_signed_url.url

    # Perform OCR
    response = client.ocr.process(
        model=model,
        document={
            "type": "image_url",
            "image_url": file_url
        },
        document_annotation_format=response_format_from_pydantic_model(Document),
        include_image_base64=True
    )

    return response

# --- Example usage ---
ocr_result = perform_ocr(client, "Screenshot_reference.png")
print(ocr_result)


pages=[OCRPageObject(index=0, markdown='# ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013\n\nGültig bis: 05.01.2032 Registriernummer: NI-2022-003911383\n\n## Gebäude\n\n|  Gebäudefizit | freistehendes Mehrfamilienhaus  |\n| --- | --- |\n|  Adresse | Schröderskamp 10, 28844 Weyhe  |\n|  Gebäudeteil | Mehrfamilienhaus  |\n|  Baujahr Gebäude 1 | 2020/2021  |\n|  Baujahr Wärmeerzeuger 1, 4 | 2021  |\n|  Anzahl Wohnungen | 5  |\n|  Gebäudenutzfläche (A_{1h}) | 548,7 m^{2}  |\n|  Wesentliche Energieträger für Heizung und Warmwasser 3 | Strom-Mix  |\n|  Erneuerbare Energien | Art: Geothermie  |\n|  Art der Lüftung / Kühlung | ☑ Fensterlüftung  |\n|  Anlass der Ausstellung des Energieausweises | ☑ Neubau  |\n|  Vermietung / Verkauf | ☑ Umsetzungs- und Abwärmerückgewinnung  |\n|  Modemisierung (Änderung / Erweiterung) | ☑ Anlage zur Kühlung  |\n|  Sonstiges (freiwillig) | ☑ Sonstiges  |\n\n## Hinweise zu den Angaben über die energetische 

In [55]:
print(ocr_result.document_annotation)

{
  "language": "de",
  "chapter_titles": [
    "Gebäude",
    "Hinweise zu den Angaben über die energetische Qualität des Gebäudes",
    "Hinweise zur Verwendung des Energieausweises"
  ],
  "tables": [
    "Gebäude",
    "Hinweise zu den Angaben über die energetische Qualität des Gebäudes",
    "Hinweise zur Verwendung des Energieausweises"
  ],
  "number_of_tables": 3,
  "tables_header_names": [
    "Gebäudetyp",
    "Adresse",
    "Gebäudeteil",
    "Baujahr Gebäude",
    "Baujahr Wärmeerzeuger",
    "Anzahl Wohnungen",
    "Gebäudenutzfläche (A_N)",
    "Wesentliche Energieträger für Heizung und Warmwasser",
    "Energieberater",
    "Art der Lüftung / Kühlung",
    "Anlass der Ausstellung des Energieausweises"
  ],
  "tables_row_names": [
    "Gebäude",
    "Hinweise zu den Angaben über die energetische Qualität des Gebäudes",
    "Hinweise zur Verwendung des Energieausweises"
  ]
}


In [17]:
def get_markdown_from_ocr(response):
    # Convert response to dictionary if needed
    response_dict = response.model_dump()
    response_dict.get("pages")
    md = response_dict.get("pages")[0].get("markdown")
    return md
md = get_markdown_from_ocr(ocr_result)
md


'# ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013\n\nGültig bis: 05.01.2032 Registriernummer: NI-2022-003911383\n\n## Gebäude\n\n|  Gebäudefizit | Freistehendes Mehrfamilienhaus  |\n| --- | --- |\n|  Adresse | Schröderskamp 10, 28844 Weyhe  |\n|  Gebäudeteil | Mehrfamilienhaus  |\n|  Baujahr Gebäude 1 | 2020/2021  |\n|  Baujahr Wärmeerzeuger 1, 4 | 2021  |\n|  Anzahl Wohnungen | 5  |\n|  Gebäudenutzfläche (A_{1h}) | 548,7 m^{2}  |\n|  Wesentliche Energieträger für Heizung und Warmwasser 3 | Strom-Mix  |\n|  Erneuerbare Energien | Art: Geothermie  |\n|  Art der Lüftung / Kühlung | ☑ Fensterlüftung  |\n|  Anlass der Ausstellung des Energieausweises | ☑ Neubau  |\n|  Vermietung / Verkauf | ☑ Umsetzungs- und Abwässerung  |\n|  Verwendungen / Heizung + Warmwasserbereitung | ☑ Anlage zur Kühlung  |\n|  ☑ Anlässe der Ausstellung des Energieausweises | ☑ Neubau  |\n|  ☑ Vermietung / Verkauf | ☑ Umsetzungs- und Abwässerung  |\n|  Sonstig

# Retrieve Our Chunks

In [29]:
%pip install -qU langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from langchain_text_splitters import MarkdownHeaderTextSplitter


headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
def split_md(md):
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, return_each_line=True)
    md_header_splits = markdown_splitter.split_text(md)
    return md_header_splits
md_header_splits = split_md(md)
md_header_splits

[Document(metadata={'Header 1': 'ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013'}, page_content='Gültig bis: 05.01.2032 Registriernummer: NI-2022-003911383'),
 Document(metadata={'Header 1': 'ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013', 'Header 2': 'Gebäude'}, page_content='|  Gebäudefizit | Freistehendes Mehrfamilienhaus  |\n| --- | --- |\n|  Adresse | Schröderskamp 10, 28844 Weyhe  |\n|  Gebäudeteil | Mehrfamilienhaus  |\n|  Baujahr Gebäude 1 | 2020/2021  |\n|  Baujahr Wärmeerzeuger 1, 4 | 2021  |\n|  Anzahl Wohnungen | 5  |\n|  Gebäudenutzfläche (A_{1h}) | 548,7 m^{2}  |\n|  Wesentliche Energieträger für Heizung und Warmwasser 3 | Strom-Mix  |\n|  Erneuerbare Energien | Art: Geothermie  |\n|  Art der Lüftung / Kühlung | ☑ Fensterlüftung  |\n|  Anlass der Ausstellung des Energieausweises | ☑ Neubau  |\n|  Vermietung / Verkauf | ☑ Umsetzungs- und Abwässerung  |\n|  V

In [5]:
# list of dicts {"header": tuple, "text": str} 
def extract_headers_and_text(docs):
    """
    Converts Document list into a list of dicts:
    {'header': (Header1, Header2, ...), 'text': '...'}
    """
    chunks = []
    for doc in docs:
        header = tuple(doc.metadata.get(k) for k in sorted(doc.metadata.keys()))
        chunks.append({"header": header, "text": doc.page_content})
    return chunks
extracted_chunks = extract_headers_and_text(md_header_splits)
extracted_chunks

[{'header': ('ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013',),
  'text': 'Gültig bis: 05.01.2032 Registriernummer: NI-2022-003911383'},
 {'header': ('ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013',
   'Gebäude'),
  'text': '|  Gebäudefizit | freistehendes Mehrfamilienhaus  |\n| --- | --- |\n|  Adresse | Schröderskamp 10, 28844 Weyhe  |\n|  Gebäudeteil | Mehrfamilienhaus  |\n|  Baujahr Gebäude 1 | 2020/2021  |\n|  Baujahr Wärmeerzeuger 1, 4 | 2021  |\n|  Anzahl Wohnungen | 5  |\n|  Gebäudenutzfläche (A_{1h}) | 548,7 m^{2}  |\n|  Wesentliche Energieträger für Heizung und Warmwasser 3 | Strom-Mix  |\n|  Erneuerbare Energien | Art: Geothermie  |\n|  Art der Lüftung / Kühlung | ☑ Fensterlüftung  |\n|  Anlass der Ausstellung des Energieausweises | ☑ Neubau  |\n|  Vermietung / Verkauf | ☑ Umsetzungs- und Abwärmerückgewinnung  |\n|  Modemisierung (Änderung / Erweiterung) | ☑ A

In [None]:
def extract_unique_headers(md_splits):
    """
    Extract unique (level, header_text) tuples from MarkdownHeaderTextSplitter output,
    preserving the original order.
    """
    seen = set()
    unique_headers = []
    for doc in md_splits:
        for key, value in doc.metadata.items():
            if value:
                level = int(key.split(" ")[1])
                header_tuple = (level, value)
                if header_tuple not in seen:
                    seen.add(header_tuple)
                    unique_headers.append(header_tuple)
    return unique_headers

# Usage
unique_headers_list = extract_unique_headers(md_header_splits)
for h in unique_headers_list:
    print(h)


(1, 'ENERGIEAUSWEIS für Wohngebäude gemäß den §§ 16 ff. Energieeinsparverordnung (EnEV) vom 18. November 2013')
(2, 'Gebäude')
(2, 'Hinweise zu den Angaben über die energetische Qualität des Gebäudes')
(2, 'Hinweise zur Verwendung des Energieausweises')


# Scoring

### Sementic Score

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def encode_texts(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings

def compute_content_similarity_with_header_penalty(ref_chunks, cand_chunks, window=4, header_penalty=1):
    """
    Compute a similarity score between two lists of chunks, preserving local structure
    and penalizing header mismatches.
    
    ref_chunks: list of dicts {"header": tuple, "text": str} for reference document
    cand_chunks: same format for candidate document
    window: number of neighboring chunks on either side to consider
    header_penalty: multiplier for similarity if headers do not match (0 < header_penalty <= 1)
    """
    ref_texts = [c["text"] for c in ref_chunks]
    cand_texts = [c["text"] for c in cand_chunks]

    ref_emb = encode_texts(ref_texts)
    cand_emb = encode_texts(cand_texts)

    sims = []
    for i, r_emb in enumerate(ref_emb):
        # define neighborhood in candidate document
        start = max(0, i - window)
        end = min(len(cand_emb), i + window + 1)
        local_cand_emb = cand_emb[start:end]

        # compute similarity with local neighborhood
        sim_scores = cosine_similarity([r_emb], local_cand_emb)[0]

        # find the best match in neighborhood
        best_idx = sim_scores.argmax()
        best_score = sim_scores[best_idx]

        # apply header penalty if headers do not match
        ref_header = ref_chunks[i]["header"]
        cand_header = cand_chunks[start + best_idx]["header"]
        if ref_header != cand_header:
            best_score *= header_penalty  # penalize mismatch

        sims.append(best_score)

    return float(np.mean(sims))

# Example usage:
similarity_score = compute_content_similarity_with_header_penalty(extracted_chunks, extracted_chunks)
print("Similarity score with header penalty:", similarity_score)


Similarity score with header penalty: 1.0


### Structure Score

In [8]:
def compute_header_coverage(ref_headers, cand_headers):
    """
    Computes a score based on how many reference headers exist in the candidate.
    Returns a fraction [0,1].
    """
    ref_set = set(ref_headers)
    cand_set = set(cand_headers)
    covered = ref_set & cand_set
    return len(covered) / len(ref_set) if ref_set else 1.0

In [64]:
import json

def compute_structural_similarity(doc_ann1, doc_ann2, weights=None):
    # Convert Pydantic models or JSON strings to dicts if needed
    if isinstance(doc_ann1, str):
        doc_ann1 = json.loads(doc_ann1)
    elif not isinstance(doc_ann1, dict):
        doc_ann1 = doc_ann1.dict()

    if isinstance(doc_ann2, str):
        doc_ann2 = json.loads(doc_ann2)
    elif not isinstance(doc_ann2, dict):
        doc_ann2 = doc_ann2.dict()
    
    if weights is None:
        weights = {"chapters": 0.3, "tables": 0.2, "headers": 0.3, "rows": 0.2}
    
    scores = {}
    
    # --- Chapter titles overlap ---
    set1, set2 = set(doc_ann1.get("chapter_titles", [])), set(doc_ann2.get("chapter_titles", []))
    if set1 or set2:
        scores["chapters"] = len(set1 & set2) / len(set1 | set2)
    else:
        scores["chapters"] = 1.0
    
    # --- Number of tables similarity ---
    n1, n2 = doc_ann1.get("number_of_tables", 0), doc_ann2.get("number_of_tables", 0)
    if max(n1, n2) > 0:
        scores["tables"] = 1 - abs(n1 - n2) / max(n1, n2)
    else:
        scores["tables"] = 1.0
    
    # --- Table header names overlap ---
    headers1, headers2 = set(doc_ann1.get("tables_header_names", [])), set(doc_ann2.get("tables_header_names", []))
    if headers1 or headers2:
        scores["headers"] = len(headers1 & headers2) / len(headers1 | headers2)
    else:
        scores["headers"] = 1.0
    
    # --- Table row names overlap ---
    rows1, rows2 = set(doc_ann1.get("tables_row_names", [])), set(doc_ann2.get("tables_row_names", []))
    if rows1 or rows2:
        scores["rows"] = len(rows1 & rows2) / len(rows1 | rows2)
    else:
        scores["rows"] = 1.0
    
    # --- Weighted combination ---
    structural_score = sum(scores[k] * weights[k] for k in weights)
    
    return structural_score, scores


# Main

In [None]:
ocr_result1 = perform_ocr(client, "Screenshot_reference.png")
md1 = get_markdown_from_ocr(ocr_result1)
md_header_splits1 = split_md(md1)
extracted_chunks1 = extract_headers_and_text(md_header_splits1)
unique_headers_list1 = extract_unique_headers(md_header_splits1)



ocr_result2 = perform_ocr(client, "Screenshot_match.png")
md2 = get_markdown_from_ocr(ocr_result2)
md_header_splits2 = split_md(md2)
extracted_chunks2 = extract_headers_and_text(md_header_splits2)
unique_headers_list2 = extract_unique_headers(md_header_splits2)


sem_score1 = compute_content_similarity_with_header_penalty(extracted_chunks1, extracted_chunks2)
print("Semantic similarity score between doc1 and doc2:", sem_score1)

struct_score, sub_scores = compute_structural_similarity(
    ocr_result1.document_annotation,
    ocr_result2.document_annotation
)
print("Structural similarity score:", struct_score)
print("Breakdown:", sub_scores)


Semantic similarity score between doc1 and doc2: 0.7730315327644348
Structural similarity score: 0.6846153846153846
Breakdown: {'chapters': 1.0, 'tables': 1.0, 'headers': 0.6153846153846154, 'rows': 0.0}


In [89]:
overall_score = 0.6 * sem_score1 + 0.4 * struct_score
print("Overall similarity score:", overall_score)

Overall similarity score: 0.7376650735048147


In [87]:
print(ocr_result2.document_annotation)

{
  "language": "de",
  "chapter_titles": [
    "Gebäude",
    "Hinweise zu den Angaben über die energetische Qualität des Gebäudes",
    "Hinweise zur Verwendung des Energieausweises"
  ],
  "tables": [
    "Gebäudedaten",
    "Energiebedarfsausweis",
    "Energieverbrauchsausweis"
  ],
  "number_of_tables": 3,
  "tables_header_names": [
    "Gebäudetyp",
    "Adresse",
    "Gebäudeteil",
    "Baujahr Gebäude",
    "Baujahr Wärmeerzeuger",
    "Anzahl Wohnungen",
    "Gebäudenutzfläche (A_N)",
    "Wesentliche Energieträger für Heizung und Warmwasser",
    "Energiebedarfsausweis",
    "Energieverbrauchsausweis"
  ],
  "tables_row_names": [
    "Mehrfamilienhaus",
    "Eulersstraße 19, 88046 Friedrichshafen",
    "Ganzes Gebäude",
    "1992",
    "1992",
    "11",
    "2164,34 m²",
    "sonstige",
    "Art",
    "Verwendung"
  ]
}
