In [None]:
import json

In [None]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Load the variables declared in a .env file into the process environment.
load_dotenv()

TOKEN = os.environ.get("IK_API_KEY")
PRODUCT_ID = os.environ.get("IK_PRODUCT_ID")
# OpenAI-compatible base URL, scoped to the product id read above.
URL = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai"
MODEL_LLM = "mistral24b"

# Client for the general-purpose LLM (model name kept in MODEL_LLM).
llm_base_client = OpenAI(base_url=URL, api_key=TOKEN)

In [None]:
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# Falls back to an empty string when the variable is not set.
API_KEY_RUNPOD = os.environ.get("API_KEY_OCR_LLM", "")
RUNPOD_ENDPOINT_ID = "5hwezmg13oviky"

# OpenAI-compatible client pointed at the RunPod endpoint identified above.
client = OpenAI(
    base_url=f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/openai/v1",
    api_key=API_KEY_RUNPOD,
)

In [None]:
def build_finetuning_prompt(base_text: str) -> str:
    """Build the base prompt used for training and running the fine-tuned model.

    This is a simplified version of the prompt that was used to generate the
    silver data, and it can change from dataset to dataset.

    Args:
        base_text: Raw text previously extracted for the page; it is inserted
            verbatim between the RAW_TEXT_START / RAW_TEXT_END markers.

    Returns:
        The complete prompt string.
    """
    instructions = (
        "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. "
        "Just return the plain text representation of this document as if you were reading it naturally.\n"
        "Do not hallucinate.\n"
    )
    raw_text_section = "RAW_TEXT_START\n{}\nRAW_TEXT_END".format(base_text)
    return instructions + raw_text_section


# NOTE(review): TOKEN and URL rebind the names assigned in the Infomaniak setup
# cell; nothing in the visible notebook reads them after this point — confirm
# whether they are leftovers from a local experiment.
TOKEN = "x"
URL = "http://localhost:11434/"
# Model identifier passed as `model=` in the OCR requests sent through `client`.
MODEL = "allenai/olmOCR-7B-0225-preview"


def process_image(
    image_base64: str,
    base_text: str = "PV de réunion de l'assemblée générale de l'association Magic Genève",
):
    """Send one base64-encoded image and its prompt to the OCR model.

    Args:
        image_base64: Base64 text of the image. It is embedded in a
            ``data:image/png;base64,...`` URL, so it is expected to encode
            PNG bytes.
        base_text: Raw text inserted into the prompt by
            ``build_finetuning_prompt``. Defaults to the text that used to be
            hard-coded here, so existing one-argument calls behave the same.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    # Build the full prompt
    prompt = build_finetuning_prompt(base_text)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]

    # NOTE(review): both max_completion_tokens and max_tokens are sent, with
    # different values; which one the backend honours is not visible here —
    # confirm and keep only the one that applies.
    return client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.8,
        max_completion_tokens=2000,
        max_tokens=2001,
        presence_penalty=0.3,
        stream=False,
    )

In [None]:
import base64

import pymupdf

# process_image() wraps its input in a "data:image/png;base64,..." URL, so the
# payload has to be PNG bytes. Render the first page of the PDF to PNG instead
# of base64-encoding the raw PDF file.
with pymupdf.open("../data/to_ocr/PV manuscrit.pdf") as doc:
    first_page_png = doc[0].get_pixmap().tobytes("png")

image_base64 = base64.b64encode(first_page_png).decode()

In [None]:
# Run a single OCR request on the base64 payload held in `image_base64`.
response = process_image(image_base64)

In [None]:
import base64

import pymupdf

pages_extracted = []
# The prompt is identical for every page in this first pass, so build it once.
prompt = build_finetuning_prompt("No text available")

# Let PyMuPDF open the path itself and close the document when the block ends;
# the former text-mode open() handle was never read, and the Document it was
# passed to was left open.
with pymupdf.open("../data/to_extract/PV manuscrit.pdf") as doc:
    for index, page in enumerate(doc):
        # Rasterize the page and base64-encode the PNG for the data URL below.
        pix = page.get_pixmap()
        img = base64.b64encode(pix.tobytes("png")).decode()
        # Not used by this pass (the prompt states that no text is available);
        # kept because a later cell reads `text`.
        text = page.get_text().encode("utf8")

        # Build the full prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img}"},
                    },
                ],
            }
        ]

        print(f"Processing page {index + 1}")
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.8,
            max_completion_tokens=2000,
            max_tokens=2001,
            presence_penalty=0.3,
            stream=False,
        )

        # One response per page, in page order.
        pages_extracted.append(response)

In [None]:
# Show the text extracted for the second page (index 1) of the first pass.
# The message content is parsed as JSON and its "natural_text" entry is printed.
print(json.loads(pages_extracted[1].choices[0].message.content)["natural_text"])

In [None]:
response_2 = []
file_path = "../data/to_extract/PV manuscrit.pdf"
# Let PyMuPDF open the path itself and close the document when the block ends;
# the former text-mode open() handle was never read, and the Document it was
# passed to was left open.
with pymupdf.open(file_path) as doc:
    for index, page in enumerate(doc):
        # Rasterize the page and base64-encode the PNG for the data URL below.
        pix = page.get_pixmap()
        img = base64.b64encode(pix.tobytes("png")).decode()
        # Second pass: feed the text obtained for this page in the first pass
        # back into the prompt as the raw text hint.
        prompt = build_finetuning_prompt(
            json.loads(pages_extracted[index].choices[0].message.content)[
                "natural_text"
            ]
        )

        # Build the full prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img}"},
                    },
                ],
            }
        ]

        print(f"Processing page {index + 1}")
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.8,
            max_completion_tokens=2000,
            max_tokens=2001,
            presence_penalty=0.3,
            stream=False,
        )

        # One response per page, in page order.
        response_2.append(response)

In [None]:
# Write one file per (pass, page) as text_<pass>_<page>.txt. Looping over the
# responses yields the same four files for a two-page document, and no longer
# fails on a shorter document or drops pages from a longer one.
for pass_number, responses in enumerate((pages_extracted, response_2), start=1):
    for page_number, page_response in enumerate(responses, start=1):
        natural_text = json.loads(page_response.choices[0].message.content)[
            "natural_text"
        ]
        # Explicit UTF-8 so accented characters do not depend on the platform's
        # default encoding.
        with open(
            f"text_{pass_number}_{page_number}.txt", "w", encoding="utf-8"
        ) as f:
            f.write(natural_text)

In [None]:
# `text` is stored as UTF-8 bytes by the extraction loop; decode it first so the
# prompt contains the text itself rather than a b'...' literal.
ocr_text = text.decode("utf8") if isinstance(text, bytes) else text

response_stream = llm_base_client.chat.completions.create(
    model=MODEL_LLM,
    messages=[
        {
            "role": "user",
            "content": f"Voici le text extrait d'une prise de note manuscrite: {ocr_text}. Est-ce que tu peux me corriger les erreurs d'ocr ?",
        }
    ],
    temperature=0.3,
    max_tokens=4000,
    stream=True,
)
# Stream the response: print each piece of the reply as it arrives.
for chunk in response_stream:
    # A chunk may carry no choices (e.g. a trailing metadata chunk); skip it
    # instead of failing on the index lookup.
    if not chunk.choices:
        continue
    print(chunk.choices[0].delta.content or "", end="", flush=True)

In [None]:
import requests

url = "https://www.datalab.to/api/v1/marker"

headers = {"X-Api-Key": os.getenv("MARKER_OCR_KEY")}

# Keep the PDF handle inside a `with` block so it is closed once the upload has
# been sent; it was previously opened inline and never closed.
with open(file_path, "rb") as pdf_file:
    # Multipart form: the PDF itself plus plain (filename-less) option fields.
    form_data = {
        "file": (file_path, pdf_file, "application/pdf"),
        "langs": (None, "English"),
        "force_ocr": (None, False),
        "paginate": (None, False),
        "output_format": (None, "markdown"),
        "use_llm": (None, False),
        "strip_existing_ocr": (None, False),
        "disable_image_extraction": (None, False),
    }

    response = requests.post(url, files=form_data, headers=headers)

# Parsed JSON body of the submission response; the next cell reads
# "request_check_url" from it.
data = response.json()

In [None]:
from requests import get

# Single status check against the URL returned by the submission request;
# there is no retry here, and `data` is replaced by this response's JSON body.
# NOTE(review): presumably the job can still be processing at this point, in
# which case this cell has to be re-run — confirm against the API docs.
res = get(data["request_check_url"], headers=headers)
data = res.json()

In [None]:
# Display the "success" entry of the latest status response.
data["success"]

In [None]:
# Save the markdown from the status response. The encoding is explicit so that
# non-ASCII characters are written the same way on every platform.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(data["markdown"])