In [None]:
# Client SDKs for the hosted OCR backends (openai, google-genai, anthropic),
# HTTP/image helpers (requests, pillow) and jiwer, which computes the error rate.
!pip install openai google-genai anthropic requests pillow jiwer

In [None]:
# NOTE(review): the `google.generativeai` import further down is commented out,
# so this package looks unused in the visible cells - confirm before keeping.
%pip install -U -q "google-generativeai>=0.8.3"

In [None]:
# google-genai is already installed by the first cell; this call upgrades it.
%pip install --upgrade --quiet google-genai

In [None]:
# Dependencies of the locally hosted model: bitsandbytes (used for 4-bit loading),
# transformers/huggingface_hub, and the `janus` package installed from GitHub.
# NOTE(review): flash-attention is installed from its Git repository but is not
# imported explicitly in the visible cells - confirm it is actually required.
!pip install bitsandbytes transformers huggingface_hub
!pip install git+https://github.com/deepseek-ai/Janus.git
!pip install git+https://github.com/Dao-AILab/flash-attention.git

In [None]:
import os
# Placeholder value: "[your_json]" must be replaced before running.
# NOTE(review): presumably this should be the path to a service-account JSON key
# that the Vertex AI client created in ocr_google picks up - confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "[your_json]"

In [None]:
import torch
import numpy as np
from PIL import Image
from janus.utils.io import load_pil_images
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Device that hosts the model and every tensor handed to it.
cuda_device = 'cuda:0'

# Specify the path to the model
model_path = "deepseek-ai/Janus-Pro-7B"
# Load the weights in 4-bit through bitsandbytes, with bfloat16 as compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, quantization_config=quantization_config, torch_dtype=torch.bfloat16)

# The result used to be bound to the misspelled name `vl_gp`; bind it to the
# name that the rest of the notebook actually uses.
vl_gpt = vl_gpt.to(cuda_device)

def multimodal_understanding(images, question, seed, top_p, temperature):
    """Ask the module-level Janus model `question` about `images`.

    Args:
        images: list of PIL images and/or numpy arrays (arrays are wrapped
            with ``Image.fromarray``). Every image is converted to RGB.
        question: prompt text placed after the image placeholder.
        seed: seed applied to the torch (CPU and CUDA) and numpy RNGs.
        top_p: nucleus-sampling threshold; only forwarded when sampling.
        temperature: sampling temperature; ``0`` disables sampling.

    Returns:
        The decoded answer string with special tokens skipped.
    """
    # Clear CUDA cache before generating
    torch.cuda.empty_cache()

    # Seed every RNG so sampled generations are repeatable
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)

    # NOTE(review): the prompt holds a single image placeholder regardless of
    # how many images are passed - confirm before calling with several images.
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": images,
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Ensure images are PIL images in RGB mode; grayscale/palette/RGBA inputs
    # would otherwise reach the processor with an unexpected channel layout.
    pil_images = [
        (Image.fromarray(img) if isinstance(img, np.ndarray) else img).convert("RGB")
        for img in images
    ]

    # Only forward sampling parameters when sampling is enabled; passing
    # temperature=0 together with do_sample=False is rejected or warned about
    # by recent transformers versions.
    do_sample = temperature != 0
    sampling_kwargs = {"temperature": temperature, "top_p": top_p} if do_sample else {}

    # Pure inference: skip autograd bookkeeping for the embedding step as well
    # as for generation.
    with torch.inference_mode():
        prepare_inputs = vl_chat_processor(
            conversations=conversation, images=pil_images, force_batchify=True
        ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)

        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=do_sample,
            use_cache=True,
            **sampling_kwargs,
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer

In [None]:
import requests
import json
import openai
import anthropic
#import google.generativeai as genai
from google import genai
from google.genai import types
from PIL import Image
import base64
import io
import pandas as pd
from pathlib import Path
from kaggle_secrets import UserSecretsClient
import jiwer  # Import JiWER for WER computation

user_secrets = UserSecretsClient()

# API keys are read from Kaggle Secrets; nothing is hard-coded here.
# NOTE(review): the label "ANTROPIC_API_KEY" looks misspelled. It has to match
# the label stored in the Kaggle secrets panel, so it is left untouched - confirm.
ANTHROPIC_API_KEY = user_secrets.get_secret("ANTROPIC_API_KEY")
OPENAI_API_KEY = user_secrets.get_secret("OPENAI_API_KEY")
GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")

# Load dataset
dataset_path = Path("/kaggle/input/donkeysmallocr-cyrillic-printed-8") # I used 50 images from https://huggingface.co/datasets/DonkeySmall/OCR-Cyrillic-Printed-8
test_csv = dataset_path / "test/test.txt"

# The label file has no header row: each line is "<filename>,<text>".
df_test = pd.read_csv(test_csv, sep=",", names=["filename", "text"])
df_test["filepath"] = df_test["filename"].apply(lambda x: str(dataset_path / "test" / x))

# Evaluate on up to 50 images. The draw is seeded so every run scores the same
# images, and capped so a smaller dataset does not make sample() raise.
df_sample = df_test.sample(n=min(50, len(df_test)), random_state=42)

# Convert image to Base64
def encode_image(image_path):
    """Return the raw bytes of the file at `image_path` as a Base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    b64_bytes = base64.b64encode(raw_bytes)
    return str(b64_bytes, "utf-8")

# OpenAI API requests
def ocr_openai(image_path):
    """Perform OCR using OpenAI GPT-4o Vision API.

    Args:
        image_path: path of the image file to transcribe.

    Returns:
        The text returned by the model, or a "No result..." sentinel string
        when the request fails, the quota is exceeded, or the reply carries no
        content. The function never raises and never returns None.
    """
    import mimetypes  # stdlib; imported locally so the function is self-contained

    try:
        encoded_image = encode_image(image_path)
        headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}

        # Label the data URL with the file's real image type instead of always
        # claiming JPEG; fall back to JPEG when the extension is not an image type.
        guessed_type = mimetypes.guess_type(str(image_path))[0]
        media_type = guessed_type if guessed_type and guessed_type.startswith("image/") else "image/jpeg"

        data = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are an AI assistant helping with OCR."},
                {"role": "user", "content": [
                    {"type": "text", "text": "Extract russian text from this image. Return ONLY extracted text."},
                    {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{encoded_image}"}}
                ]}
            ],
            "max_tokens": 500
        }

        # A timeout keeps one stalled connection from hanging the whole benchmark;
        # the resulting exception is handled by the except branch below.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=data, timeout=60
        )

        if response.status_code == 429:
            print("OpenAI API quota exceeded. Skipping OpenAI OCR.")
            return "No result (OpenAI Quota Exceeded)"

        if response.status_code == 200:
            content = response.json().get("choices", [{}])[0].get("message", {}).get("content", "No result")
            # "content" can be present but null; callers expect a string.
            return content if content is not None else "No result"
        else:
            print(f"OpenAI API Error {response.status_code}: {response.text}")
            return "No result"

    except Exception as e:
        print(f"OpenAI API Error: {e}")
        return "No result"

# Anthropic API requests
def ocr_anthropic(image_path):
    """Perform OCR using Anthropic Claude 3.7 Sonnet API.

    Args:
        image_path: path of the image file to transcribe.

    Returns:
        The text of the first content block of the reply, or "No result" when
        the reply is empty, carries no text, or the request fails. The function
        never raises and never returns None.
    """
    import mimetypes  # stdlib; imported locally so the function is self-contained

    try:
        encoded_image = encode_image(image_path)

        # Declare the file's real image type instead of always claiming JPEG.
        # Anything outside the four listed types falls back to the previous
        # hard-coded value.
        guessed_type = mimetypes.guess_type(str(image_path))[0]
        if guessed_type in ("image/jpeg", "image/png", "image/gif", "image/webp"):
            media_type = guessed_type
        else:
            media_type = "image/jpeg"

        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

        message = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": encoded_image,
                            },
                        },
                        {
                            "type": "text",
                            "text": "Extract russian text from this image. Return ONLY extracted text.",
                        },
                    ],
                }
            ],
        )

        if not message or not message.content:
            return "No result"

        text = message.content[0].text
        # Callers expect a string, so never hand back None.
        return text if text is not None else "No result"

    except Exception as e:
        print(f"Anthropic API Error: {e}")
        return "No result"


def ocr_google(image_path):
    """Perform OCR using Google Gemini 2.0 Flash API.

    Args:
        image_path: path of the image file to transcribe.

    Returns:
        The text of the first part of the first candidate, or "No result" when
        there is no candidate, the part carries no text, or the request fails.
        The function never raises and never returns None.
    """
    try:
        # Read the file fully so the handle is closed before the API call
        with open(image_path, "rb") as image_file:
            image_bytes = image_file.read()

        # Wrap the bytes in a PIL image, which is what gets passed in `contents`
        img = Image.open(io.BytesIO(image_bytes))

        # NOTE(review): project id and region are hard-coded to one specific
        # Vertex AI project; consider moving them to configuration.
        # Alternative kept from the original: genai.Client(api_key=GOOGLE_API_KEY)
        client = genai.Client(location="us-central1", project="serious-water-454012-a9", vertexai=True)
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=["Extract russian text from this image. Return ONLY extracted text.", img],
        )
        if response and response.candidates:
            text = response.candidates[0].content.parts[0].text
            # A part without text yields None; callers expect a string.
            if text is not None:
                return text
        return "No result"

    except Exception as e:
        print(f"Google Gemini API Error: {e}")
        return "No result"

# Local DeepSeek usage
def ocr_deepseek(image_path):
    """Perform OCR using the locally loaded Deepseek Janus Pro model.

    Args:
        image_path: path of the image file to transcribe.

    Returns:
        The generated text, or "No result" when generation yields nothing or
        fails. The function never raises and never returns None.
    """
    try:
        # Decode to RGB right away and close the file handle; the converted
        # copy no longer depends on the open file.
        with Image.open(image_path) as image_file:
            image = image_file.convert("RGB")

        # Define inputs
        question = "Extract russian text from this image. Return ONLY extracted text."

        seed = 42
        top_p = 0.8
        temperature = 0.5

        # The helper expects a list of images
        pred_text = multimodal_understanding([image], question, seed, top_p, temperature)

        if pred_text:
            return pred_text

    except Exception as e:
        print(f"Deepseek Janus Error: {e}")

    # Reached on an exception AND on an empty prediction; the latter used to
    # fall off the end of the function and return None.
    return "No result"

# Run every OCR backend on every sampled image and score it with CER (JiWER)
cer_scores = []
models = {"GPT-4o": ocr_openai, "Claude 3.7": ocr_anthropic, "Gemini 2.0 Flash": ocr_google, "Deepseek Janus Pro": ocr_deepseek}

for _, row in df_sample.iterrows():
    img_path = row["filepath"]
    true_text = row["text"]

    # A missing reference is not a string (no .strip()) and an empty one gives
    # no meaningful error rate, so skip such rows before spending API calls.
    if not isinstance(true_text, str) or not true_text.strip():
        print(f"Skipping {img_path}: empty or non-text reference")
        continue

    for model_name, ocr_function in models.items():
        pred_text = ocr_function(img_path)  # Perform OCR

        # Backends signal failure with the string "No result", but guard against
        # a stray None so one bad reply cannot abort the whole run.
        if pred_text is None:
            pred_text = "No result"

        # Compute CER using JiWER (case-insensitive, surrounding whitespace ignored)
        cer_score = jiwer.cer(true_text.strip().lower(), pred_text.strip().lower())

        cer_scores.append({
            "Model": model_name,
            "Image": img_path,
            "CER": cer_score,
            "Reference": true_text,
            "Prediction": pred_text
        })

# Convert results to DataFrame
df_cer = pd.DataFrame(cer_scores)

###### 6 min

In [None]:
# Persist the per-sample CER table; mode "w" replaces any earlier file
df_cer.to_csv(
    path_or_buf="/kaggle/working/cer_results.csv",
    index=False,
    mode="w",
)

# Tell the reader where the file went
print("CER results saved: /kaggle/working/cer_results.csv")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from PIL import Image

# Function to Plot Best & Worst Examples
def plot_best_worst_examples(df_cer):
    """Show, for every model in `df_cer`, its lowest- and highest-CER sample side by side.

    Args:
        df_cer: DataFrame with the columns "Model", "Image" (path of the image
            file), "CER" and "Prediction".

    Models without any non-missing CER value are skipped with a message
    instead of raising. Nothing is returned; one figure is shown per model.
    """

    def plot_example(ax, example, title):
        """Draw one example image on `ax`, titled with its CER and prediction."""
        ax.axis("off")
        img_path = example["Image"]
        if not os.path.exists(img_path):
            print(f"Image not found: {img_path}")
            return  # Leave the panel blank for a missing image

        # Convert inside the context manager so the file handle is released
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("L")
        ax.imshow(img, cmap="gray")

        # str() keeps a missing (NaN/None) prediction from breaking the slice
        prediction = str(example["Prediction"])[:100]
        ax.set_title(f"{title}\nCER: {example['CER']:.2%}\nPrediction: {prediction}")

    for model_name in df_cer["Model"].unique():
        # Filter data for the current model
        model_data = df_cer[df_cer["Model"] == model_name]
        valid_cer = model_data["CER"].dropna()

        # idxmin/idxmax raise on an empty series, so bail out early
        if valid_cer.empty:
            print(f"No data available for model: {model_name}")
            continue

        # Find best (min CER) and worst (max CER) examples
        best_example = model_data.loc[valid_cer.idxmin()]
        worst_example = model_data.loc[valid_cer.idxmax()]

        # One figure per model with two explicit axes
        fig, (ax_best, ax_worst) = plt.subplots(1, 2, figsize=(12, 6))
        fig.suptitle(f"{model_name} - Best & Worst OCR Results")

        plot_example(ax_best, best_example, "Best (Min CER)")
        plot_example(ax_worst, worst_example, "Worst (Max CER)")

        plt.show()  # Show both images together
        plt.close(fig)  # Free the figure; one is created per model

# Show best & worst results for all models
# (renders a two-panel figure for each model present in df_cer)
plot_best_worst_examples(df_cer)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Box plot of the per-sample CER values, one box per model
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(
    [df_cer.loc[df_cer["Model"] == model, "CER"] for model in df_cer["Model"].unique()],
    labels=df_cer["Model"].unique(),
)
ax.set_title("CER Distribution Across Models")
ax.set_xlabel("Model")
ax.set_ylabel("Character Error Rate (CER)")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Mean CER per model, lowest (best) first
avg_cer_per_model = (
    df_cer.groupby("Model")["CER"]
    .mean()
    .reset_index()
    .sort_values(by="CER")
)

# Render the averages as a table on an otherwise empty axes
fig, ax = plt.subplots(figsize=(10, 3))
ax.axis("tight")
ax.axis("off")
table = ax.table(
    cellText=avg_cer_per_model.values,
    colLabels=avg_cer_per_model.columns,
    cellLoc="center",
    loc="center",
)
table.auto_set_font_size(False)
table.set_fontsize(10)
ax.set_title("CER Results Per Model")
plt.show()

# Export both tables; mode "w" replaces files from earlier runs
df_cer.to_csv("/kaggle/working/cer_per_sample.csv", index=False, mode="w")
avg_cer_per_model.to_csv("/kaggle/working/average_cer_per_model.csv", index=False, mode="w")

# Report the output locations
print(
    "\n".join(
        [
            "Results saved as CSV files:",
            "- CER per sample: /kaggle/working/cer_per_sample.csv",
            "- Average CER per model: /kaggle/working/average_cer_per_model.csv",
        ]
    )
)