In [None]:
%%capture
!pip install gradio==6.3.0 git+https://github.com/huggingface/transformers.git
!pip install opencv-python torch torchvision pillow accelerate easydict spaces

In [None]:
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
import spaces

# Hugging Face Hub repository that the model weights and processor are loaded from.
MODEL_PATH = "strangervisionhf/PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0"
# Use the GPU when PyTorch can see one; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model: {MODEL_PATH} on {DEVICE}...")

# The model and processor are loaded once at import time and shared by every
# request handled by the UI below.
try:
    # NOTE(review): trust_remote_code=True allows Python code shipped in the Hub
    # repository to run locally. Keep it only while MODEL_PATH is a trusted repo.
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_PATH,
        # `dtype` is the current keyword; `torch_dtype` is its deprecated alias.
        dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to(DEVICE).eval()

    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
except Exception as e:
    # Surface a short message in the notebook output, then re-raise the
    # exception unchanged so the cell still fails with its full traceback.
    print(f"Error loading model: {e}")
    raise
else:
    print("Model loaded successfully.")

@spaces.GPU
def process_ocr(image):
    """Extract text from an uploaded image with the module-level model.

    Args:
        image: The image supplied by the UI, or ``None`` when nothing was
            uploaded. It must support ``.convert("RGB")`` (e.g. a PIL image).

    Returns:
        The decoded text generated by the model for the image, or a short
        instruction string when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    rgb_image = image.convert("RGB")

    # Bounds forwarded to the image processor through its "size" option.
    # NOTE(review): presumably a total-pixel budget expressed in 28x28 patches;
    # confirm against the processor's documentation.
    pixel_floor = 256 * 28 * 28
    pixel_ceiling = 1280 * 28 * 28
    resize_spec = {
        "shortest_edge": pixel_floor,
        "longest_edge": pixel_ceiling,
    }

    # A single user turn: the image followed by the OCR instruction.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": rgb_image},
            {"type": "text", "text": "OCR:"},
        ],
    }

    batch = processor.apply_chat_template(
        [user_turn],
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        images_kwargs={"size": resize_spec},
    )
    batch = batch.to(model.device)

    # Gradients are never needed for generation.
    with torch.no_grad():
        sequences = model.generate(**batch, max_new_tokens=512)

    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    prompt_length = batch["input_ids"].shape[-1]
    completion_ids = sequences[0][prompt_length:]
    return processor.decode(completion_ids, skip_special_tokens=True)

# UI definition: a title, then one row with two columns. The left column holds
# the image input and the trigger button; the right column shows the result.
with gr.Blocks() as demo:
    gr.Markdown(value="# **PaddleOCR-VL-1.5 (Free-OCR)**")

    with gr.Row():
        # Left column: image upload (handed to the callback as a PIL image)
        # and the button that starts extraction.
        with gr.Column():
            input_image = gr.Image(label="Input Image", type="pil")
            run_btn = gr.Button(value="Extract Text", variant="primary")

        # Right column: text area that receives the callback's return value.
        with gr.Column():
            output_text = gr.Textbox(lines=15, label="Extracted Text")

    # Clicking the button runs process_ocr on the image and writes the
    # returned string into the text box.
    run_btn.click(process_ocr, inputs=input_image, outputs=output_text)

# Start the app only when this code runs as the main module, not on import.
if __name__ == "__main__":
    # Call queue() on the Blocks app first, then launch() to start serving it.
    demo.queue().launch()