## **MinerU2.5-2509-1.2B**

MinerU2.5 is a 1.2B-parameter vision-language model for document parsing that achieves state-of-the-art accuracy with high computational efficiency. It adopts a two-stage parsing strategy: first conducting efficient global layout analysis on downsampled images, then performing fine-grained content recognition on native-resolution crops for text, formulas, and tables. Supported by a large-scale, diverse data engine for pretraining and fine-tuning, MinerU2.5 consistently outperforms both general-purpose and domain-specific models across multiple benchmarks while maintaining low computational overhead.


### **Install packages**

In [None]:
%%capture
!pip install transformers einops torch fpdf timm av decord \
             git+https://github.com/huggingface/accelerate.git \
             git+https://github.com/huggingface/peft.git \
             transformers-stream-generator huggingface-hub albumentations \
             pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \
             python-docx torchvision safetensors matplotlib num2words \

!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \
              bitsandbytes reportlab
#Hold tight, this will take around 1-2 minutes.

### **Run Demo App**

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import os
import requests

# Registry of supported checkpoints: display name -> Hugging Face repo id.
MODEL_OPTIONS = {
    "MinerU2.5-2509-1.2B": "opendatalab/MinerU2.5-2509-1.2B",
}

# Eagerly load every registered checkpoint (and its processor) at import time.
# Both dicts are keyed by the display name used in MODEL_OPTIONS.
models = {}
processors = {}
for name, model_id in MODEL_OPTIONS.items():
    print(f"Loading {name}🤗. Hold tight, this will take around 4-6 minutes..")

    # Load the weights in half precision, then move them to the CUDA device.
    loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id, trust_remote_code=True, torch_dtype=torch.float16
    )
    loaded_model = loaded_model.to("cuda")

    # eval() returns the module itself, so the registry holds the same object.
    models[name] = loaded_model.eval()
    processors[name] = AutoProcessor.from_pretrained(
        model_id, trust_remote_code=True
    )

def run_inference(model, processor, image_path: str, text_input: str,
                  max_new_tokens: int = 2048) -> str:
    """
    Run a single image + text prompt through the model and return its reply.

    Args:
        model: The loaded Qwen2VLForConditionalGeneration model.
        processor: The loaded AutoProcessor.
        image_path (str): The local path to the input image.
        text_input (str): The text prompt to the model.
        max_new_tokens (int): Upper bound on the number of generated tokens.
            Defaults to 2048, which leaves room for potentially dense text.

    Returns:
        str: The generated text from the model, with surrounding whitespace
        stripped and the prompt tokens excluded.
    """
    # Single-turn chat message holding the image reference and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": text_input},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Only the image inputs are needed; the second return value is unused.
    image_inputs, _ = process_vision_info(messages)

    # Place the inputs on whatever device the model lives on instead of
    # assuming CUDA; fall back to "cuda" for objects without a `.device`.
    target_device = getattr(model, "device", "cuda")
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(target_device)

    # Generate token IDs
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # Use greedy decoding for more deterministic OCR
    )

    # Decode only the newly generated tokens, excluding the prompt
    input_token_len = inputs["input_ids"].shape[1]
    generated_text = processor.batch_decode(
        generated_ids[:, input_token_len:],
        skip_special_tokens=True,
    )[0]

    return generated_text.strip()

if __name__ == "__main__":
    # Select the model (only one option in this script)
    model_name = "MinerU2.5-2509-1.2B"
    model = models[model_name]
    processor = processors[model_name]

    # Set the image URL and the default prompt
    image_url = "https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/resolve/main/examples/2.jpeg"
    prompt = "Perform OCR on the image precisely."
    image_filename = "downloaded_example_image.jpeg"
    # Seconds to wait for the server before giving up; without a timeout,
    # requests can block indefinitely on an unresponsive host.
    download_timeout = 30

    # --- Main Execution ---
    try:
        # Download the image from the example URL
        print(f"Downloading example image from: {image_url}")
        # The context manager releases the streamed connection even if
        # writing the file fails part-way through.
        with requests.get(
            image_url, stream=True, timeout=download_timeout
        ) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # Save the image to a local file
            with open(image_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Image saved locally as: {image_filename}")

        # Run the inference
        print("\nPerforming OCR... This may take a moment.")
        ocr_result = run_inference(
            model=model,
            processor=processor,
            image_path=image_filename,
            text_input=prompt
        )

        # Print the final result
        print("\n" + "="*20 + " OCR Result " + "="*20)
        print(ocr_result)
        print("="*54 + "\n")

    except requests.exceptions.RequestException as e:
        # Also covers timeouts, which are a RequestException subclass.
        print(f"Error: Failed to download the image. Please check the URL and your internet connection. Details: {e}")
    except Exception as e:
        # Top-level boundary of the script: report and fall through to cleanup.
        print(f"An unexpected error occurred: {e}")
    finally:
        # Clean up by removing the downloaded image file
        if os.path.exists(image_filename):
            os.remove(image_filename)
            print(f"Cleaned up temporary file: {image_filename}")