# **tencent/POINTS-Reader**

[POINTS-Reader](https://huggingface.co/tencent/POINTS-Reader) is a powerful, distillation-free vision-language model from Tencent, built for end-to-end document conversion including complex layouts, tables, and formulas. Streamlined for efficiency, it employs a moderate-sized NaViT encoder with a Qwen2.5-3B-Instruct backbone, supporting straightforward image-to-text extraction in both Chinese and English, while achieving state-of-the-art accuracy on benchmarks like OmniDocBench. Leveraging a novel two-stage data augmentation and continuous self-evolution strategy, POINTS-Reader offers high throughput, easy deployment via frameworks like SGLang, and demonstrates robust self-improving document parsing for real-world and research applications.

`Requires an NVIDIA L4 (or more capable) GPU to run this notebook.`

### **Install package**

In [None]:
%%capture
!pip install git+https://github.com/huggingface/accelerate.git \
             git+https://github.com/WePOINTS/WePOINTS.git \
             git+https://github.com/huggingface/peft.git \
             transformers-stream-generator huggingface_hub albumentations \
             pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \
             python-docx torchvision safetensors matplotlib num2words \

!pip install xformers requests hf_xet spaces pyvips pillow transformers==4.55.2 \
             einops torch fpdf timm av decord bitsandbytes
#Hold tight, this will take around 1-2 minutes.

### **Run App**

In [None]:
import torch
import os
import traceback
from io import BytesIO
import uuid
import tempfile
import re
from PIL import Image
import requests

from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2VLImageProcessor

# --- Constants and Model Setup ---
# Token budget constant; it is not referenced anywhere else in the visible code.
MAX_INPUT_TOKEN_LENGTH = 4096

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Print a short environment report so a misconfigured runtime is easy to spot.
print("CUDA_VISIBLE_DEVICES=", os.getenv("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())

# `device` is "cuda" exactly when CUDA was reported as available above.
if device.type == "cuda":
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

print("Using device:", device)


# --- Model Loading: tencent/POINTS-Reader ---
MODEL_PATH = "tencent/POINTS-Reader"
print(f"Loading model: {MODEL_PATH}")

# Weights are loaded in bfloat16; switch `torch_dtype` to torch.float16 when the
# GPU has no bfloat16 support. `trust_remote_code=True` lets the repository's own
# Python code run locally, so only use it with a repository you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Tokenizer and image processor are fetched from the same repository; both are
# handed to `model.chat` by `process_document`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_PATH)

print("Model loaded successfully.")


# --- Core Application Logic ---
# Pillow image modes that the PNG writer accepts directly. Anything else
# (for example CMYK scans) is converted to RGB before being written to disk.
_PNG_WRITABLE_MODES = ("1", "L", "LA", "P", "RGB", "RGBA", "I", "I;16")


def _upscale_image(image, image_scale_factor):
    """Return ``image`` enlarged by ``image_scale_factor`` on a best-effort basis.

    Factors that are not greater than 1.0 leave the image untouched. If the
    resize itself fails, the problem is printed and the original image is
    returned so that processing can continue.
    """
    if not image_scale_factor > 1.0:
        return image

    try:
        original_width, original_height = image.size
        new_width = int(original_width * image_scale_factor)
        new_height = int(original_height * image_scale_factor)
        print(f"Scaling image from {image.size} to ({new_width}, {new_height}) with factor {image_scale_factor}.")
        # LANCZOS is a high-quality resampling filter, which helps keep small text legible.
        return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    except Exception as e:
        # Scaling is an optional enhancement: report the failure and fall back.
        print(f"Error during image scaling: {e}")
        return image


def process_document(
    image: "Image.Image",
    prompt_input: str,
    image_scale_factor: float = 1.0,
    max_new_tokens: int = 2048,
    temperature: float = 0.7,
    top_p: float = 0.8,
    top_k: int = 20,
    repetition_penalty: float = 1.05
) -> str:
    """
    Run tencent/POINTS-Reader on a single image and return the model's reply.

    Args:
        image: PIL image to process. ``None`` yields an error string.
        prompt_input: Instruction sent with the image. Empty or
            whitespace-only prompts yield an error string.
        image_scale_factor: Values greater than 1.0 upscale the image before
            inference; any other value leaves the image at its original size.
        max_new_tokens: Forwarded to the model as ``max_new_tokens``.
        temperature: Forwarded as ``temperature``; sampling is enabled only
            when this is greater than 0.
        top_p: Forwarded as ``top_p``.
        top_k: Forwarded as ``top_k``.
        repetition_penalty: Forwarded as ``repetition_penalty``.

    Returns:
        Whatever ``model.chat`` returns on success. Invalid input produces a
        string starting with ``"ERROR:"`` and failures during processing
        produce ``"An error occurred during processing: ..."``; exceptions
        raised while saving the image or running inference are caught and
        reported that way rather than propagated.
    """
    if image is None:
        print("Error: Please provide an image.")
        return "ERROR: Image not provided."
    if not prompt_input or not prompt_input.strip():
        print("Error: Please provide a prompt.")
        return "ERROR: Prompt not provided."

    image = _upscale_image(image, image_scale_factor)

    temp_image_path = None
    try:
        # PNG cannot store every Pillow mode; convert unsupported ones up front
        # instead of failing in `image.save`.
        if image.mode not in _PNG_WRITABLE_MODES:
            image = image.convert("RGB")

        # The image is handed to the model as a file path, so write it to a
        # uniquely named temporary file. `delete=False` keeps the file on disk
        # after the handle is closed; it is removed in `finally` below.
        with tempfile.NamedTemporaryFile(
            prefix="temp_image_", suffix=".png", delete=False
        ) as temp_file:
            # Record the path before saving so cleanup also runs if saving fails.
            temp_image_path = temp_file.name
            image.save(temp_file, format="PNG")

        messages = [
            {
                'role': 'user',
                'content': [
                    dict(type='image', image=temp_image_path),
                    dict(type='text', text=prompt_input)
                ]
            }
        ]

        generation_config = {
            'max_new_tokens': max_new_tokens,
            'repetition_penalty': repetition_penalty,
            'temperature': temperature,
            'top_p': top_p,
            'top_k': top_k,
            # A non-positive temperature switches sampling off.
            'do_sample': temperature > 0
        }

        return model.chat(
            messages,
            tokenizer,
            image_processor,
            generation_config
        )

    except Exception as e:
        traceback.print_exc()
        return f"An error occurred during processing: {e}"
    finally:
        # Always remove the temporary file; a cleanup failure is reported but
        # must not replace the result (or error string) being returned.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError as cleanup_error:
                print(f"Warning: could not remove temporary file {temp_image_path}: {cleanup_error}")

# --- Google Colab Usage Example ---
if __name__ == "__main__":
    # --- Step 1: Upload your image to Google Colab ---
    # You can do this by clicking the "Files" icon on the left sidebar and then "Upload to session storage".
    # Or you can load an image from a URL.

    # Example of loading an image from a URL
    try:
        # Replace this URL with the URL of your image or the path to your uploaded file
        image_url = "https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/resolve/main/examples/1.jpeg"
        # image_path = "/content/your_uploaded_image.png" # Example for an uploaded file

        print(f"Loading image from: {image_url}")
        # If loading from a URL:
        # - the timeout keeps the cell from hanging forever on a stalled connection;
        # - raise_for_status() turns HTTP errors (404, 500, ...) into an exception
        #   instead of passing an error page to PIL;
        # - response.content holds the fully downloaded, decoded body, unlike the
        #   raw socket stream.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        input_image = Image.open(BytesIO(response.content)).convert("RGB")

        # If loading from a file path in Colab
        # input_image = Image.open(image_path).convert("RGB")

        # --- Step 2: Define your prompt ---
        prompt = "Perform OCR on the image precisely."

        # --- Step 3: Set parameters (optional) ---
        # For better OCR on images with small text, you can increase the scale factor.
        # A factor of 1.5 or 2.0 often helps.
        scale_factor = 1.5

        # --- Step 4: Run the processing function ---
        print("\nProcessing document...")
        extracted_text = process_document(
            image=input_image,
            prompt_input=prompt,
            image_scale_factor=scale_factor
        )

        # --- Step 5: Print the raw output ---
        print("\n" + "="*50)
        print("          Raw Extracted Output")
        print("="*50)
        print(extracted_text)
        print("="*50)

    except Exception as e:
        # Top-level boundary of the example: report the problem with a hint
        # rather than showing a raw traceback to the notebook user.
        print(f"\nAn error occurred in the main execution block: {e}")
        print("Please ensure you have provided a valid image path or URL.")
        print("If using a file path, make sure the file is uploaded to your Colab session.")