In [1]:
!pip install bitsandbytes==0.45.5 transformers==4.51.3 qwen-vl-utils==0.0.11 accelerate -q

In [2]:
import torch
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)
# Assuming qwen_vl_utils.py is in the same directory or accessible in PYTHONPATH
# If not, you might need to copy its code or adjust the import.
# Prefer the official helper from the `qwen-vl-utils` package; only when it
# cannot be imported, fall back to the minimal local implementation below.
try:
    from qwen_vl_utils import process_vision_info
except ImportError:
    print("Warning: Could not import process_vision_info from qwen_vl_utils.")
    print("Please ensure the qwen_vl_utils.py file is available.")

    def process_vision_info(messages):
        """Minimal stand-in for ``qwen_vl_utils.process_vision_info``.

        Collects every image referenced in ``messages`` and returns it as an
        RGB PIL image. Only local file paths (optionally prefixed with
        ``file://``) and already-loaded PIL images are supported.

        NOTE(review): the official helper may apply extra preprocessing
        (e.g. resizing); this fallback does not, it only loads the pixels.

        Args:
            messages: Chat messages as a list of dicts. ``content`` may be a
                plain string (no vision items) or a list of item dicts such as
                ``{"type": "image", "image": <path or PIL image>}``.

        Returns:
            ``(image_inputs, video_inputs)``. ``image_inputs`` is a list of
            RGB PIL images, or ``None`` when the messages contain no images.
            ``video_inputs`` is always ``None`` here.

        Raises:
            NotImplementedError: If a message contains a video item.
            ValueError: If an image is given as a remote URL or data URI.
            TypeError: If an image source is neither a string nor a PIL image.
        """
        image_inputs = []
        for msg in messages:
            content = msg.get("content")
            # Plain-string content (e.g. a system prompt) has no vision items;
            # iterating over it would walk its characters and crash.
            if not isinstance(content, (list, tuple)):
                continue
            for item in content:
                if not isinstance(item, dict):
                    continue
                if item.get("type") == "video" or "video" in item:
                    raise NotImplementedError(
                        "The fallback process_vision_info cannot handle videos; "
                        "install qwen-vl-utils for video support."
                    )
                source = item.get("image", item.get("image_url"))
                if source is None:
                    continue
                if isinstance(source, str):
                    if source.startswith(("http://", "https://", "data:")):
                        raise ValueError(
                            "The fallback process_vision_info only supports local "
                            f"image files, got: {source[:60]!r}"
                        )
                    if source.startswith("file://"):
                        source = source[len("file://"):]
                    # Imported lazily so text-only conversations do not need Pillow.
                    from PIL import Image

                    # convert() returns a fully loaded copy, so the file handle
                    # can be closed as soon as the `with` block ends.
                    with Image.open(source) as img:
                        image_inputs.append(img.convert("RGB"))
                elif hasattr(source, "convert"):
                    # Already a PIL image (or compatible); just normalise the mode.
                    image_inputs.append(source.convert("RGB"))
                else:
                    raise TypeError(
                        f"Unsupported image source type: {type(source).__name__}"
                    )
        # Hand back None (not an empty list) for absent modalities so the
        # processor is not given an empty `images=`/`videos=` list.
        return (image_inputs or None), None

import time
import os
from PIL import Image # Needed for process_vision_info usually

# --- Configuration ---
# Hugging Face Hub ID of the Unsloth 4-bit build of Qwen2.5-VL-7B-Instruct.
MODEL_ID = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"

# --- Check for necessary libraries ---
# Fail fast, with an actionable install hint, before any download starts.
try:
    import bitsandbytes
except ImportError:
    raise ImportError(
        "bitsandbytes is required for 4-bit loading. "
        "Please install it: pip install bitsandbytes"
    )

try:
    import accelerate
except ImportError:
    raise ImportError(
        "accelerate is required for device_map='auto'. "
        "Please install it: pip install accelerate"
    )

# --- Model and Processor Loading (with explicit 4-bit config) ---
print(f"Loading model: {MODEL_ID} with 4-bit quantization...")

# Determine compute dtype (bfloat16 is preferred on Ampere+).
# The is_available() guard keeps this line safe on a machine without CUDA.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)
print(f"Using compute dtype: {compute_dtype}")

# Quantization settings are passed through a BitsAndBytesConfig object; the
# bare `load_in_4bit=` / `bnb_4bit_*=` keyword arguments to from_pretrained
# are deprecated.
# NOTE(review): the checkpoint name suggests it is already stored in bnb 4-bit
# form; if so, the settings saved with the checkpoint may take precedence over
# the values below. Confirm against the model's config.json.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit weights
    bnb_4bit_quant_type="nf4",             # NF4 quantization type
    bnb_4bit_use_double_quant=True,        # double quantization for extra memory savings
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for compute (bf16 or fp16)
)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate handle device placement
    # Optional: attn_implementation="flash_attention_2" (requires flash-attn to be installed).
)

print("Model loaded.")
print("Loading processor...")
# The processor doesn't need quantization parameters
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("Processor loaded.")

Loading model: unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit with 4-bit quantization...
Using compute dtype: torch.bfloat16


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/5.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Model loaded.
Loading processor...


preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Processor loaded.


In [3]:
# --- Image Path and Validation ---
IMAGE_PATH = "California -USA-_Front_f4dce8c3dfba3c893adafeec60cdd00d.jpg"
# IMPORTANT: Make sure this image path is correct and accessible.
# isfile() (rather than exists()) also rejects a directory of that name, so a
# bad path is reported here instead of failing later while loading the image.
if not os.path.isfile(IMAGE_PATH):
    raise FileNotFoundError(f"Image file not found at: {IMAGE_PATH}. Please upload it or correct the path.")
print(f"Using image: {IMAGE_PATH}")

# --- System Prompt for Full Text Extraction ---
# Instructions sent as the "system" turn: which ID-card fields to look for and
# the required output form (raw text only, layout preserved).
system_prompt_content = """You are a specialized ID document OCR system with 100% accuracy requirements.
Your sole purpose is to extract EVERY field and text element from identification documents.

IMPORTANT: You MUST extract and include the following fields if present:
- Document title/type (e.g., "DRIVER LICENSE", "ID CARD")
- State/Country indicators (e.g., "CALIFORNIA", "USA")
- ID Number
- Expiration date (EXP)
- Class designation (CLASS)
- END field (often displayed as "END NONE" or with restrictions)
- Last name (LN)
- First name (FN)
- Address (street, city, state, ZIP)
- Date of Birth (DOB)
- Restrictions (RSTR)
- Donor status
- Sex/Gender
- Physical characteristics (HAIR, EYES, HGT, WGT)
- Issue date (ISS)
- Document discriminator (DD)
- Any security features or ID numbers that appear on the card
- Any watermarks or overlaid text

Extract ABSOLUTELY ALL text, including small print and text that appears in the background or as watermarks.
Your output should be the raw text ONLY, maintaining the relative positioning as much as possible.
"""

# --- Message Construction ---
# Two-turn conversation: the system prompt above, then a user turn that pairs
# the image (given by path) with the OCR instruction text.
messages = [
    dict(role="system", content=system_prompt_content),
    dict(
        role="user",
        content=[
            # The path is passed as-is; process_vision_info is expected to load it.
            dict(type="image", image=IMAGE_PATH),
            dict(
                type="text",
                text=(
                    "Perform a complete OCR. "
                    "Extract every single word, number, symbol, and punctuation mark. "
                    "Maintain the original layout, including line breaks and spacing, as accurately as possible. "
                    "EXTRACT AND SAVE placeholders or sample data as well."
                ),
            ),
        ],
    ),
]

# --- Preparation for Inference ---
print("Preparing inputs...")

# 1. Render the conversation into one prompt string (not tokenized yet), with
#    the generation prompt appended.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# 2. Collect the image/video inputs referenced by the messages. The image is
#    given as a file path, so process_vision_info is expected to load it.
#    On failure, print a hint and re-raise so the real error is not hidden.
try:
    image_inputs, video_inputs = process_vision_info(messages)
except Exception as err:
    print(f"Error during process_vision_info: {err}")
    print("Check if the function expects a file path or a loaded PIL Image.")
    raise

# 3. Tokenize the prompt and combine it with the vision inputs into one batch
#    of PyTorch tensors.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

# 4. Inputs are moved explicitly: the device is read from the first model
#    parameter and the whole batch is sent there.
target_device = next(model.parameters()).device
print(f"Moving inputs to device: {target_device}")
inputs = inputs.to(target_device)
print("Inputs prepared and moved to device.")

# --- Inference: Generation of the output ---
print("Starting text generation...")
start_time = time.perf_counter()

# Greedy decoding (do_sample=False) keeps the OCR output deterministic.
# `temperature` and `top_k` are sampling-only parameters: with do_sample=False
# they have no effect on the result, so they are deliberately not passed here.
# max_new_tokens is a generous cap; raise it if the text is very dense.
# repetition_penalty is left at its default; adjust if repetitions are an issue.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=4096,
    do_sample=False,
    # Explicitly set the End Of Sequence token.
    eos_token_id=processor.tokenizer.eos_token_id,
    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id=(
        processor.tokenizer.pad_token_id
        if processor.tokenizer.pad_token_id is not None
        else processor.tokenizer.eos_token_id
    ),
)

# --- Post-processing ---
# Drop the leading prompt tokens from each generated row so that only the
# newly produced tokens are left.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

# Decode to text; the batch holds a single prompt, so take the first entry.
output_text_raw = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)[0]

# The timer was started just before generation, so this covers generate + decode.
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print("Generation complete.")

print(
    "\n--- Raw Model Output (Extracted Text) ---",
    output_text_raw,
    "-----------------------------------------",
    sep="\n",
)
print(f"\nTime taken: {elapsed_time:.4f} seconds")

Using image: California -USA-_Front_f4dce8c3dfba3c893adafeec60cdd00d.jpg
Preparing inputs...
Moving inputs to device: cuda:0
Inputs prepared and moved to device.
Starting text generation...




Generation complete.

--- Raw Model Output (Extracted Text) ---
CALIFORNIA USA DRIVER LICENSE
ID 11234568
EXP 08/31/2015
CLASS C
END NONE
LN CARDHOLDER
FN IMA
2570 24TH STREET
SACRAMENTO, CA 95818
DOB 08/31/1977
RSTR NONE
0831977
DONOR
SEX F
HAIR BRN
HGT 5'-05"
WGT 125 lb
EYES BRN
DD 09/30/201060221/21FD/15
ISS 09/30/2010

-----------------------------------------

Time taken: 22.8497 seconds
