In [None]:
!pip install --upgrade transformers bitsandbytes accelerate
!pip install git+https://github.com/huggingface/transformers.git@main
!pip install qwen-vl-utils

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import time

# Hub repository of the checkpoint to load (a bitsandbytes 4-bit build, per its name).
model_id = "unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit"

# Load the model and let device_map="auto" decide where the weights are placed.
# `trust_remote_code` is intentionally not passed: the model class is imported
# directly from transformers, so no code from the Hub repository has to be
# executed to build it.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,  # quantized checkpoints are commonly loaded with float16 (or no explicit dtype)
)

In [None]:
# Load the processor for the checkpoint. `trust_remote_code` is not needed here:
# nothing in this notebook relies on custom code from the Hub repository.
processor = AutoProcessor.from_pretrained(model_id)

# Reuse the processor's own tokenizer instead of loading a second, independent
# copy. That way the pad-token fallback below is applied to the very tokenizer
# the processor pads with when it is called with padding=True.
tokenizer = processor.tokenizer

# Fall back to EOS as the padding token when the checkpoint defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id



In [None]:
# Path or URL of the gazette image to transcribe.
image_path = "https://pvnzkoipzvifklzrxcui.supabase.co/storage/v1/object/public/trila//(208).jpg"

# Fields the model is asked to return in its JSON answer.
# NOTE(review): this is a minimal placeholder schema — the prompt previously
# announced a field list without naming any field. Adjust it to the structure
# the downstream consumer actually expects.
JSON_FIELDS = ("title", "date", "text")
field_list = ", ".join(f'"{name}"' for name in JSON_FIELDS)

# Single-turn, multi-modal user message: the image followed by the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_path,
            },
            {
                "type": "text",
                "text": (
                    "Extract the full text content from the gazette image below. "
                    f"Output the result as a JSON object with fields: {field_list}. "
                    "Please strictly follow the JSON format."
                ),
            },
        ],
    }
]


In [None]:
# Build the model inputs in three steps:
#   1. render the chat template into a plain prompt string (not tokenized yet,
#      with the generation prompt appended),
#   2. collect the image / video payloads referenced by `messages`,
#   3. run the processor on text + vision inputs and move the batch to the
#      device the model lives on.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)

inputs = processor(
    text=[text],  # batch containing a single prompt
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Quick sanity check on how long the prompt is.
print(f"Input IDs shape: {inputs.input_ids.shape}")

# Greedy (deterministic) decoding for OCR-style extraction.
# Sampling knobs such as `temperature` / `top_p` are deliberately not passed:
# they only take effect when do_sample=True, and supplying them together with
# do_sample=False has no effect on the output while triggering generation-flag
# warnings.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=8024,           # generous budget so long pages are not cut off
    do_sample=False,               # deterministic generation for stability
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Each generated sequence still starts with its prompt tokens; slice them off
# so only the newly produced tokens remain.
generated_ids_trimmed = []
for in_ids, out_ids in zip(inputs.input_ids, generated_ids):
    prompt_length = len(in_ids)
    generated_ids_trimmed.append(out_ids[prompt_length:])

# Convert the token ids back to text (expected to be a JSON string). A single
# prompt was submitted, so the first decoded entry is the answer.
decoded_batch = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
output_text = decoded_batch[0]

print("Generated JSON output:", output_text, sep="\n")

# Optional: parse the model output as JSON.
import json


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Chat models frequently wrap JSON answers as ```json ... ```, which
    ``json.loads`` rejects even though the payload itself is valid. Text that
    does not start with a fence is returned unchanged apart from surrounding
    whitespace being stripped.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped

    inner = stripped[3:]
    if inner.endswith("```"):
        inner = inner[:-3]

    # The opening fence line may be empty or carry a language tag ("json"); drop it.
    first_line, newline, remainder = inner.partition("\n")
    if newline and (not first_line.strip() or first_line.strip().isalnum()):
        inner = remainder
    return inner.strip()


try:
    parsed = json.loads(_strip_code_fence(output_text))
    print("\nParsed JSON structure:")
    print(parsed)
except json.JSONDecodeError as e:
    # Best effort only: report the problem instead of aborting the notebook run.
    print("\nWARNING: Output is not valid JSON")
    print(e)