In [None]:
import torch
from PIL import Image, ImageGrab
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from transformers import TextStreamer
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process before any model or
# processor is created (presumably to avoid the Hugging Face tokenizers
# fork/parallelism warning -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# One checkpoint id shared by the processor and the model, so the tokenizer /
# image-preprocessing config always belongs to the weights being run. Loading
# them from two different checkpoints risks silently mismatched preprocessing.
MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

# Longest image edge handed to the processor, written as a multiple of 512.
IMAGE_LONGEST_EDGE = 1 * 512

# Pick the best available backend instead of assuming Apple Silicon:
# MPS first, then CUDA, then plain CPU so the notebook still runs anywhere.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Initialize processor, output streamer and model.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    size={"longest_edge": IMAGE_LONGEST_EDGE},
)
# skip_prompt=False: the prompt text is not filtered out of the streamed output.
streamer = TextStreamer(processor, skip_prompt=False)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    # Documented public kwarg (instead of the private `_attn_implementation`);
    # flash-attention is only requested on CUDA, everything else uses eager.
    attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    tie_word_embeddings=True,
    low_cpu_mem_usage=True,
).to(DEVICE)

In [None]:
# Grab the current screen contents; this image is the visual input.
# (To use a remote picture instead, replace `screenshot` below with
#  load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg").)
screenshot = ImageGrab.grab()
image = screenshot

# A single user turn: one image placeholder followed by the question text.
user_content = [
    {"type": "image"},
    {"type": "text", "text": "Who is the current prime minister of canada?"},
]
messages = [{"role": "user", "content": user_content}]

# Render the conversation with the processor's chat template (ending with the
# generation prompt) and prepend the literal "<|endoftext|>" token string.
chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)
prompt = "<|endoftext|>" + chat_text

# Tokenize the text, preprocess the image, and move the tensors to the model's device.
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

# Generate at most 32 new tokens; `streamer` receives the output as it is produced.
generated_ids = model.generate(**inputs, streamer=streamer, max_new_tokens=32)

# Decoded copy of the full sequences with special tokens stripped;
# generated_texts[0] is the text for the single prompt above.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Show the captured screenshot (`image`) via its .show() method, to check
# what picture was actually handed to the model.
image.show()

In [None]:
# Inspect the chat template used by processor.apply_chat_template
# (bare expression: the notebook displays it as the cell's output).
processor.chat_template