In [3]:
# Reference 

import time
import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Silence the numpy "__array__ ... copy keyword" deprecation warning so it
# does not flood the cell output / API logs.
warnings.filterwarnings(
    action="ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

# Directory of the model to load (presumably an already-converted OpenVINO
# export, given export=False below).
model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/Qwen2.5-VL-7B-Instruct-int4_sym-ov"

# Runtime properties handed to OpenVINO when the model is loaded.
ov_config = dict(PERFORMANCE_HINT="LATENCY")

# Load the model onto the selected GPU without re-exporting it, then load the
# matching processor from the same directory.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.2",
    ov_config=ov_config,
)
processor = AutoProcessor.from_pretrained(model_id)


# Load the test image and normalise it to 3-channel RGB.
image_path = "dedication.png"
image = Image.open(image_path).convert("RGB")

# Single user turn: one image entry followed by the instruction text.
conversation = [
    {
        "role": "user",
        "content": [
            # The explicit "type" key marks this entry as an image; the PIL
            # object is kept alongside it as before.
            {"type": "image", "image": image},
            # Instruction only. The expected prompt below contains no "<image>"
            # tag (the template inserts its own vision placeholder), so a
            # literal "<image>" here would just end up as stray prompt text.
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Render the conversation into the prompt string the model expects.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

# Tokenise the prompt and preprocess the image in one call (PyTorch tensors).
inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")

# Print number of tokens
# print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: time generate() with a monotonic, high-resolution clock.
# (time.time() follows the wall clock and can jump if the system time changes.)
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Each output sequence starts with its prompt tokens; slice them off so only
# the newly generated tokens are decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Throughput for the first (and here only) sequence; guard against a zero
# elapsed time so the division cannot fail.
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0.0

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

ValueError: size must contain 'shortest_edge' and 'longest_edge' keys.

In [None]:
pip install optimum[openvino]+https://github.com/huggingface/optimum-intel

In [2]:
import time
import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Keep the numpy "__array__ ... copy keyword" deprecation warning out of the output.
warnings.filterwarnings(
    action="ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov"

# Runtime properties handed to OpenVINO when the model is loaded.
ov_config = dict(PERFORMANCE_HINT="LATENCY")

# export=False assumes the directory already holds a converted model.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.2",
    ov_config=ov_config,
)
processor = AutoProcessor.from_pretrained(model_id)

# Open the test image and force 3-channel RGB.
image_path = "dedication.png"
image = Image.open(image_path).convert("RGB")

# --- Build the model inputs ---

# Use the tokenizer's "beginning of image" token as the image marker in the text.
image_token = processor.tokenizer.boi_token

# Prompt layout: image marker, newline, instruction.
# NOTE(review): this raw prompt does not go through processor.apply_chat_template;
# confirm that is intended for this instruction-tuned ("-it") checkpoint.
text_prompt_with_placeholder = "{}\nDescribe this image.".format(image_token)

# One processor call handles both the text and the PIL image and returns
# PyTorch tensors. The tensors are used as returned; nothing is moved to a device here.
inputs = processor(
    images=[image],
    text=[text_prompt_with_placeholder],
    return_tensors="pt",
    padding=True,
)

# --- Inputs ready ---

# Report the prompt length in tokens (second axis of the input_ids tensor).
print(f"Input token length: {inputs.input_ids.shape[1]}")

# Inference: time generate() with a monotonic, high-resolution clock.
# (time.time() follows the wall clock and can jump if the system time changes.)
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Cut the prompt columns off the output so only new tokens remain.
input_ids_len = inputs.input_ids.shape[1]
generated_ids = output_ids[:, input_ids_len:]

# Decode the new tokens back to text, dropping special tokens.
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Throughput for the first sequence; the guard avoids dividing by zero.
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0

# batch_decode returns one string per sequence; merge them for printing.
final_output_text = "".join(output_text)

print(f"Generated text: {final_output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

Input token length: 265




Generated text: 
Generation time: 38.18 seconds
Tokens generated: 1024
Speed: 26.82 tokens/second


In [None]:
# working

import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Silence the numpy "__array__ ... copy keyword" deprecation warning so it
# does not flood the cell output / API logs.
warnings.filterwarnings(
    action="ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov"

# Runtime properties handed to OpenVINO when the model is loaded.
ov_config = dict(PERFORMANCE_HINT="LATENCY")

# Load the model onto the selected GPU without re-exporting it, then load the
# matching processor from the same directory.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.1",
    ov_config=ov_config,
)
processor = AutoProcessor.from_pretrained(model_id)

# Open the test image and force 3-channel RGB.
image_path = "dedication.png"
image = Image.open(image_path).convert("RGB")

# Single user turn: an image slot followed by the instruction text.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="image"),
            dict(type="text", text="Describe this image."),
        ],
    )
]

# Render the conversation into a prompt with the generation prompt appended.
# The exact prompt format is defined by this checkpoint's own chat template;
# print text_prompt to inspect it.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Tokenise the prompt and preprocess the image together (PyTorch tensors).
inputs = processor(
    images=[image],
    text=[text_prompt],
    return_tensors="pt",
    padding=True,
)

# Show how many tokens the processed prompt occupies.
print("Input token length: {}".format(len(inputs.input_ids[0])))

# Run generation, capped at 1024 new tokens.
output_ids = model.generate(max_new_tokens=1024, **inputs)

# Strip the prompt tokens from the front of every output sequence.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]

# Decode the remaining tokens to text, dropping special tokens.
output_text = processor.batch_decode(
    generated_ids,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

print("Generated text: {}".format(output_text))

Input token length: 273




Generated text: ['']


In [None]:
import base64
from PIL import Image
import io
# Open the image with PIL.
image_path = "dedication.png"
image = Image.open(image_path)

# Re-serialise it as PNG into an in-memory buffer.
buffered = io.BytesIO()
image.save(buffered, format="PNG")

# Base64-encode the PNG bytes and turn the result into a str.
img_str = str(base64.b64encode(buffered.getvalue()), "utf-8")

# Emit the full encoded string.
print("Base64 encoded image: {}".format(img_str))


In [None]:
# Reference 

import time
import warnings
import base64
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Silence the numpy "__array__ ... copy keyword" deprecation warning so it
# does not flood the cell output / API logs.
warnings.filterwarnings(
    action="ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2.5-VL-3B-Instruct-int4_sym-ov"

# Runtime properties handed to OpenVINO when the model is loaded.
ov_config = dict(PERFORMANCE_HINT="LATENCY")

# Load the model onto the selected GPU without re-exporting it, then load the
# matching processor from the same directory.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.1",
    ov_config=ov_config,
)
processor = AutoProcessor.from_pretrained(model_id)


# Stand-in for an image arriving in a request: read the file's raw bytes and
# base64-encode them into a str.
image_path = "dedication.png"
with open(image_path, mode="rb") as img_file:
    img_base64 = str(base64.b64encode(img_file.read()), "utf-8")

# Single user turn: the image embedded as a data URL, then the instruction.
conversation = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                image_url=dict(url="data:image/png;base64," + img_base64),
            ),
            dict(type="text", text="Describe this image."),
        ],
    )
]

# Collect every inline (data: URL) image from the user turns, in order.
images = []
for message in conversation:
    if message["role"] != "user":
        continue
    for content_item in message["content"]:
        if content_item.get("type") != "image" or "image_url" not in content_item:
            continue
        image_url = content_item["image_url"]["url"]
        if not image_url.startswith("data:"):
            continue
        # A data URL is "data:<mediatype>;base64,<payload>": everything after
        # the first comma is the payload. Fail with a clear error when the
        # comma is missing instead of an IndexError from a fallback split.
        _header, separator, base64_data = image_url.partition(",")
        if not separator:
            raise ValueError("Malformed data URL: missing ',' before the base64 payload")
        image_data = base64.b64decode(base64_data)
        # Convert to RGB, as the file-based cells in this notebook do.
        image = Image.open(BytesIO(image_data)).convert("RGB")
        images.append(image)

# Render the conversation into the prompt string the model expects.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

# Tokenise the prompt and preprocess the decoded images (PyTorch tensors).
inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors="pt")

# Report how many tokens the processed prompt occupies.
print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: time generate() with a monotonic, high-resolution clock.
# (time.time() follows the wall clock and can jump if the system time changes.)
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Each output sequence starts with its prompt tokens; slice them off so only
# the newly generated tokens are decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Throughput for the first (and here only) sequence; guard against a zero
# elapsed time so the division cannot fail.
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0.0

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

In [None]:
from transformers import AutoModelForSequenceClassification
import torch
import openvino as ov

# Load the PyTorch model and switch it to inference mode.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model.eval()

# Static example shape (batch=1, seq_len=128) and a matching tensor of random
# token ids used as the example input for conversion.
input_shape = [1, 128]
dummy_input = torch.randint(0, 100, input_shape)

# Convert directly to OpenVINO IR (no ONNX needed!).
ov_model = ov.convert_model(
    model,
    example_input=dummy_input,  # PyTorch models are traced with this sample input
    input=[input_shape],        # fixes the input shape; use -1 for a dynamic dimension
    share_weights=True,         # reduces memory footprint
)

# Save IR (xml + bin)
ov.save_model(ov_model, "bert_ir.xml")


In [None]:
# Reference 

import time
import warnings
import base64
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Silence the numpy "__array__ ... copy keyword" deprecation warning so it
# does not flood the cell output / API logs.
warnings.filterwarnings(
    action="ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2.5-VL-3B-Instruct-int4_sym-ov"

# Runtime properties handed to OpenVINO when the model is loaded.
ov_config = dict(PERFORMANCE_HINT="LATENCY")

# Load the model onto the selected GPU without re-exporting it, then load the
# matching processor from the same directory.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.1",
    ov_config=ov_config,
)
processor = AutoProcessor.from_pretrained(model_id)


# Stand-in for an image arriving in a request: read the file's raw bytes and
# base64-encode them into a str.
image_path = "dedication.png"
with open(image_path, mode="rb") as img_file:
    img_base64 = str(base64.b64encode(img_file.read()), "utf-8")

# Single user turn: the image embedded as a data URL, then the instruction.
conversation = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                image_url=dict(url="data:image/png;base64," + img_base64),
            ),
            dict(type="text", text="Describe this image."),
        ],
    )
]

# Collect every inline (data: URL) image from the user turns, in order.
images = []
for message in conversation:
    if message["role"] != "user":
        continue
    for content_item in message["content"]:
        if content_item.get("type") != "image" or "image_url" not in content_item:
            continue
        image_url = content_item["image_url"]["url"]
        if not image_url.startswith("data:"):
            continue
        # A data URL is "data:<mediatype>;base64,<payload>": everything after
        # the first comma is the payload. Fail with a clear error when the
        # comma is missing instead of an IndexError from a fallback split.
        _header, separator, base64_data = image_url.partition(",")
        if not separator:
            raise ValueError("Malformed data URL: missing ',' before the base64 payload")
        image_data = base64.b64decode(base64_data)
        # Convert to RGB, as the file-based cells in this notebook do.
        image = Image.open(BytesIO(image_data)).convert("RGB")
        images.append(image)

# Render the conversation into the prompt string the model expects.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

# Tokenise the prompt and preprocess the decoded images (PyTorch tensors).
inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors="pt")

# Report how many tokens the processed prompt occupies.
print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: time generate() with a monotonic, high-resolution clock.
# (time.time() follows the wall clock and can jump if the system time changes.)
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Each output sequence starts with its prompt tokens; slice them off so only
# the newly generated tokens are decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Throughput for the first (and here only) sequence; guard against a zero
# elapsed time so the division cannot fail.
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0.0

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")