In [2]:
# Reference 

import time
import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Suppress specific deprecation warnings from optimum implementation of numpy arrays.
# This keeps the API logs from being clogged by the same message.
warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")


# Local directory that both the model and its processor are loaded from.
model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2-VL-7B-Instruct-int4_asym-ov"


ov_config = {"PERFORMANCE_HINT": "LATENCY"}
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device="GPU.0", ov_config=ov_config)
processor = AutoProcessor.from_pretrained(model_id)


image_path = "dedication.png"
# Image.open() is lazy and keeps the underlying file open. Force the pixel data
# into memory inside a context manager so the file handle is released here
# instead of lingering for the lifetime of the kernel. The loaded image stays
# usable after the `with` block.
with Image.open(image_path) as image:
    image.load()

conversation = [
    {
        "role": "user",
        "content": [
            {
                # Tagged explicitly with its type, like the text item below.
                "type": "image",
                "image": image,  # The image object is passed here, not just declared as a type
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")

# Print tokenizer length
print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: generation of the output with performance metrics.
# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# if the system clock is adjusted while generation is running.
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Drop the prompt tokens from the front of every returned sequence so that only
# the newly generated tokens are decoded and counted.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Calculate tokens per second (first, and only, sequence in the batch)
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Input token length: 547
Generated text: ['The image is a scanned page from a book, specifically page number xxiii. The text is written in English and appears to be a dedication or introductory note. The page is divided into two main sections, with the first section titled "Dedication" and the second section containing a series of statements and reflections.\n\n### Markdown Format of the Text:\n\n```markdown\n**Dedication.** xxiii\n\ntaking the word \'law\' in two different senses. You are confounding, as Hooker and Montes- quieu confounded, the laws which depend for their validity on rewards and punishments, with the laws which are generalizations from phenomena." I reply, "It is true that I am using the word \'law\' as Hooker and Montes- quieu used it. I am using it so deliberately. For I believe that those who make moral laws depend upon the rewards and punishments which enforce them are destroying morality; and that those who make physical laws into mere generalizations from phenome

In [None]:
# working

import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Suppress specific deprecation warnings from optimum implementation of numpy arrays.
# This keeps the API logs from being clogged by the same message.
warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")


# Local directory that both the model and its processor are loaded from.
model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2-VL-7B-Instruct-int4_asym-ov"


ov_config = {"PERFORMANCE_HINT": "LATENCY"}
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device="GPU.0", ov_config=ov_config)
processor = AutoProcessor.from_pretrained(model_id)


image_path = "dedication.png"
# Image.open() is lazy and keeps the underlying file open. Force the pixel data
# into memory inside a context manager so the file handle is released here
# instead of lingering for the lifetime of the kernel. The loaded image stays
# usable after the `with` block.
with Image.open(image_path) as image:
    image.load()

# The conversation only declares *where* the image goes; the pixels themselves
# are handed to the processor call further down.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")

# Print tokenizer length
print(f"Input token length: {len(inputs.input_ids[0])}")

# Generate output
output_ids = model.generate(**inputs, max_new_tokens=1024)

# Drop the prompt tokens from the front of every returned sequence so that only
# the newly generated tokens are decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

print(f"Generated text: {output_text}")

In [3]:
import base64
from PIL import Image
import io
image_path = "dedication.png"

# Re-encode the image as PNG into an in-memory buffer and base64 it.
# Image.open() keeps the source file open until the image is closed, so the
# save happens inside a context manager that releases the handle right after
# the pixels have been written to the buffer.
buffered = io.BytesIO()
with Image.open(image_path) as image:
    image.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')

# Print the base64 encoding
print(f"Base64 encoded image: {img_str}")


Base64 encoded image: iVBORw0KGgoAAAANSUhEUgAAAf4AAAM2CAAAAAAyIL4JAAEAAElEQVR4nOydZ2AU1deHn23JpnfSExJC772XUKT3joB0UEAFRFEEBUFAUZEivSkColRBeu+991ASIBAS0nt297wfNp1AApK/+ibPB5iZe26Z/U1m7ty55x6FUEjBRflPN6CQf5JC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+Qs0hfIXaArlL9AUyl+gKZS/QFMof4GmUP4CTaH8BZpC+f+tXDscB1y6C3BkUWz+VKLOn2Lzn0fDb5qUA48ijm3tsyXJ/vPQyYf4mY7vvaSEa9s7+uZnC/8e9yonTxvHkUbK39uzsh8xY/Kllv+s/LcvWriu02FmSLL5rXnWpLheIRA/gfeXqntkvzQAOD/j/Tow9q8du/8XLX09HiRzDS7q9Y9hM+TXhSr/YYrg+yxwJo4B2Y4n/A77RbpRIjHHfFVYJyLt6JzfDfwbxLasfU3kXuu+MSIhQ1bmUy3/ZfnPa1ggYmjHB9lT5uCWLBLw+Z0c8+3G2SAi5764m88N/Pfzn735Az+kWHYARcUty2cBPPu5Vu3UlL300ECxrwCIu+TrDBBnkZq6im4KoHJlAF2Yy/+yzTmwLw5FTVXC3chqrmFJYPbc8yoyLj6qWj5V/k9ff69PuBn+IiKz4KaI7iNLrE+IhDZxmRWrVpwVWdH3hIgkj7N11kwUkaNO9QOGNYuRPZ878OENeTRkwDMRud9KVWq2XmR6q/WJ696e/w+cxi0Ay1JAsRMmQDF/Fx

In [1]:
# Reference 

import time
import warnings
import base64
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Suppress specific deprecation warnings from optimum implementation of numpy arrays.
# This keeps the API logs from being clogged by the same message.
warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")


# Local directory that both the model and its processor are loaded from.
model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2-VL-7B-Instruct-int4_asym-ov"


ov_config = {"PERFORMANCE_HINT": "LATENCY"}
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device="GPU.1", ov_config=ov_config)
processor = AutoProcessor.from_pretrained(model_id)


# Example base64 encoded image (in a real scenario, this would come from the request)
image_path = "dedication.png"
with open(image_path, "rb") as img_file:
    img_base64 = base64.b64encode(img_file.read()).decode('utf-8')

# Create conversation with base64 image
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image_url": {
                    "url": f"data:image/png;base64,{img_base64}"
                }
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Extract and decode the base64 image(s) from the conversation, in order.
images = []
for message in conversation:
    if message["role"] != "user":
        continue
    for content_item in message["content"]:
        if content_item.get("type") != "image" or "image_url" not in content_item:
            continue

        image_url = content_item["image_url"]["url"]
        if not image_url.startswith("data:"):
            # Previously such items were skipped silently, which left `images`
            # shorter than the number of image items in the conversation that
            # is passed to the chat template. Fail here with a clear message.
            raise ValueError(
                "Only base64 'data:' image URLs are supported, got: "
                f"{image_url[:40]!r}"
            )

        # A data URL is '<header>,<payload>'. The old fallback split on
        # ';base64,' could never succeed when no comma was present (that
        # separator itself contains a comma) and raised a bare IndexError.
        _header, separator, base64_data = image_url.partition(",")
        if not separator:
            raise ValueError("Malformed data URL: missing ',' before the base64 payload")

        # Convert base64 to image
        image_data = base64.b64decode(base64_data)
        image = Image.open(BytesIO(image_data))
        images.append(image)

# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors="pt")

# Print tokenizer length
print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: generation of the output with performance metrics.
# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# if the system clock is adjusted while generation is running.
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Drop the prompt tokens from the front of every returned sequence so that only
# the newly generated tokens are decoded and counted.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Calculate tokens per second (first, and only, sequence in the batch)
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Input token length: 547
Generated text: ['The image is a scanned page from a book, specifically page number xxiii. The text is written in English and appears to be a dedication or introductory note. The page is divided into two main sections, with the first section titled "Dedication" and the second section containing a series of statements and reflections.\n\n### Markdown Format of the Text:\n\n```markdown\n**Dedication.** xxiii\n\ntaking the word \'law\' in two different senses. You are confounding, as Hooker and Montes- quieu confounded, the laws which depend for their validity on rewards and punishments, with the laws which are generalizations from phenomena." I reply, "It is true that I am using the word \'law\' as Hooker and Montes- quieu used it. I am using it so deliberately. For I believe that those who make moral laws depend upon the rewards and punishments which enforce them are destroying morality; and that those who make physical laws into mere generalizations from phenome

In [1]:
from optimum.intel import OVDiffusionPipeline

# Text-to-image with a pre-converted OpenVINO LCM pipeline on the second GPU.
#
# NOTE(review): the recorded run of this cell ended in an OSError complaining
# that no weight file was found in the snapshot's `safety_checker` directory.
# Passing `safety_checker=None` is the usual diffusers-style workaround, but
# whether this pipeline class honors it is unverified -- TODO confirm before
# relying on this cell.
model_id = "OpenVINO/LCM_Dreamshaper_v7-int8-ov"
pipeline = OVDiffusionPipeline.from_pretrained(
    model_id,
    export=False,
    device="GPU.1",
)

prompt = "sailing ship in storm by Rembrandt"
# Run 4 inference steps and keep the generated images.
images = pipeline(
    prompt,
    num_inference_steps=4,
).images


  from .autonotebook import tqdm as notebook_tqdm
Fetching 24 files: 100%|██████████| 24/24 [00:10<00:00,  2.36it/s]


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /home/echo/.cache/huggingface/hub/models--OpenVINO--LCM_Dreamshaper_v7-int8-ov/snapshots/340a03e5488f36f7f222e6efa29819850ad7e4be/safety_checker.