In [5]:
import time
import warnings
from PIL import Image
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForVisualCausalLM

# Silence one specific, repeatedly emitted deprecation message (matched by its
# text) so it does not drown out the useful output below.
warnings.filterwarnings(
    "ignore",
    message="__array__ implementation doesn't accept a copy keyword",
)

# OpenVINO runtime configuration: request the latency-oriented performance hint.
ov_config = {"PERFORMANCE_HINT": "LATENCY"}

# Local model directory. Going by its name this is an int4 (asymmetric)
# OpenVINO build of Qwen2-VL-7B-Instruct -- NOTE(review): confirm.
model_id = "/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2-VL-7B-Instruct-int4_asym-ov"

# Load the model onto the first GPU device ("GPU.0"). export=False is passed
# because the directory is presumably already in OpenVINO format, so no
# conversion should be needed at load time -- NOTE(review): confirm.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=False,
    device="GPU.0",
    ov_config=ov_config,
)

# The processor is loaded from the same directory as the model.
processor = AutoProcessor.from_pretrained(model_id)


image_path = "dedication.png"

# Image.open() is lazy and keeps the underlying file open. Copy the decoded
# pixels while the file is open so the handle is released deterministically
# when the `with` block exits, instead of at some later, implicit point.
with Image.open(image_path) as _image_file:
    image = _image_file.copy()

# Single-turn chat: one user message holding an image placeholder followed by
# the text instruction. The actual pixels are supplied separately to the
# processor call below via `images=[image]`.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

# Batch of one: a single prompt paired with a single image, returned as
# PyTorch tensors ("pt").
inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")

# Report how many input tokens the first (and only) sequence in the batch has.
print(f"Input token length: {len(inputs.input_ids[0])}")

# Inference: generate the output and measure how long it takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() is wall-clock and can jump if the system
# clock is adjusted while generation is running.
start_time = time.perf_counter()
output_ids = model.generate(**inputs, max_new_tokens=1024)
generation_time = time.perf_counter() - start_time

# Drop the leading prompt-length slice of every returned sequence so that only
# the newly generated tokens are decoded and counted.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Throughput of the first sequence: generated tokens divided by elapsed time.
# Note the elapsed time covers the whole generate() call.
num_tokens_generated = len(generated_ids[0])
tokens_per_second = num_tokens_generated / generation_time

print(f"Generated text: {output_text}")
print(f"Generation time: {generation_time:.2f} seconds")
print(f"Tokens generated: {num_tokens_generated}")
print(f"Speed: {tokens_per_second:.2f} tokens/second")

Input token length: 547
Generated text: ['The image is a scanned page from a book, specifically page number xxiii. The text is written in English and appears to be a dedication or introductory note. The page is divided into two main sections, with the first section titled "Dedication" and the second section containing a series of statements and reflections.\n\n### Markdown Format of the Text:\n\n```markdown\n**Dedication.** xxiii\n\ntaking the word \'law\' in two different senses. You are confounding, as Hooker and Montes- quieu confounded, the laws which depend for their validity on rewards and punishments, with the laws which are generalizations from phenomena." I reply, "It is true that I am using the word \'law\' as Hooker and Montes- quieu used it. I am using it so deliberately. For I believe that those who make moral laws depend upon the rewards and punishments which enforce them are destroying morality; and that those who make physical laws into mere generalizations from phenome

In [None]:
!pip list

In [None]:
import gradio as gr
import time

def echo(message, history, system_prompt, tokens):
    """Stream a canned reply one character at a time.

    The reply text is built from ``system_prompt`` and ``message``; it is
    yielded as progressively longer prefixes, pausing 50 ms before each one.

    Args:
        message: Text of the latest user message.
        history: Previous conversation; accepted but not used here.
        system_prompt: Value echoed back at the start of the reply.
        tokens: Upper bound on the number of characters streamed; converted
            with ``int()``.

    Yields:
        str: The first 1, 2, ... characters of the reply, up to
        ``min(len(reply), int(tokens))`` characters.
    """
    reply = f"System prompt: {system_prompt}\n Message: {message}."
    limit = min(len(reply), int(tokens))
    for end in range(1, limit + 1):
        time.sleep(0.05)
        yield reply[:end]

# Extra controls for the chat UI. Their order in `additional_inputs` lines up
# with the `system_prompt` and `tokens` parameters of `echo`, which follow
# `message` and `history`.
system_prompt_box = gr.Textbox(value="You are helpful AI.", label="System Prompt")
token_slider = gr.Slider(minimum=10, maximum=100)

# Chat interface driven by the streaming `echo` generator.
demo = gr.ChatInterface(
    fn=echo,
    type="messages",
    additional_inputs=[system_prompt_box, token_slider],
)

demo.launch()
