In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="cpu"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "hotel.jpeg",
            },
            {
                "type": "text",
                "text": "Describe the image"
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# inputs = inputs.to("cpu")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['The image depicts a modern dining room with a large, rectangular wooden dining table surrounded by six upholstered chairs. The table is set with white plates, silverware, and neatly folded napkins. The chairs have a contemporary design with a mix of beige and brown tones. The room features large windows that allow natural light to flood in, and there is a plant on a shelf near the windows. The overall aesthetic is clean and minimalist, with a focus on modern design elements.']


Foolow up with the list

In [4]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="cpu"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Initialize memory to store chat history (up to 10 chats)
memory = []

def add_to_memory(new_message, memory_limit=10):
    """Add new message to memory, maintaining the limit of memory."""
    memory.append(new_message)
    if len(memory) > memory_limit:
        memory.pop(0)  # Remove the oldest message to keep the memory size in check

def generate_response(image_path=None, user_text=None):
    # Add the new user message to memory
    new_message = {
        "role": "user",
        "content": []
    }
    
    # Add image if provided
    if image_path:
        new_message["content"].append({"type": "image", "image": image_path})
    
    # Add text if provided
    if user_text:
        new_message["content"].append({"type": "text", "text": user_text})
    
    # Add this message to the chat memory
    add_to_memory(new_message)
    
    # Process memory for the model input
    text_input = processor.apply_chat_template(memory, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(memory)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    
    # Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    
    # Add the model's response to memory
    response_message = {
        "role": "assistant",
        "content": [{"type": "text", "text": output_text[0]}]
    }
    add_to_memory(response_message)

    return output_text[0]

# Example usage
# You can replace "hotel.jpeg" with an actual image path and "Describe the image" with any question
response = generate_response(image_path="hotel.jpeg", user_text="Describe the image")
print(response)

# Continue chatting
followup_response = generate_response(user_text="Can you suggest similar places?")
print(followup_response)


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

The image depicts a modern dining room with a large, rectangular wooden dining table surrounded by six upholstered chairs. The table is set with white plates, silverware, and neatly folded napkins. The chairs have a contemporary design with a mix of beige and brown tones. The room features large windows that allow natural light to flood in, and there is a plant on a shelf near the windows. The overall aesthetic is clean and minimalist, with a focus on modern design elements.
Certainly! Here are some similar places you might enjoy:

1. **Modern Dining Room with Large Windows**: This type of room is perfect for those who love natural light and a spacious, open feel. You can find similar designs in upscale homes, modern apartments, or even in some hotels and restaurants.

2. **Contemporary Furniture**: The upholstered chairs and wooden dining table are key elements in this room. You can find similar furniture at home decor stores, online retailers like Wayfair or IKEA, or even at local fu

In [5]:
memory

[{'role': 'user',
  'content': [{'type': 'image', 'image': 'hotel.jpeg'},
   {'type': 'text', 'text': 'Describe the image'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': 'The image depicts a modern dining room with a large, rectangular wooden dining table surrounded by six upholstered chairs. The table is set with white plates, silverware, and neatly folded napkins. The chairs have a contemporary design with a mix of beige and brown tones. The room features large windows that allow natural light to flood in, and there is a plant on a shelf near the windows. The overall aesthetic is clean and minimalist, with a focus on modern design elements.'}]},
 {'role': 'user',
  'content': [{'type': 'text', 'text': 'Can you suggest similar places?'}]},
 {'role': 'assistant',
  'content': [{'type': 'text',
    'text': 'Certainly! Here are some similar places you might enjoy:\n\n1. **Modern Dining Room with Large Windows**: This type of room is perfect for those who love natur

ConversationBufferWindowMemory

In [21]:
import torch
from torchvision import transforms

# Define image transformation pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resizing image to expected size
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet values
])

# Open the image and apply the transformations
image = Image.open("hotel.jpeg")
image_tensor = transform(image).unsqueeze(0)  # Add batch dimension

In [24]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from langchain.llms import BaseLLM
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferWindowMemory
from PIL import Image

# Load Qwen2-VL model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="cpu")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

class Qwen2VLWrapper(BaseLLM):
    def __init__(self, model, processor):
        super().__init__()
        object.__setattr__(self, 'model', model)  # Explicitly bypass Pydantic
        object.__setattr__(self, 'processor', processor)  # Explicitly bypass Pydantic

    def _generate(self, prompt: str, image=None, **kwargs) -> str:
        # Check that prompt is a valid string
        if not isinstance(prompt, str):
            raise TypeError(f"Expected prompt to be a string, but got {type(prompt)}")
        
        # Check image format
        if image:
            assert isinstance(image, (PIL.Image.Image, torch.Tensor)), "Image must be a PIL Image or a Torch Tensor"

        # Process inputs for the model
        inputs = self.processor(
            text=[prompt],  # Ensure prompt is passed in a list
            images=[image] if image else None,  # Ensure image is passed in a list
            padding=True,
            return_tensors="pt"
        )

        # Generate response
        generated_ids = self.model.generate(**inputs, max_new_tokens=128)
        return self.processor.decode(generated_ids[0], skip_special_tokens=True)


    def _llm_type(self) -> str:
        return "qwen2vl"

    def __call__(self, prompt: str, image=None, **kwargs) -> str:
        return self._generate(prompt, image=image, **kwargs)

# Initialize memory
memory = ConversationBufferWindowMemory(window_size=10)

# Initialize the custom wrapper
qwen_wrapper = Qwen2VLWrapper(model=model, processor=processor)

# Initialize ConversationChain with the custom wrapper
conversation_chain = ConversationChain(
    llm=qwen_wrapper,
    memory=memory,
    verbose=True
)

# Generate Responses
def generate_response(user_text, image_path=None):
    if image_path:
        image = Image.open(image_path)
        prompt = f"{user_text} <|image|>"  # Include <|image|> token to process image input
        image_tensor = transform(image).unsqueeze(0)  # Convert image to tensor
        print(image_tensor)
    else:
        prompt = user_text
        image_tensor = None

    # Generate response from conversation chain
    response = conversation_chain.run({"input": prompt, "image": image_tensor})  # Send image along with prompt
    return response


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

TypeError: argument of type 'NoneType' is not iterable

In [23]:

# Example usage
initial_response = generate_response(user_text="Describe the image", image_path="hotel.jpeg")
print("Initial Response:", initial_response)

follow_up_response = generate_response(user_text="Can you tell me more about the room?")
print("Follow-Up Response:", follow_up_response)



tensor([[[[-0.1486, -0.1486, -0.1143,  ..., -0.1828, -0.2342, -0.2342],
          [-0.1657, -0.1657, -0.1314,  ..., -0.1314, -0.1143, -0.1143],
          [-0.2171, -0.2171, -0.1999,  ..., -0.0972, -0.0287, -0.0458],
          ...,
          [-0.9705, -0.9534, -0.9534,  ..., -1.0219, -1.0390, -1.0390],
          [-0.9705, -0.9534, -0.9534,  ..., -1.0390, -1.0390, -1.0390],
          [-0.9705, -0.9534, -0.9534,  ..., -1.0562, -1.0562, -1.0562]],

         [[-0.6702, -0.6702, -0.6702,  ..., -0.8803, -0.9153, -0.8803],
          [-0.6527, -0.6527, -0.6527,  ..., -0.8452, -0.7927, -0.7577],
          [-0.6352, -0.6352, -0.6352,  ..., -0.7927, -0.7052, -0.6877],
          ...,
          [-1.5105, -1.4930, -1.4930,  ..., -1.3704, -1.4230, -1.4230],
          [-1.5105, -1.4930, -1.4930,  ..., -1.4055, -1.4230, -1.4230],
          [-1.5105, -1.4930, -1.4930,  ..., -1.4230, -1.4405, -1.4405]],

         [[-0.8981, -0.8981, -0.8981,  ..., -1.0550, -1.1073, -1.0898],
          [-0.8284, -0.8284, -

  response = conversation_chain.run({"input": prompt, "image": image_tensor})  # Send image along with prompt


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]