# Task 14.1 Solutions: Vision-Language Models Demo

This notebook contains solutions to the exercises in the Vision-Language Demo notebook.

---

## Challenge Solution: Multi-Image Comparison Tool

The challenge was to build a function that:
1. Takes two images as input
2. Describes each image separately
3. Compares and contrasts the two images
4. Suggests which image is better for a given purpose

### Approach
We use a two-step strategy:
1. **Individual Analysis**: First, analyze each image separately to get detailed descriptions
2. **Combined Comparison**: Create a side-by-side composite image and ask the VLM to compare "left" vs "right"

This approach works better than comparing the two text descriptions alone, because the VLM sees both images at once and can notice visual differences that are hard to capture in words (such as color vibrancy or composition balance).

### Alternative Approach (Also Shown)
We also demonstrate using CLIP embeddings to score each image against specific text criteria. This gives numerical image-text similarity scores that can be compared programmatically.

In [None]:
import torch
from PIL import Image
from typing import Dict
import gc

def clear_gpu_memory():
    """Run Python garbage collection and, when CUDA is available, empty the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        # Nothing GPU-side to release on a CPU-only host.
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [None]:
# Load the VLM
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Single source of truth for the checkpoint, so the processor and the model
# weights can never be loaded from two different model ids by accident.
VLM_MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

processor = AutoProcessor.from_pretrained(VLM_MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    VLM_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

In [None]:
def ask_about_image(image: Image.Image, question: str) -> str:
    """Send one image plus a text question to the loaded VLM and return its answer.

    Args:
        image: The image shown to the model.
        question: The text prompt that accompanies the image.

    Returns:
        The decoded text of the newly generated tokens (the prompt is sliced off).
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question},
        ],
    }

    prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )
    batch = processor(
        text=[prompt], images=[image], return_tensors="pt", padding=True
    ).to(model.device)

    with torch.inference_mode():
        sequences = model.generate(**batch, max_new_tokens=200, temperature=0.7)

    # Drop the leading prompt tokens so only the model's continuation is decoded.
    prompt_length = batch.input_ids.shape[1]
    answers = processor.batch_decode(
        sequences[:, prompt_length:], skip_special_tokens=True
    )
    return answers[0]


def _resize_to_height(image: Image.Image, height: int) -> Image.Image:
    """Return a copy of *image* scaled to exactly *height* pixels tall, keeping aspect ratio."""
    # resize() scales both up and down; thumbnail() would leave small images
    # untouched, so the two panels could end up with different heights.
    width = max(1, round(image.width * height / image.height))
    return image.resize((width, height))


def _make_side_by_side(
    left: Image.Image,
    right: Image.Image,
    height: int = 400,
    gap: int = 20
) -> Image.Image:
    """Return a white RGB canvas with *left* and *right* placed next to each other."""
    left_panel = _resize_to_height(left, height)
    right_panel = _resize_to_height(right, height)

    canvas_width = left_panel.width + gap + right_panel.width
    canvas = Image.new('RGB', (canvas_width, height), 'white')
    canvas.paste(left_panel, (0, 0))
    canvas.paste(right_panel, (left_panel.width + gap, 0))
    return canvas


def compare_images(
    image1: Image.Image, 
    image2: Image.Image, 
    purpose: str = "social media post"
) -> Dict:
    """
    Compare two images and recommend which is better for a given purpose.

    Each image is first described on its own. The two are then scaled to a
    common height, placed side by side (image1 on the left, image2 on the
    right), and the VLM is asked to compare the left and right halves.

    Args:
        image1: First image (shown on the left of the composite)
        image2: Second image (shown on the right of the composite)
        purpose: What the images will be used for

    Returns:
        Dictionary with the keys 'image1_description', 'image2_description',
        'comparison' (the VLM's free-text comparison) and 'purpose'.
    """
    describe_prompt = (
        "Describe this image in 2-3 sentences, noting its composition, colors, and mood."
    )
    results = {}

    # Get descriptions of each image
    print("Analyzing Image 1...")
    results['image1_description'] = ask_about_image(image1, describe_prompt)

    print("Analyzing Image 2...")
    results['image2_description'] = ask_about_image(image2, describe_prompt)

    # Build one composite so the model can look at both images at once.
    combined = _make_side_by_side(image1, image2, height=400, gap=20)

    # Ask for comparison
    print("Comparing images...")
    comparison_prompt = (
        "These are two images side by side - the left image and the right image.\n"
        "    \n"
        "Compare them and tell me:\n"
        "1. What are the main differences between them?\n"
        f"2. Which image would be better for {purpose} and why?\n"
        "3. What improvements could make each image better?\n"
        "\n"
        "Be specific and concise."
    )

    results['comparison'] = ask_about_image(combined, comparison_prompt)
    results['purpose'] = purpose

    return results

print("compare_images() function ready!")

In [None]:
# Exercise compare_images() on two synthetic sample images

from PIL import ImageDraw

# Image 1: bright and simple - a yellow disc (sun) on a light-blue background
img1 = Image.new('RGB', (400, 400), 'lightblue')
draw1 = ImageDraw.Draw(img1)
draw1.ellipse([100, 100, 300, 300], fill='yellow')

# Image 2: darker and busier - a white disc (moon) plus small white dots (stars)
img2 = Image.new('RGB', (400, 400), 'darkblue')
draw2 = ImageDraw.Draw(img2)
draw2.ellipse([250, 50, 350, 150], fill='white')
for star in range(20):
    left, top = (star * 20) % 400, (star * 17) % 400
    draw2.ellipse([left, top, left + 5, top + 5], fill='white')

# Run the comparison and print a small report
banner = "=" * 50
print("\n" + banner)
print("IMAGE COMPARISON RESULTS")
print(banner)

results = compare_images(img1, img2, purpose="a children's book cover")

for label, key in (
    ("Image 1", "image1_description"),
    ("Image 2", "image2_description"),
):
    print(f"\n{label} Description:")
    print(f"  {results[key]}")

print(f"\nComparison for '{results['purpose']}':")
print(f"  {results['comparison']}")

In [None]:
# Alternative approach: Using CLIP for similarity-based comparison

from transformers import CLIPProcessor, CLIPModel
import numpy as np

def compare_with_clip(
    image1: Image.Image,
    image2: Image.Image,
    criteria: list
) -> Dict:
    """
    Score two images against text criteria with CLIP and pick a winner for each.

    The CLIP model is loaded inside the call and released again before
    returning, even if inference fails. It runs in float16 on CUDA when a GPU
    is available and in float32 on the CPU otherwise.

    Args:
        image1: First image
        image2: Second image
        criteria: List of text descriptions to compare against

    Returns:
        Dictionary with three sub-dictionaries keyed by criterion:
        'image1_scores' and 'image2_scores' hold CLIP's image-text logits
        (similarity multiplied by the model's logit scale, so values are not
        limited to [-1, 1]); 'winner' holds 'Image 1' or 'Image 2'
        (ties go to 'Image 2').
    """
    results = {'image1_scores': {}, 'image2_scores': {}, 'winner': {}}

    prompts = list(criteria)
    if not prompts:
        # Nothing to score, so skip loading the model entirely.
        return results

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    # Half precision is only used on the GPU; CPU inference stays in float32.
    dtype = torch.float16 if use_cuda else torch.float32

    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    clip_model = CLIPModel.from_pretrained(
        "openai/clip-vit-large-patch14",
        torch_dtype=dtype
    ).to(device)

    try:
        # One batched pass: both images against every criterion, so the image
        # embeddings are computed once instead of once per criterion.
        encoded = clip_processor(
            text=prompts,
            images=[image1, image2],
            return_tensors="pt",
            padding=True
        )
        # Floating-point tensors (pixel values) must match the model dtype;
        # integer tensors (token ids, attention mask) only change device.
        inputs = {
            name: (
                tensor.to(device=device, dtype=dtype)
                if tensor.is_floating_point()
                else tensor.to(device)
            )
            for name, tensor in encoded.items()
        }

        with torch.inference_mode():
            # Shape (2, len(prompts)): one row per image, one column per criterion.
            logits = clip_model(**inputs).logits_per_image

        scores_image1, scores_image2 = logits.float().cpu().tolist()

        for criterion, score1, score2 in zip(prompts, scores_image1, scores_image2):
            results['image1_scores'][criterion] = score1
            results['image2_scores'][criterion] = score2
            results['winner'][criterion] = 'Image 1' if score1 > score2 else 'Image 2'
    finally:
        # Cleanup runs on success and on failure so the model never lingers in memory.
        del clip_model, clip_processor
        clear_gpu_memory()

    return results

# Run the CLIP-based comparison on the same two sample images
criteria = [
    "a bright and cheerful image",
    "a calm and peaceful scene",
    "suitable for children",
    "professional and polished"
]

clip_results = compare_with_clip(img1, img2, criteria)

# Print one small section per criterion: both scores, then the winner
print("\nCLIP-based comparison:")
print("-" * 50)
for criterion in criteria:
    first_score = clip_results['image1_scores'][criterion]
    second_score = clip_results['image2_scores'][criterion]
    winner = clip_results['winner'][criterion]
    print(
        f"\n{criterion}:\n"
        f"  Image 1: {first_score:.2f}\n"
        f"  Image 2: {second_score:.2f}\n"
        f"  Winner: {winner}"
    )

In [None]:
# Cleanup: drop the VLM and its processor, then run the shared memory cleanup
del model
del processor
clear_gpu_memory()
print("Solutions notebook complete!")