# Q3

In [None]:
# %pip install transformers torchvision torch

## 1. & 2.

In [4]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Load model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

captions = {}

# Loop images from 1 to 10 (Renamed)
for i in range(1, 11):
    image_path = f"{image_dir}/{i}.jpg"
    try:
        image = Image.open(image_path).convert("RGB")
        # Preprocess
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(**inputs)
            caption = processor.decode(output[0], skip_special_tokens=True)

        captions[f"{i}.jpg"] = caption
        print(f"{i}.jpg: {caption}")
    except Exception as e:
        print(f"Failed to process {i}.jpg: {e}")



1.jpg: a dog is walking on a green carpet
2.jpg: a small dog running across a green field
3.jpg: the girls in the pool
4.jpg: a bird perched on a plant
5.jpg: a small dog standing on a stone ledge next to a pool
6.jpg: a man riding a bike down the street
7.jpg: a brown butterfly sitting on a green plant
8.jpg: a man in a suit and tie sitting on a couch
9.jpg: a duck drinking water from a pond
10.jpg: a coffee machine with a cup of coffee


## 3.

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os

# Load model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Hardcoded BLIPgenerated captions from previous cell
captions = {
    "1.jpg": "a dog is walking on a green carpet",
    "2.jpg": "a small dog running across a green field",
    "3.jpg": "the girls in the pool",
    "4.jpg": "a bird perched on a plant",
    "5.jpg": "a small dog standing on a stone ledge next to a pool",
    "6.jpg": "a man riding a bike down the street",
    "7.jpg": "a brown butterfly sitting on a green plant",
    "8.jpg": "a man in a suit and tie sitting on a couch",
    "9.jpg": "a duck drinking water from a pond",
    "10.jpg": "a coffee machine with a cup of coffee"
}

# Image folder path
image_dir = "Images"

similarity_scores = {}

# Loop through images (Renamed)
for i in range(1, 11):
    fname = f"{i}.jpg"
    caption = captions[fname]
    image_path = f"{image_dir}/{fname}"

    image = Image.open(image_path).convert("RGB")

    # Process image
    inputs = processor(text=[caption], images=image, return_tensors="pt", padding=True).to(device)

    # Get CLIP embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings
    image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

    # Compute cosine similarity
    similarity = (image_embeds @ text_embeds.T).squeeze().item()
    similarity_scores[fname] = similarity

    print(f"{fname}: Similarity Score = {similarity:.4f}")

1.jpg: Similarity Score = 0.3008
2.jpg: Similarity Score = 0.3295
3.jpg: Similarity Score = 0.2696
4.jpg: Similarity Score = 0.2800
5.jpg: Similarity Score = 0.3225
6.jpg: Similarity Score = 0.2648
7.jpg: Similarity Score = 0.2965
8.jpg: Similarity Score = 0.2876
9.jpg: Similarity Score = 0.3037
10.jpg: Similarity Score = 0.2816


## 4.

In [6]:
import torch
import torch.nn.functional as F
from PIL import Image
import os
from open_clip import create_model_from_pretrained, get_tokenizer

# Load model and processor
model, preprocess = create_model_from_pretrained('hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-Recap-DataComp-1B')
tokenizer = get_tokenizer('hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-Recap-DataComp-1B')

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Hardcoded BLIPgenerated captions from previous cell
captions = {
    "1.jpg": "a dog is walking on a green carpet",
    "2.jpg": "a small dog running across a green field",
    "3.jpg": "the girls in the pool",
    "4.jpg": "a bird perched on a plant",
    "5.jpg": "a small dog standing on a stone ledge next to a pool",
    "6.jpg": "a man riding a bike down the street",
    "7.jpg": "a brown butterfly sitting on a green plant",
    "8.jpg": "a man in a suit and tie sitting on a couch",
    "9.jpg": "a duck drinking water from a pond",
    "10.jpg": "a coffee machine with a cup of coffee"
}

# Image folder path
image_dir = "Images"

similarity_scores = {}

# Loop through images (Renamed)
for i in range(1, 11):
    fname = f"{i}.jpg"
    caption = captions[fname]
    image_path = f"{image_dir}/{fname}"

    image = Image.open(image_path).convert("RGB")
    image_tensor = preprocess(image).unsqueeze(0).to(device)

    # Tokenize caption
    text = tokenizer([caption], context_length=model.context_length).to(device)

    # Encode and compute similarity
    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model.encode_image(image_tensor)
        text_features = model.encode_text(text)
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)

        # Compute cosine similarity
        similarity = (image_features @ text_features.T).squeeze().item()

    similarity_scores[fname] = similarity

    print(f"{fname}: Similarity Score = {similarity:.4f}")

  with torch.no_grad(), torch.cuda.amp.autocast():


1.jpg: Similarity Score = 0.1849
2.jpg: Similarity Score = 0.2014
3.jpg: Similarity Score = 0.1510
4.jpg: Similarity Score = 0.1666
5.jpg: Similarity Score = 0.1887
6.jpg: Similarity Score = 0.1526
7.jpg: Similarity Score = 0.1739
8.jpg: Similarity Score = 0.1298
9.jpg: Similarity Score = 0.1705
10.jpg: Similarity Score = 0.1330
