In [10]:
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPModel
import torch.nn.functional as F

In [11]:
stephen_hat = Image.open('me_with_hat.jpeg')
stephen_no_hat = Image.open('me_no_hat_cropped_1.jpeg')

In [12]:
device = torch.device("cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [13]:
with torch.no_grad():
    stephen_no_hat_inputs = processor(images=stephen_no_hat, return_tensors="pt").to(device)
    stephen_hat_inputs = processor(images=stephen_hat, return_tensors="pt").to(device)

    image_features_hat = model.get_image_features(**stephen_no_hat_inputs)
    image_features_no_hat = model.get_image_features(**stephen_hat_inputs)

    image_features_hat = F.normalize(image_features_hat, dim=-1)
    image_features_no_hat = F.normalize(image_features_no_hat, dim=-1)

In [14]:
def compute_similarity(text):
    with torch.no_grad():
        text_inputs = processor(text=[text], return_tensors="pt").to(device)
        text_features = model.get_text_features(**text_inputs)
        text_features = F.normalize(text_features, dim=-1)

        similarity_stephen_hat = (image_features_hat @ text_features.T).squeeze().item()
        similarity_stephen_no_hat = (image_features_no_hat @ text_features.T).squeeze().item()  
        
        print(f"Text: '{text}'")
        print(f"Cosine similarity with 'me_with_hat.jpeg': {similarity_stephen_hat:.4f}")
        print(f"Cosine similarity with 'me_no_hat_cropped_1.jpeg': {similarity_stephen_no_hat:.4f}")

In [16]:
compute_similarity('hat')

Text: 'hat'
Cosine similarity with 'me_with_hat.jpeg': 0.1775
Cosine similarity with 'me_no_hat_cropped_1.jpeg': 0.2286
