## p17 to p27

### p17

In [1]:
import torch
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the CLIP weights, move them to `device`, and switch to inference mode.
# .to() and .eval() each return the module itself, so `model` ends up bound to
# the same object as a separate `model.eval()` call would leave it.
model = (
    CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    .to(device)
    .eval()
)
# The matching preprocessor is loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
# "Man" example: a photo paired with the caption used to describe it.
# `image` and `text` are consumed by the feature-extraction cells that follow.
image = Image.open('me_no_hat_cropped_1.jpeg')
text = "A photo of a man"
# image  # uncomment to preview the photo inline

In [5]:
# Preprocess the photo into PyTorch tensors and move them to `device`.
inputs = processor(images=image, return_tensors="pt").to(device)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    image_features_man = model.get_image_features(**inputs)
    # Optional L2 normalization, currently disabled, so the raw features are kept:
    # image_features_man = image_features_man / image_features_man.norm(dim=-1, keepdim=True)

In [6]:
# Peek at the first 10 components of the image embedding, together with its shape.
image_features_man[0,:10], image_features_man.shape

(tensor([ 0.3799, -0.0205, -0.3645,  0.3117, -0.3376, -0.2418,  0.1636,  0.8491,
          0.2491,  0.0771], device='cuda:0'),
 torch.Size([1, 512]))

In [7]:
# Preprocess the caption into PyTorch tensors and move them to `device`.
# Note: this rebinds `inputs`, replacing the image inputs from the earlier cell.
inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    text_features_man = model.get_text_features(**inputs)
    # Optional L2 normalization, currently disabled, so the raw features are kept:
    # text_features_man = text_features_man / text_features_man.norm(dim=-1, keepdim=True)

In [8]:
# Peek at the first 10 components of the text embedding, together with its shape.
text_features_man[0,:10], text_features_man.shape

(tensor([-0.2643,  0.3246, -0.0228,  0.2032, -0.0099, -0.2975, -0.1399, -1.0689,
         -0.0102,  0.3058], device='cuda:0'),
 torch.Size([1, 512]))

In [9]:
# "Cat" example: same steps as for the man photo, in a single cell.
# Note: rebinds the shared names `image`, `text`, and `inputs`.
image = Image.open('n02123045_1955.jpg')
text = "A photo of a cat"

# Image branch: preprocess, move to `device`, embed without gradient tracking.
inputs = processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
    image_features_cat = model.get_image_features(**inputs)
    # Optional L2 normalization, currently disabled:
    # image_features_cat = image_features_cat / image_features_cat.norm(dim=-1, keepdim=True)

# Text branch: `inputs` is overwritten with the preprocessed caption.
inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    text_features_cat = model.get_text_features(**inputs)
    # Optional L2 normalization, currently disabled:
    # text_features_cat = text_features_cat / text_features_cat.norm(dim=-1, keepdim=True)

In [10]:
# "Dog" example: same steps as for the man photo, in a single cell.
# Note: rebinds the shared names `image`, `text`, and `inputs`.
image = Image.open('n02099601_7101.jpg')
text = "A photo of a dog"

# Image branch: preprocess, move to `device`, embed without gradient tracking.
inputs = processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
    image_features_dog = model.get_image_features(**inputs)
    # Optional L2 normalization, currently disabled:
    # image_features_dog = image_features_dog / image_features_dog.norm(dim=-1, keepdim=True)

# Text branch: `inputs` is overwritten with the preprocessed caption.
inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    text_features_dog = model.get_text_features(**inputs)
    # Optional L2 normalization, currently disabled:
    # text_features_dog = text_features_dog / text_features_dog.norm(dim=-1, keepdim=True)

In [11]:
# Compare every text embedding (row i) with every image embedding (column j).
# Both axes use the same order: cat, dog, man.
cos_similarities=np.zeros((3,3))
dot_products=np.zeros((3,3))
prods=[]  # prods[i][j]: element-wise product of text i and image j as a 16x32 grid

for i, feature_text in enumerate([text_features_cat, text_features_dog, text_features_man]):
    prods.append([])
    for j, feature_image in enumerate([image_features_cat, image_features_dog, image_features_man]):
        # cosine_similarity yields a one-element tensor for a single (image, text)
        # pair; .item() extracts the Python float explicitly (as done for
        # dot_products below) instead of relying on NumPy to coerce a tensor
        # that may live on the GPU.
        cos_similarities[i,j]=torch.cosine_similarity(feature_image, feature_text).item()

        # Element-wise product; its sum is the unnormalized dot product.
        prod=feature_image*feature_text
        # prod=(feature_image-ave_image_features)*(feature_text-ave_text_features) #Try removing mean to make differences more clear?

        dot_products[i,j]=prod.sum().item()
        # reshape(16,32) assumes 512 feature components per pair (16*32 = 512).
        prods[-1].append(prod.detach().cpu().numpy().reshape(16,32))

In [12]:
# Rows are captions, columns are images, both in the order cat, dog, man.
cos_similarities

array([[0.27838621, 0.19343476, 0.19544804],
       [0.22083758, 0.26263022, 0.20560792],
       [0.21663226, 0.20955765, 0.25059876]])

In [13]:
# Unnormalized dot products; rows are captions, columns are images (cat, dog, man).
dot_products

array([[33.3657341 , 21.1536808 , 21.51506996],
       [27.03812408, 29.33909607, 23.12074661],
       [27.89419556, 24.62025833, 29.63657379]])

In [14]:
# Norm of the raw image embedding along the feature axis (normalization was left disabled above).
image_features_man.norm(dim=-1)

tensor([10.5075], device='cuda:0')

In [15]:
# Norm of the raw text embedding along the feature axis (normalization was left disabled above).
text_features_man.norm(dim=-1)

tensor([11.2551], device='cuda:0')