In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by
# path from later cells (Colab-only; requires interactive authorisation).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

In [None]:
import torch
from torch.nn import functional as F
from IPython.display import display
from PIL import Image
import numpy as np
import requests
import matplotlib.pyplot as plt
from io import BytesIO

In [None]:
# Hugging Face checkpoint identifier shared by the model, processor and tokenizer.
model_name = "openai/clip-vit-base-patch32"

# Load the three components from the same checkpoint, in this order:
# the CLIP model, its combined processor, and its text tokenizer.
model, processor, tokenizer = [
    component.from_pretrained(model_name)
    for component in (CLIPModel, CLIPProcessor, CLIPTokenizer)
]

In [None]:
# Candidate captions to embed with the CLIP text encoder.
text = [
    "a donut",
    "a cookie",
    "an airplane",
    "a cat",
]

# Tokenize the whole batch at once; padding=True lets prompts of different
# lengths be stacked into a single PyTorch tensor.
tokens = tokenizer(text, return_tensors="pt", padding=True)

# Keep the token-id matrix on its own for inspection and for the text encoder.
inputs = tokens["input_ids"]
print(inputs)

tensor([[49406,   320, 18471, 49407],
        [49406,   320,  9367, 49407],
        [49406,   550, 16451, 49407],
        [49406,   320,  2368, 49407]])


In [None]:
# Embed the prompts with CLIP's text encoder. Gradients are disabled because
# this is inference only.
with torch.no_grad():
    # Forward the complete tokenizer output (input_ids AND attention_mask)
    # instead of input_ids alone, so padded positions are explicitly masked
    # when prompts differ in length. This also mirrors how the image features
    # are requested (the processor output is unpacked the same way).
    outputs = model.get_text_features(**tokens)

# One embedding row per prompt.
print(outputs.shape)
print(outputs)

torch.Size([4, 512])
tensor([[ 0.3581,  0.0311, -0.2988,  ..., -0.4201, -0.5272,  0.0181],
        [ 0.1526, -0.0538, -0.0021,  ..., -0.3003, -0.3579, -0.0010],
        [ 0.1899,  0.1223,  0.1641,  ..., -0.0940, -0.0714, -0.2662],
        [ 0.1981, -0.2040, -0.1533,  ..., -0.4514, -0.5664,  0.0596]])


In [None]:
# Two broadcast-ready views of the text embeddings:
#   out1 gains a singleton axis at position 1,
#   out2 gains a singleton axis at position 0.
# Pairing them lets a later element-wise op compare every row with every row.
out1 = outputs[:, None, :]
out2 = outputs[None, :, :]

# Show shape and contents of each view, out1 first and then out2.
for view in (out1, out2):
    print(view.shape)
    print(view)

torch.Size([4, 1, 512])
tensor([[[ 0.3581,  0.0311, -0.2988,  ..., -0.4201, -0.5272,  0.0181]],

        [[ 0.1526, -0.0538, -0.0021,  ..., -0.3003, -0.3579, -0.0010]],

        [[ 0.1899,  0.1223,  0.1641,  ..., -0.0940, -0.0714, -0.2662]],

        [[ 0.1981, -0.2040, -0.1533,  ..., -0.4514, -0.5664,  0.0596]]])
torch.Size([1, 4, 512])
tensor([[[ 0.3581,  0.0311, -0.2988,  ..., -0.4201, -0.5272,  0.0181],
         [ 0.1526, -0.0538, -0.0021,  ..., -0.3003, -0.3579, -0.0010],
         [ 0.1899,  0.1223,  0.1641,  ..., -0.0940, -0.0714, -0.2662],
         [ 0.1981, -0.2040, -0.1533,  ..., -0.4514, -0.5664,  0.0596]]])


In [None]:
# Pairwise cosine similarity between all text embeddings: out1 and out2
# broadcast against each other and the last (feature) axis is reduced,
# leaving one score per (prompt, prompt) pair.
text_similarity = F.cosine_similarity(out1, out2, dim=2)

# Move to host memory and convert to NumPy for compact printing.
sim = text_similarity.cpu().numpy()
print(sim)

[[0.99999994 0.8890481  0.81164634 0.8415476 ]
 [0.8890481  1.         0.81958157 0.8454813 ]
 [0.81164634 0.81958157 1.         0.850893  ]
 [0.8415476  0.8454813  0.850893   0.9999998 ]]


In [None]:
# Folder on the mounted Google Drive that holds the sample images.
dataset_dir = '/content/drive/My Drive/Clip tokenization dataset'

# Full path of each sample image.
cat = f'{dataset_dir}/cat.jpg'
donut = f'{dataset_dir}/Donut.jpg'
airplane = f'{dataset_dir}/United_Airlines_Boeing_777-200_Meulemans.jpg'
cookie = f'{dataset_dir}/Cookie.jpg'

In [None]:
import cv2


def _read_rgb(path):
    """Read an image file with OpenCV and return it as an RGB array.

    cv2.imread yields channels in BGR order and returns None (rather than
    raising) when the file cannot be read. The CLIP processor expects RGB
    input, so the channels are reordered here, and an unreadable path is
    reported immediately instead of failing later inside the processor.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        FileNotFoundError: If OpenCV could not read an image from `path`.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


# Load each sample image as an RGB array.
cat_img = _read_rgb(cat)
donut_img = _read_rgb(donut)
airplane_img = _read_rgb(airplane)
cookie_img = _read_rgb(cookie)

# Batch order used for the image embeddings: cat, donut, airplane, cookie.
imgs = [cat_img, donut_img, airplane_img, cookie_img]

In [None]:
# Re-create the CLIP processor from the same checkpoint name. This repeats the
# load done alongside the model, so it only matters if this cell is run alone.
processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# Run the image batch through the CLIP processor and request PyTorch tensors;
# the result is a mapping that includes 'pixel_values'.
img_inputs = processor(return_tensors="pt", images=imgs)
print(img_inputs)

{'pixel_values': tensor([[[[-1.1061e+00, -1.1061e+00, -1.1061e+00,  ...,  9.0935e-02,
           -1.1255e-02,  3.3439e-03],
          [-1.1061e+00, -1.1061e+00, -1.1061e+00,  ...,  3.2541e-02,
           -5.5050e-02, -4.0451e-02],
          [-1.1207e+00, -1.1353e+00, -1.1353e+00,  ..., -4.0451e-02,
           -1.2804e-01, -1.1344e-01],
          ...,
          [ 7.0407e-01,  7.4786e-01,  7.7706e-01,  ...,  4.8509e-01,
            4.8509e-01,  4.7049e-01],
          [ 7.0407e-01,  7.9166e-01,  8.6465e-01,  ...,  4.2670e-01,
            4.2670e-01,  4.1210e-01],
          [ 7.3327e-01,  8.0626e-01,  8.6465e-01,  ...,  4.4130e-01,
            4.4130e-01,  4.1210e-01]],

         [[-6.7154e-01, -6.7154e-01, -6.7154e-01,  ...,  6.9417e-01,
            6.3414e-01,  6.3414e-01],
          [-6.7154e-01, -6.7154e-01, -6.7154e-01,  ...,  6.3414e-01,
            5.8911e-01,  5.8911e-01],
          [-6.8655e-01, -6.8655e-01, -6.8655e-01,  ...,  5.5910e-01,
            5.1408e-01,  5.1408e-01],
   

In [None]:
# Embed the preprocessed images with CLIP's vision encoder. Gradients are
# switched off because this is inference only.
with torch.no_grad():
    img_outputs = model.get_image_features(**img_inputs)

# Print the embedding matrix's shape followed by its values.
for summary in (img_outputs.shape, img_outputs):
    print(summary)

torch.Size([4, 512])
tensor([[ 0.3258, -0.4045, -0.4520,  ...,  0.2177, -0.2530,  0.0418],
        [ 0.7556, -0.0325, -0.5319,  ...,  0.2967, -0.1567,  0.0729],
        [ 0.2070,  0.2214, -0.3405,  ...,  0.5639, -0.1325, -0.5436],
        [-0.0400,  0.0076, -0.0899,  ...,  0.6723,  0.1510, -0.1595]])


In [None]:
# Cross-modal cosine similarity between text and image embeddings.
# out1 carries the text embeddings along axis 0, while the image embeddings
# get a leading singleton axis, so after broadcasting sim[i, j] compares
# text prompt i with image j.
# The prompt order (donut, cookie, airplane, cat) differs from the image order
# (cat, donut, airplane, cookie), so matching pairs are NOT on the diagonal.
# Note: this rebinds `sim`, replacing the text-to-text matrix computed earlier.
image_embeddings = img_outputs[None, ...]
sim = F.cosine_similarity(image_embeddings, out1, dim=2).cpu().numpy()
print(sim)

[[0.20861381 0.30822664 0.1594847  0.23292162]
 [0.21102326 0.25331    0.16852753 0.28615817]
 [0.19609761 0.20570849 0.24124888 0.1800846 ]
 [0.26188153 0.19082767 0.16480058 0.18845837]]
