In [3]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

import cv2
from urllib.request import urlopen

import numpy as np

In [4]:
# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


In [5]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"


image_url = url
resp = urlopen(image_url)
image = np.asarray(bytearray(resp.read()), dtype="uint8")
image = cv2.imdecode(image, cv2.IMREAD_COLOR) # The image object

# image = Image.open()
# image = cv2.imread(f'http://images.cocodataset.org/val2017/000000039769.jpg')

In [6]:
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image

array([[[140,  25,  56],
        [144,  25,  67],
        [146,  24,  73],
        ...,
        [ 94,  16,  38],
        [107,  13,  39],
        [102,  10,  33]],

       [[138,  22,  57],
        [142,  26,  49],
        [139,  20,  48],
        ...,
        [103,  11,  36],
        [115,  17,  42],
        [ 96,  13,  31]],

       [[135,  22,  42],
        [150,  33,  59],
        [142,  23,  53],
        ...,
        [103,   8,  32],
        [108,  19,  39],
        [ 93,  10,  26]],

       ...,

       [[237, 100, 190],
        [225,  84, 196],
        [236,  96, 203],
        ...,
        [171,  47, 131],
        [181,  62, 144],
        [147,  28, 110]],

       [[230,  84, 221],
        [226,  80, 213],
        [238,  99, 202],
        ...,
        [114,  24,  62],
        [103,   5,  46],
        [ 89,   9,  44]],

       [[238, 100, 175],
        [246, 109, 191],
        [238,  96, 214],
        ...,
        [ 74,  13,  29],
        [ 74,  25,  44],
        [ 73,  17,  42]]

In [7]:
inputs = processor(text=["2 cats lay down"], images=[image], return_tensors="pt", padding=True)

In [8]:
inputs

{'input_ids': tensor([[49406,   273,  3989,  6360,  1136, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]]), 'pixel_values': tensor([[[[ 0.5873,  0.5873,  0.6165,  ...,  0.0617,  0.0471, -0.0259],
          [ 0.5727,  0.5727,  0.6603,  ...,  0.1201,  0.0763,  0.0909],
          [ 0.5873,  0.5435,  0.6165,  ...,  0.0325,  0.1201,  0.0617],
          ...,
          [ 1.8719,  1.8573,  1.8719,  ...,  1.3902,  1.4340,  1.4194],
          [ 1.8281,  1.8719,  1.8427,  ...,  1.4486,  1.4340,  1.5070],
          [ 1.8573,  1.9011,  1.8281,  ...,  1.3756,  1.3610,  1.4486]],

         [[-1.3169, -1.3019, -1.3169,  ..., -1.4970, -1.4369, -1.4820],
          [-1.2418, -1.2718, -1.2268,  ..., -1.4369, -1.4669, -1.4519],
          [-1.2568, -1.3169, -1.2268,  ..., -1.4669, -1.4069, -1.4519],
          ...,
          [ 0.1239,  0.1089,  0.1239,  ..., -0.7016, -0.6865, -0.6865],
          [ 0.0789,  0.0939,  0.0488,  ..., -0.6565, -0.6865, -0.6115],
          [ 0.0939,  0.1089,  0.0038,  ..., 

In [9]:
outputs = model(**inputs)

In [10]:
outputs

CLIPOutput(loss=None, logits_per_image=tensor([[23.9463]], grad_fn=<PermuteBackward0>), logits_per_text=tensor([[23.9463]], grad_fn=<MulBackward0>), text_embeds=tensor([[-2.2753e-02,  3.1830e-02,  1.3560e-02, -3.7931e-02,  1.1542e-02,
         -1.1366e-02,  2.5364e-03, -7.4427e-03, -1.7145e-02, -6.2180e-04,
         -1.9085e-02,  2.1888e-02,  2.2970e-02, -3.8137e-03, -2.1129e-02,
         -3.3269e-03,  1.4753e-02,  3.7427e-03, -8.0159e-02, -1.1758e-02,
          1.4601e-02,  3.3486e-03,  1.3011e-03,  2.6309e-02,  6.1090e-03,
         -1.9453e-02, -3.6612e-02, -2.6868e-02, -1.8844e-02,  3.9488e-02,
          1.0533e-02,  1.3004e-02, -1.8712e-03, -4.2649e-02, -1.9809e-02,
         -1.9988e-02, -6.4555e-03,  2.5556e-02, -2.2376e-02,  1.2114e-02,
          2.6171e-02,  2.6744e-02,  1.0830e-03, -8.4166e-03, -3.6670e-03,
          6.6346e-03,  8.4956e-03,  3.4751e-02,  9.5155e-03,  3.0342e-02,
          3.1735e-03,  4.7542e-02, -1.3414e-02,  2.8706e-02,  5.3146e-03,
          5.0429e-03,  3.

In [11]:
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score

In [12]:
probs = logits_per_image.softmax(dim=1)

In [13]:
probs

tensor([[1.]], grad_fn=<SoftmaxBackward0>)