In [None]:
!pip install torch torchvision transformers clip-by-openai pillow

In [2]:
import torch
import clip
from PIL import Image

# Zero-shot classification of one image against a small set of text labels.

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.load returns the model together with its matching image transform.
model, preprocess = clip.load("ViT-B/32", device=device)

# Load and preprocess an image
image_path = "ex1.png"  # Change this to your image file
# Open inside a context manager so the file handle is released as soon as the
# pixels have been converted to a tensor; unsqueeze(0) adds the batch dimension.
with Image.open(image_path) as img:
    image = preprocess(img).unsqueeze(0).to(device)

# Define a set of class labels (custom categories for classification)
class_labels = ["cat", "dog", "car", "tree", "person", "airplane"]
# Each label is wrapped in a short prompt before tokenization.
prompts = [f"A photo of a {label}" for label in class_labels]
text_tokens = clip.tokenize(prompts).to(device)

# Encode the image and text descriptions (inference only, so no gradients)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

# Normalize features to unit length so the dot product below is a cosine similarity
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# One similarity score per class label; squeeze(0) drops the batch dimension
similarity = (image_features @ text_features.T).squeeze(0)  # Cosine similarity

# Find the best-matching class label
best_match_idx = similarity.argmax().item()
print(f"Predicted Class: {class_labels[best_match_idx]}")

# Display similarity scores for each class
for label, score in zip(class_labels, similarity.tolist()):
    print(f"{label}: {score:.4f}")


NameError: name 'clip' is not defined

In [None]:
! pip install open_clip_torch

In [3]:
import torch
from PIL import Image
import open_clip

# Zero-shot scoring of one image against a few text prompts with open_clip.

# Pick the device once and use it consistently for the model, the inputs and
# autocast; previously CUDA autocast was requested while everything stayed on
# the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model = model.to(device)
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Context manager closes the image file once it has been converted to a tensor;
# unsqueeze(0) adds the batch dimension.
with Image.open("ex1.png") as img:
    image = preprocess(img).unsqueeze(0).to(device)
text = tokenizer(["a diagram", "a dog", "a cat"]).to(device)

# Mixed precision is enabled only on CUDA; on CPU the computation stays in the
# model's own precision, as it did before.
with torch.no_grad(), torch.autocast(device_type=device, enabled=(device == "cuda")):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Unit-normalize so the matrix product below yields cosine similarities.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Scale the similarities by 100.0 and softmax across the prompts.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # one probability per prompt, in prompt order

open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]



Label probs: tensor([[7.1561e-04, 9.9927e-01, 1.7643e-05]])


In [6]:
# Inspect the shape of the encoded image features.
image_features.shape

torch.Size([1, 512])

In [7]:
# Inspect the shape of the encoded text features (one row per prompt).
text_features.shape

torch.Size([3, 512])

In [8]:
# Image-to-prompt similarities scaled by 100.0, before the softmax is applied.
torch.matmul(100.0 * image_features, text_features.T)

tensor([[19.5020, 26.7436, 15.7992]])

In [9]:
# Softmax over the last dimension turns the scaled similarities into
# per-prompt probabilities.
torch.softmax(100.0 * image_features @ text_features.T, dim=-1)

tensor([[7.1561e-04, 9.9927e-01, 1.7643e-05]])

In [10]:
import open_clip
# List the (architecture, pretrained-tag) pairs open_clip knows about,
# e.g. ('ViT-B-32', 'laion2b_s34b_b79k') as used above.
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_s13m_b4k'),
 ('ViT-