In [1]:
import torch
import open_clip
from PIL import Image


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the CLIP model, its tokenizer, and the image preprocessing transform.
model_name = "ViT-B-32"          # alternatives: ViT-L-14, ViT-B-16, etc.
pretrained_dataset = "openai"    # alternatives: "laion400m_e32", "laion2b_s34b_b79k"

# Three values are returned; the middle one is not needed here and is discarded,
# the last one is used as the image transform for inference.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name, pretrained=pretrained_dataset
)
tokenizer = open_clip.get_tokenizer(model_name)

# Move to the compute device and switch to eval mode: a freshly created model is
# in training mode, which keeps train-only behaviour (e.g. BatchNorm updates,
# stochastic depth) active during inference. Chained in an assignment so the
# cell does not echo the module repr.
model = model.to(device).eval()



In [4]:
# Load the test image and turn it into a one-image batch on `device`.
image_path = "clip_test2.jpg"  # replace with your own file

# The context manager releases the file handle deterministically once
# `preprocess` has consumed the pixels, instead of relying on lazy cleanup.
with Image.open(image_path) as pil_image:
    # unsqueeze(0) adds the leading batch dimension.
    image = preprocess(pil_image).unsqueeze(0).to(device)

In [5]:
# Candidate scene labels; the image is scored against each of them below.
scene_prompts = [
    "beach", "forest", "mountain",
    "office", "classroom", "street",
    "cafeteria", "kitchen", "living room",
    "wedding", "concert", "sports event",
    "park", "desert", "cityscape",
]

# Tokenize every label, then move the resulting batch to the compute device.
text = tokenizer(scene_prompts)
text = text.to(device)


In [6]:
# Embed the image and the prompts, then turn their scaled cosine similarities
# into per-prompt probabilities. No gradients are tracked for this pass.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Rescale each embedding to unit norm along the feature axis.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Scaled similarity logits -> softmax over the prompts -> NumPy array on CPU.
    similarity = torch.matmul(100.0 * image_features, text_features.T)
    probs = torch.softmax(similarity, dim=-1).cpu().numpy()


In [8]:
import numpy as np

# Print every scene label next to its probability (as a percentage) for the
# first image in the batch.
for scene, scene_prob in zip(scene_prompts, probs[0]):
    print(f"{scene:30s} -> {scene_prob*100:.2f}%")


beach                          -> 0.10%
forest                         -> 1.37%
mountain                       -> 93.03%
office                         -> 0.90%
classroom                      -> 0.49%
street                         -> 0.18%
cafeteria                      -> 0.09%
kitchen                        -> 0.10%
living room                    -> 0.22%
wedding                        -> 0.07%
concert                        -> 0.13%
sports event                   -> 1.62%
park                           -> 0.80%
desert                         -> 0.78%
cityscape                      -> 0.12%


In [10]:
# Show only the highest-scoring scenes (at most five).
# Clamp k to the number of prompts: torch.topk raises when k exceeds the size
# of the dimension it searches, which would happen with a shorter prompt list.
top_k = min(5, probs.shape[-1])
top_probs, top_labels = torch.topk(torch.tensor(probs), top_k)

print(f"\nTop {top_k} Scene Predictions:")
# Row 0 holds the results for the first (only) image in the batch.
for prob, label in zip(top_probs[0], top_labels[0]):
    print(f"{scene_prompts[int(label)]:30s} -> {prob*100:.2f}%")


Top 5 Scene Predictions:
mountain                       -> 93.03%
sports event                   -> 1.62%
forest                         -> 1.37%
office                         -> 0.90%
park                           -> 0.80%


In [12]:
# Top-1 prediction for the first image in the batch.
# Take the argmax over that image's row: np.argmax on the whole 2-D array
# returns an index into the flattened array, which is only a valid prompt
# index while the batch holds exactly one image.
top1_idx = int(np.argmax(probs[0]))
print(f"{scene_prompts[top1_idx]:30s} -> {probs[0][top1_idx]*100:.2f}%")

mountain                       -> 93.03%


In [13]:
# Report the top prediction; when its probability does not exceed the
# threshold, prefix the line to flag the result as uncertain.
threshold = 0.4  # 40%
top1_line = f"{scene_prompts[top1_idx]:30s} -> {probs[0][top1_idx]*100:.2f}%"
prefix = "" if probs[0][top1_idx] > threshold else "Unsure but top prediction is: "
print(prefix + top1_line)

mountain                       -> 93.03%


In [None]:
# Collect the image files for batch prediction from the "images" folder
# under the current working directory.
import os

image_folder = os.path.join(os.getcwd(), "images")
# os.listdir returns entries in arbitrary order; sort so the batch is processed
# in the same order on every run. Matching on the extension is case-insensitive.
image_files = sorted(
    f for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)
