In [2]:
# Step 1: Installing CLIP

## GitHub Installation
# Clone the repository and install dependencies
!git clone https://github.com/openai/CLIP.git
%cd CLIP
!pip install -r requirements.txt

# Test the installation
import clip
print("CLIP is installed!")

## Hugging Face Installation
# Install transformers and PyTorch
!pip install transformers torch

from transformers import CLIPProcessor, CLIPModel

print("Hugging Face CLIP installed!")

# Step 2: Running a Demo

import clip
import torch
from PIL import Image

# Load the model
model, preprocess = clip.load("ViT-B/32", device="cpu")

# Load and preprocess an image
image = preprocess(Image.open(r"/Users/va/Documents/Documents - VA’s MacBook Air/Core/College/PreCog/CLIP/WhatsApp Image 2025-02-07 at 02.06.16.jpeg")).unsqueeze(0)
texts = clip.tokenize(["a dog", "a cat", "a car"])

# Run inference
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(texts)
    logits_per_image, logits_per_text = model(image, texts)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print("Label probabilities:", probs)

# Step 3: Image Search

images = [preprocess(Image.open(f"image_{i}.jpg")).unsqueeze(0) for i in range(5)]
images = torch.cat(images, dim=0)
text = clip.tokenize(["a photo of a cat"]).to("cpu")

with torch.no_grad():
    image_features = model.encode_image(images)
    text_features = model.encode_text(text)
    similarities = (image_features @ text_features.T).squeeze()
    best_match_idx = similarities.argmax().item()
    print(f"Best matching image is image_{best_match_idx}.jpg")

# Step 4: Using CLIP with Hugging Face

from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("path_to_image.jpg")
inputs = processor(text=["a cat", "a dog"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print("Label probabilities:", probs)


fatal: destination path 'CLIP' already exists and is not an empty directory.
/Users/va/Documents/Documents - VA’s MacBook Air/Core/College/PreCog/CLIP/CLIP
CLIP is installed!
Hugging Face CLIP installed!
Label probabilities: [[0.444472   0.16252272 0.39300525]]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/va/Documents/Documents - VA’s MacBook Air/Core/College/PreCog/CLIP/CLIP/image_0.jpg'

In [4]:
%cd CLIP
import torch
import clip
from PIL import Image

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load and preprocess image
image_path = "/Users/va/Documents/Documents - VA’s MacBook Air/Core/College/PreCog/CLIP/rome.jpg"  # Replace with your image path
try:
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
except FileNotFoundError:
    print(f"Error: File {image_path} not found.")
    exit()

# Generate diverse text descriptions
descriptions = [
   "roman", "greek", "egyptian", "medieval", "modern", "futuristic", "fantasy", "gothic", "victorian", "steampunk", 
   "church", "castle", "cathedral", "temple", "mosque", "synagogue", "shrine", "chapel", "monastery", "abbey","colosseum",
    "day", "night", "morning", "evening", "sunset", "sunrise", "dawn", "dusk", "twilight", "midnight",
    "summer", "winter", "spring", "autumn", "fall", "rainy", "sunny", "cloudy", "stormy", "foggy",
]

text_tokens = clip.tokenize(descriptions).to(device)

# Extract features
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

# Normalize embeddings
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Compute similarity
similarity = (image_features @ text_features.T).squeeze(0)

# Get best match
best_match_idx = similarity.argmax().item()
best_description = descriptions[best_match_idx]

print(f"CLIP thinks this image is: {best_description} (Similarity: {similarity[best_match_idx]:.4f})")

# # Print all similarities
# for desc, sim in zip(descriptions, similarity.tolist()):
#     print(f"{desc}: {sim:.4f}")

# sort the descriptions by similarity
sorted_descriptions = [desc for _, desc in sorted(zip(similarity.tolist(), descriptions), reverse=True)]
print("Sorted descriptions:")
for desc in sorted_descriptions:
    print(f"{desc}: {sim:.4f}")

[Errno 2] No such file or directory: 'CLIP'
/Users/va/Documents/Documents - VA’s MacBook Air/Core/College/PreCog/CLIP/CLIP/clip
CLIP thinks this image is: colosseum (Similarity: 0.3245)
Sorted descriptions:
colosseum: 0.1953
roman: 0.1953
castle: 0.1953
medieval: 0.1953
abbey: 0.1953
greek: 0.1953
gothic: 0.1953
cathedral: 0.1953
dawn: 0.1953
monastery: 0.1953
egyptian: 0.1953
day: 0.1953
temple: 0.1953
cloudy: 0.1953
modern: 0.1953
summer: 0.1953
fall: 0.1953
evening: 0.1953
rainy: 0.1953
futuristic: 0.1953
steampunk: 0.1953
stormy: 0.1953
fantasy: 0.1953
spring: 0.1953
autumn: 0.1953
sunrise: 0.1953
morning: 0.1953
synagogue: 0.1953
sunny: 0.1953
dusk: 0.1953
shrine: 0.1953
night: 0.1953
foggy: 0.1953
sunset: 0.1953
church: 0.1953
twilight: 0.1953
victorian: 0.1953
winter: 0.1953
midnight: 0.1953
chapel: 0.1953
mosque: 0.1953
