In [None]:
!pip uninstall -y torch torchaudio fastai

In [None]:
!pip install torch==2.6.0 torchvision==0.21.0

In [None]:
!pip install transformers

In [None]:
import transformers

# Report which transformers release is active in this kernel.
transformers_version = transformers.__version__
print(transformers_version)

In [None]:
import torch
import torchvision

# Report the torch / torchvision releases that are active in this kernel,
# so a mismatch with the pinned install is visible immediately.
torch_version = torch.__version__
torchvision_version = torchvision.__version__

print("Torch version:", torch_version)
print("Torchvision version:", torchvision_version)


In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Zero-shot caption selection with CLIP: score one local image against a fixed
# list of candidate captions and report the caption CLIP rates as the best match.

# Both the model and its processor are loaded from this single checkpoint name.
MODEL_NAME = "openai/clip-vit-base-patch32"

# Check if CUDA is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model and processor
model = CLIPModel.from_pretrained(MODEL_NAME)
model.to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

# Load the local image. The context manager closes the file handle as soon as
# convert() has produced an in-memory copy; converting to "RGB" means the
# processor always receives a 3-channel image, whatever the PNG's mode is.
image_path = "/content/generated_img.png"  # Path to your local image
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

# List of generic captions or descriptions (you can extend this as needed)
# For a more dynamic approach, you could generate more captions using a caption generation model.
text = [
    "a photo of a cat",
    "a picture of a dog",
    "a person riding a bike",
    "a beautiful landscape",
    "a close-up of a flower",
    "a group of people at a party",
    "a city skyline at sunset",
    "a car driving on a road"
]

# Preprocess the image and text to match CLIP input requirements
inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

# Move tensors to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Run CLIP without tracking gradients (inference only)
with torch.no_grad():
    outputs = model(**inputs)

# Image-text similarity logits, one row per image and one column per caption.
logits_per_image = outputs.logits_per_image

# Softmax over the caption axis turns the logits of the (single) image into a
# probability distribution over the candidate captions. These values are
# relative to this caption list and sum to 1; they are not absolute similarities.
# The text-to-image direction (logits_per_text) is intentionally not used:
# with a single image its softmax over the image axis is always 1.0.
caption_probs = logits_per_image.softmax(dim=-1)[0]

# Print similarity scores
print("Image to Text Similarity Scores:")
for caption, prob in zip(text, caption_probs.tolist()):
    print(f"{caption}: {prob:.4f}")

# Pick the best caption: argmax over the caption axis of the single image row,
# converted to a plain int so it is a valid list index.
best_caption_idx = caption_probs.argmax().item()
print("\nBest caption for the image:", text[best_caption_idx])