In [1]:
!pip install transformers torchvision torch pillow
!pip install -U torchmetrics

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [14]:
import os
# Generated images are stored as 1.jpg .. 10.jpg; build the path list in order.
images_paths = [f'/content/images/{index}.jpg' for index in range(1, 11)]
print(images_paths)

# Text prompts, index-aligned with images_paths (prompts[i] pairs with images_paths[i]).
# NOTE(review): "relistic" in the ninth prompt looks like a typo; it is kept verbatim
# because the string is fed to CLIP and editing it would change the score -- confirm
# against the prompt that was actually used to generate 9.jpg.
prompts = [
    "A glowing crystal forest under a violet sky, bioluminescent trees with mist swirling around the roots, fantasy style, ultra-detailed, magical atmosphere.",
    "A futuristic cyberpunk city at night, neon signs reflecting on wet streets, people in tech-enhanced outfits, flying cars above, rain falling, Blade Runner aesthetic",
    "A fierce medieval warrior in armor, holding a sword, battle-worn and intense, dramatic lighting, hyperrealistic, Rembrandt style portrait",
    "A cozy cottage in the countryside during spring, surrounded by blooming flowers, sunlight streaming through trees, soft pastel tones, watercolor illustration style",
    "A breathtaking alien landscape with two moons in the sky, strange flora and massive rock formations, an explorer in a space suit, cinematic lighting",
    "generate image of a frog holding a board with 'hi' written on it",
    "A realistic waterfall surrounded by lush green forests and fog in the air under natural daylight.",
    "generate image of a realistic dog.",
    "generate image of a relistic cat.",
    "A narrow forest path winds through a lush, sunlit jungle with vibrant green foliage and beams of light filtering through the canopy.",
]

print(prompts)

['/content/images/1.jpg', '/content/images/2.jpg', '/content/images/3.jpg', '/content/images/4.jpg', '/content/images/5.jpg', '/content/images/6.jpg', '/content/images/7.jpg', '/content/images/8.jpg', '/content/images/9.jpg', '/content/images/10.jpg']
['A glowing crystal forest under a violet sky, bioluminescent trees with mist swirling around the roots, fantasy style, ultra-detailed, magical atmosphere.', 'A futuristic cyberpunk city at night, neon signs reflecting on wet streets, people in tech-enhanced outfits, flying cars above, rain falling, Blade Runner aesthetic', 'A fierce medieval warrior in armor, holding a sword, battle-worn and intense, dramatic lighting, hyperrealistic, Rembrandt style portrait', 'A cozy cottage in the countryside during spring, surrounded by blooming flowers, sunlight streaming through trees, soft pastel tones, watercolor illustration style', 'A breathtaking alien landscape with two moons in the sky, strange flora and massive rock formations, an explorer 

In [15]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy
# Choose the compute device: CUDA when torch reports it available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the processor and the model from the same pretrained checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint)

# Place the model on the selected device.
clip_model.to(device)

def calculate_clip_scores(image_paths, prompts):
    """
    Score each image against its paired text prompt with the loaded CLIP model.

    image_paths: list of image file paths
    prompts: list of text prompts (same length as image_paths);
        prompts[i] is scored against image_paths[i]

    Returns a list with one float per (image, prompt) pair, rounded to 4
    decimals. Each value is the model's `logits_per_image` entry for the pair:
    the image-text similarity multiplied by CLIP's learned logit scale, so it
    is NOT a raw cosine similarity in [-1, 1].

    Raises ValueError if image_paths and prompts differ in length.
    """
    # Materialise both inputs so the length check also works for generators.
    image_paths = list(image_paths)
    prompts = list(prompts)
    if len(image_paths) != len(prompts):
        # zip() would silently drop the unmatched tail and skew the average.
        raise ValueError(
            f"image_paths and prompts must have the same length "
            f"(got {len(image_paths)} and {len(prompts)})"
        )

    scores = []

    for img_path, prompt in zip(image_paths, prompts):
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy that stays valid after the file is closed.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        # truncation=True keeps over-long prompts within the text encoder's
        # maximum sequence length instead of failing inside the model.
        inputs = clip_processor(
            text=[prompt],
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = clip_model(**inputs)

        # One image and one prompt -> a 1x1 logits tensor; .item() extracts it.
        score = outputs.logits_per_image.item()
        scores.append(round(score, 4))

    return scores




In [16]:
# Score every image against its paired prompt, then report per-pair and mean values.
clip_score = calculate_clip_scores(images_paths, prompts)
average_clip_score = numpy.mean(clip_score)

print(f"CLIP score: {clip_score}")
print(f"Average CLIP score: {average_clip_score}")

CLIP score: [35.1079, 32.498, 28.955, 35.1339, 34.3453, 36.8377, 31.692, 31.5466, 28.2592, 31.3092]
Average CLIP score: 32.56848
