In [None]:
from functools import partial

import torch
from torchmetrics.functional.multimodal import clip_score

# Bind the CLIP checkpoint once so every scoring call uses the same model.
clip_score_fn = partial(
    clip_score,
    model_name_or_path="openai/clip-vit-base-patch16",
)

def calculate_clip_score(images, prompts):
    """Return the CLIP score for a batch of images and their prompts.

    Args:
        images: NumPy array that is scaled by 255, cast to ``uint8`` and then
            permuted with ``(0, 3, 1, 2)`` before scoring. Presumably a
            channels-last float batch in [0, 1] -- TODO confirm with the caller.
        prompts: Text prompts, passed unchanged to ``clip_score_fn``.

    Returns:
        The score as a Python ``float`` rounded to 4 decimal places.
    """
    # Clip before the cast: float values outside the uint8 range do not cast
    # predictably, so slightly out-of-range pixels would corrupt the image.
    images_uint8 = (images * 255).clip(0, 255).astype("uint8")
    # Move the last axis to position 1 before handing the batch to the metric.
    image_batch = torch.from_numpy(images_uint8).permute(0, 3, 1, 2)
    # Named `score` so the imported `clip_score` function is not shadowed.
    score = clip_score_fn(image_batch, prompts).detach()
    return round(float(score), 4)

# Score the batch against its prompts and print the rounded result.
# NOTE(review): `images` and `prompts` are not defined in the visible cells --
# confirm an earlier cell creates them before this one runs.
sd_clip_score = calculate_clip_score(images, prompts)
print(f"CLIP score: {sd_clip_score}")
# Output recorded in the original notebook (presumably from the author's run):
# CLIP score: 35.7038

In [5]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# One checkpoint id is shared by the preprocessor and the model weights.
model_id = "openai/clip-vit-base-patch32"
# Preprocessor that turns images into model inputs.
processor = CLIPProcessor.from_pretrained(model_id)
# CLIP model used to compute the image embeddings.
model = CLIPModel.from_pretrained(model_id)

# Function to load and preprocess an image
def preprocess_image(image_path):
    """Open the image at ``image_path`` and run it through ``processor``.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The result of ``processor(images=..., return_tensors="pt")`` for
        this image.
    """
    # The context manager closes the file handle as soon as the processor
    # has read the pixels, instead of leaving it to garbage collection.
    with Image.open(image_path) as image:
        return processor(images=image, return_tensors="pt")

# Load and preprocess the two images.
# Raw strings keep the Windows backslashes literal: in a normal string "\C",
# "\p", "\e", "\i" and "\o" are invalid escape sequences, which newer Python
# versions warn about. The resulting path values are unchanged.
image_a_path = r"D:\Code\py\evaluator_pipeline\in.jpg"  # Replace with the path to your first image
image_b_path = r"D:\Code\py\evaluator_pipeline\out_lineart+canny.png"  # Replace with the path to your second image
image_a_inputs = preprocess_image(image_a_path)
image_b_inputs = preprocess_image(image_b_path)

# Calculate the image embeddings; no_grad disables gradient tracking for
# this inference-only pass.
with torch.no_grad():
    image_a_embeddings = model.get_image_features(**image_a_inputs)
    image_b_embeddings = model.get_image_features(**image_b_inputs)

# Normalize each embedding to unit length along the feature dimension.
image_a_embeddings = image_a_embeddings / image_a_embeddings.norm(dim=-1, keepdim=True)
image_b_embeddings = image_b_embeddings / image_b_embeddings.norm(dim=-1, keepdim=True)

# Calculate the cosine similarity between the two image embeddings.
similarity_score = torch.nn.functional.cosine_similarity(image_a_embeddings, image_b_embeddings)

# Print the similarity score as a plain Python float.
print(f"CLIP Score: {similarity_score.item()}")

CLIP Score: 0.7782825827598572


CLIP Score: 0.7435868978500366