In [1]:
# CLIP Semantic Similarity Setup
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
import cv2

# 1. Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_proc  = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 2. Function to get 512-dim CLIP embedding from image
def get_clip_image_emb(image_path: str) -> np.ndarray:
    img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    inputs = clip_proc(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)
    emb = emb[0].cpu().numpy()
    return emb / (np.linalg.norm(emb) + 1e-10)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
# Example usage — CLIP global similarity
path1 = "image_pairs/boy1.jpg"
path2 = "image_pairs/boy2.jpg"

emb1 = get_clip_image_emb(path1)
emb2 = get_clip_image_emb(path2)

clip_sim = cosine_similarity([emb1], [emb2])[0][0]
print(f"CLIP Semantic Similarity: {clip_sim * 100:.2f}%")


CLIP Semantic Similarity: 87.76%


In [3]:
# Use YOLO to crop objects and get CLIP embeddings for each crop
from ultralytics import YOLO
import cv2

# Load YOLOv8 (if not already)
yolo_model = YOLO('yolov8n.pt')

def get_clip_crops_embeddings(image_path):
    """
    Detects objects using YOLO, returns list of CLIP embeddings for each object crop.
    """
    image = cv2.imread(image_path)
    results = yolo_model(image)

    crops = []
    for box in results[0].boxes.xyxy.cpu().numpy():
        x1, y1, x2, y2 = map(int, box)
        crop = image[int(y1):int(y2), int(x1):int(x2)]
        if crop.size == 0:
            continue
        # convert BGR to RGB for CLIP
        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        inputs = clip_proc(images=crop_rgb, return_tensors="pt").to(device)
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs)[0].cpu().numpy()
        emb = emb / (np.linalg.norm(emb) + 1e-10)
        crops.append(emb)

    return crops  # List of 512-dim np arrays


In [4]:
def compute_object_level_similarity(embs1, embs2):
    """
    Matches each embedding in embs1 to its most similar in embs2.
    Returns average best-match similarity.
    """
    if not embs1 or not embs2:
        return 0.0

    total_sim = 0
    for e1 in embs1:
        sims = [cosine_similarity([e1], [e2])[0][0] for e2 in embs2]
        total_sim += max(sims)  # best match
    avg_sim = total_sim / len(embs1)
    return avg_sim * 100  # convert to %


In [5]:
# Get crop-wise embeddings from both images
crop_embs1 = get_clip_crops_embeddings( "image_pairs/boy1.jpg")
crop_embs2 = get_clip_crops_embeddings( "image_pairs/boy2.jpg")

obj_level_sim = compute_object_level_similarity(crop_embs1, crop_embs2)
print(f"Object-Level Similarity: {obj_level_sim:.2f}%")



0: 640x544 1 person, 1 tie, 292.2ms
Speed: 12.5ms preprocess, 292.2ms inference, 14.1ms postprocess per image at shape (1, 3, 640, 544)

0: 640x416 1 person, 1 tie, 218.4ms
Speed: 5.2ms preprocess, 218.4ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 416)
Object-Level Similarity: 86.46%


In [6]:
final_similarity = 0.5 * clip_sim * 100 + 0.5 * obj_level_sim
print(f"Final Similarity Score: {final_similarity:.2f}%")


Final Similarity Score: 87.11%


In [7]:
# Siamese MLP head to learn similarity from CLIP embeddings
import torch.nn as nn
import torch

class SiameseHead(nn.Module):
    def __init__(self, embedding_dim=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim * 2, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()  # Output: similarity score between 0 and 1
        )

    def forward(self, emb1, emb2):
        x = torch.cat([emb1, emb2], dim=1)
        return self.net(x)


In [8]:
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
import os

class ImagePairDataset(Dataset):
    def __init__(self, csv_file, image_dir, processor, device):
        self.pairs_df = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.processor = processor
        self.device = device

    def __len__(self):
        return len(self.pairs_df)

    def __getitem__(self, idx):
        row = self.pairs_df.iloc[idx]
        path1 = os.path.join(self.image_dir, row['image1'])
        path2 = os.path.join(self.image_dir, row['image2'])
        label = float(row['similarity'])


        # Load both images
        img1 = Image.open(path1).convert("RGB")
        img2 = Image.open(path2).convert("RGB")

        # Use CLIP processor to get tensors
        inputs1 = self.processor(images=img1, return_tensors="pt")
        inputs2 = self.processor(images=img2, return_tensors="pt")

        # Get embeddings from frozen CLIP
        with torch.no_grad():
            emb1 = clip_model.get_image_features(**{k: v.to(self.device) for k, v in inputs1.items()})
            emb2 = clip_model.get_image_features(**{k: v.to(self.device) for k, v in inputs2.items()})

        # Normalize
        emb1 = emb1 / (emb1.norm(dim=1, keepdim=True) + 1e-10)
        emb2 = emb2 / (emb2.norm(dim=1, keepdim=True) + 1e-10)

        return emb1.squeeze(0), emb2.squeeze(0), torch.tensor([label], dtype=torch.float32)


In [9]:
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Hyperparameters
batch_size = 8
epochs = 5
learning_rate = 1e-3

# Dataset and DataLoader
dataset = ImagePairDataset("pairs.csv", "image_pairs", clip_proc, device)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the Siamese head and move to device
siamese_head = SiameseHead().to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(siamese_head.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    siamese_head.train()
    running_loss = 0.0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for emb1, emb2, labels in pbar:
        emb1, emb2, labels = emb1.to(device), emb2.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = siamese_head(emb1, emb2).squeeze()  # output shape: (batch_size)
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pbar.set_postfix(loss=running_loss / (pbar.n + 1))
    
    print(f"Epoch {epoch+1} Loss: {running_loss / len(dataloader):.4f}")

print("Training complete ✅")


Epoch 1/5: 100%|██████████| 13/13 [00:35<00:00,  2.72s/it, loss=0.656]


Epoch 1 Loss: 0.6555


Epoch 2/5: 100%|██████████| 13/13 [00:35<00:00,  2.77s/it, loss=0.593]


Epoch 2 Loss: 0.5935


Epoch 3/5: 100%|██████████| 13/13 [00:35<00:00,  2.69s/it, loss=0.567]


Epoch 3 Loss: 0.5674


Epoch 4/5: 100%|██████████| 13/13 [00:35<00:00,  2.73s/it, loss=0.558]


Epoch 4 Loss: 0.5575


Epoch 5/5: 100%|██████████| 13/13 [00:35<00:00,  2.73s/it, loss=0.537]

Epoch 5 Loss: 0.5367
Training complete ✅





In [10]:
def predict_similarity(img_path1, img_path2):
    img1 = Image.open(img_path1).convert("RGB")
    img2 = Image.open(img_path2).convert("RGB")

    inputs1 = clip_proc(images=img1, return_tensors="pt").to(device)
    inputs2 = clip_proc(images=img2, return_tensors="pt").to(device)

    with torch.no_grad():
        emb1 = clip_model.get_image_features(**inputs1)
        emb2 = clip_model.get_image_features(**inputs2)

    emb1 = emb1 / (emb1.norm(dim=1, keepdim=True) + 1e-10)
    emb2 = emb2 / (emb2.norm(dim=1, keepdim=True) + 1e-10)

    siamese_head.eval()
    with torch.no_grad():
        sim_score = siamese_head(emb1, emb2).item()

    print(f"Similarity: {sim_score * 100:.2f}%")
    return sim_score * 100


In [11]:
dataset = ImagePairDataset("pairs.csv", "image_pairs", clip_proc, device)
emb1, emb2, label = dataset[0]
print(emb1.shape, emb2.shape, label)


torch.Size([512]) torch.Size([512]) tensor([1.])


In [12]:
dataset = ImagePairDataset("pairs.csv", "image_pairs", clip_proc, device)
emb1, emb2, label = dataset[0]
print(emb1.shape, emb2.shape, label)


torch.Size([512]) torch.Size([512]) tensor([1.])


In [13]:
sim_score = predict_similarity("image_pairs/boy1.jpg", "image_pairs/boy1_zoomed.jpg")
print(f"Predicted similarity: {sim_score:.2f}%")


Similarity: 35.97%
Predicted similarity: 35.97%


In [14]:
def combined_similarity(img_path1, img_path2, weight_siamese=0.5, weight_yolo=0.5):
    # Siamese similarity
    sim_score_siamese = predict_similarity(img_path1, img_path2)
    
    # YOLO crop similarity
    crop_embs1 = get_clip_crops_embeddings(img_path1)
    crop_embs2 = get_clip_crops_embeddings(img_path2)
    obj_level_sim = compute_object_level_similarity(crop_embs1, crop_embs2)
    
    # Weighted average
    combined_score = weight_siamese * sim_score_siamese + weight_yolo * obj_level_sim
    return combined_score

# Example usage:
score = combined_similarity("image_pairs/boy1.jpg", "image_pairs/boy1_zoomed.jpg")
print(f"Combined similarity score: {score:.2f}%")


Similarity: 35.97%

0: 640x544 1 person, 1 tie, 369.5ms
Speed: 47.8ms preprocess, 369.5ms inference, 29.0ms postprocess per image at shape (1, 3, 640, 544)

0: 640x384 1 person, 236.9ms
Speed: 5.1ms preprocess, 236.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 384)
Combined similarity score: 57.21%
