In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPModel, AutoImageProcessor, AutoModel
import torchvision.models as models
import torchvision.transforms as T
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from skimage.metrics import structural_similarity as ssim, peak_signal_noise_ratio as psnr
from sklearn.metrics import mean_squared_error
import lpips

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

# Load CLIP model and processor
processor_clip = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

# Load DINOv2 model and processor
processor_dino = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
model_dino = AutoModel.from_pretrained('facebook/dinov2-base').to(device)

# Load VGG16 model
vgg16 = models.vgg16(pretrained=True).features.to(device).eval()

# Load LPIPS model
lpips_model = lpips.LPIPS(net='vgg').to(device)

# Image transformation for VGG16 and LPIPS
vgg_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

lpips_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor()
])

# Function to extract CLIP features
def extract_features_clip(image):
    with torch.no_grad():
        inputs = processor_clip(images=image, return_tensors="pt").to(device)
        image_features = model_clip.get_image_features(**inputs)
        return image_features

# Function to extract DINOv2 features
def extract_features_dino(image):
    with torch.no_grad():
        inputs = processor_dino(images=image, return_tensors="pt").to(device)
        outputs = model_dino(**inputs)
        image_features = outputs.last_hidden_state
        return image_features.mean(dim=1)

# Function to extract VGG16 features
def extract_features_vgg16(image):
    image = vgg_transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = vgg16(image).flatten(1)
    return features

# Function to calculate cosine similarity between two feature vectors
def cosine_similarity(vector1, vector2):
    vector1 = vector1 / vector1.norm(dim=-1, keepdim=True)
    vector2 = vector2 / vector2.norm(dim=-1, keepdim=True)
    return torch.sum(vector1 * vector2, dim=-1).item()

# Function to compute SSIM between two images
def compute_ssim(image1, image2):
    image1_np = np.array(image1)
    image2_np = np.array(image2)
    return ssim(image1_np, image2_np, multichannel=True)

# Function to compute MSE between two images
def compute_mse(image1, image2):
    image1_np = np.array(image1).astype(np.float32)
    image2_np = np.array(image2).astype(np.float32)
    return mean_squared_error(image1_np.flatten(), image2_np.flatten())

# Function to compute LPIPS (1 - LPIPS for similarity score)
def compute_lpips(image1, image2):
    # Transform images for LPIPS
    img1_tensor = lpips_transform(image1).unsqueeze(0).to(device)
    img2_tensor = lpips_transform(image2).unsqueeze(0).to(device)
    with torch.no_grad():
        lpips_score = lpips_model(img1_tensor, img2_tensor).item()
    return 1 - lpips_score  # Since LPIPS measures distance, we use 1 - LPIPS for similarity

# Function to compute PSNR between two images
def compute_psnr(image1, image2):
    image1_np = np.array(image1).astype(np.float32)
    image2_np = np.array(image2).astype(np.float32)
    return psnr(image1_np, image2_np)

# Retrieve all filenames from the two directories
def get_image_paths(directory):
    images = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('jpg'):
                images.append(os.path.join(root, file))
    images.sort()  # Ensure images are in the same order in both directories
    return images

# Resize images to the minimum dimensions between two images
def resize_to_minimum(img1, img2):
    min_width = min(img1.width, img2.width)
    min_height = min(img1.height, img2.height)
    img1_resized = img1.resize((min_width, min_height))
    img2_resized = img2.resize((min_width, min_height))
    return img1_resized, img2_resized

# Paths to the two directories containing the images
dir1 = './images_dataset/'
dir2 = './images_recon/'

# Get the sorted image paths for both directories
images1 = get_image_paths(dir1)
images2 = get_image_paths(dir2)

# Ensure both directories have the same number of images
assert len(images1) == len(images2), "The directories do not contain the same number of images."

# Initialize a list to store results
results = []

# Iterate over the paired images and calculate similarity scores
for img1_path, img2_path in tqdm(zip(images1, images2), total=len(images1), desc="Calculating similarities"):
    # Open images
    img1 = Image.open(img1_path).convert('RGB')
    img2 = Image.open(img2_path).convert('RGB')
    
    # Resize images to the minimum dimensions
    img1_resized, img2_resized = resize_to_minimum(img1, img2)
    
    # Extract features using CLIP
    clip_features1 = extract_features_clip(img1_resized).to(device)
    clip_features2 = extract_features_clip(img2_resized).to(device)
    clip_similarity = cosine_similarity(clip_features1, clip_features2)
    
    # Extract features using DINOv2
    dino_features1 = extract_features_dino(img1_resized).to(device)
    dino_features2 = extract_features_dino(img2_resized).to(device)
    dino_similarity = cosine_similarity(dino_features1, dino_features2)
    
    # Extract features using VGG16
    vgg_features1 = extract_features_vgg16(img1_resized).to(device)
    vgg_features2 = extract_features_vgg16(img2_resized).to(device)
    vgg_similarity = cosine_similarity(vgg_features1, vgg_features2)
    
    # Compute SSIM, MSE, LPIPS, and PSNR
    ssim_score = compute_ssim(img1_resized, img2_resized)
    mse_score = compute_mse(img1_resized, img2_resized)
    lpips_similarity = compute_lpips(img1_resized, img2_resized)
    psnr_score = compute_psnr(img1_resized, img2_resized)
    
    # Store results
    results.append({
        "image1": os.path.relpath(img1_path, start='.'),
        "image2": os.path.relpath(img2_path, start='.'),
        "CLIP_Similarity": clip_similarity,
        "DINOv2_Similarity": dino_similarity,
        "VGG16_Similarity": vgg_similarity,
        "SSIM": ssim_score,
        "MSE": mse_score,
        "1-LPIPS": lpips_similarity,
        "PSNR": psnr_score
    })

# Convert results to a DataFrame
df_results = pd.DataFrame(results)

# Save the DataFrame to a CSV file
df_results.to_csv("userstudy2.csv", index=False)

print("Similarity metrics saved to image_similarity_metrics.csv")




Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]
Loading model from: c:\Users\Danish\AppData\Local\Programs\Python\Python310\lib\site-packages\lpips\weights\v0.1\vgg.pth


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Calculating similarities:   0%|          | 0/87 [00:01<?, ?it/s]


ValueError: win_size exceeds image extent. Either ensure that your images are at least 7x7; or pass win_size explicitly in the function call, with an odd value less than or equal to the smaller side of your images. If your images are multichannel (with color channels), set channel_axis to the axis number corresponding to the channels.