Import libraries

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.models import vgg19
from torchvision.models.feature_extraction import create_feature_extractor
from diffusers.utils import load_image
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIPImageProcessor,
)
import lpips

import error: No module named 'triton'


    PyTorch 2.6.0+cpu with CUDA None (you have 2.5.1+cu118)
    Python  3.10.16 (you have 3.10.16)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "c:\Users\tsamak\AppData\Local\anaconda3\envs\diffusers\lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
ModuleNotFoundError: No module named 'triton'


Load data

In [2]:
# Base directory for the data folders. NOTE: os.getcwd() is the kernel's
# current working directory, which is only the notebook's folder if the kernel
# was started there.
script_dir = os.getcwd()

# Folders holding the three image sets compared by the metrics below
image_folder = os.path.join(script_dir, "input_autodrive_small_onroad") # Original input frames
style_folder = os.path.join(script_dir, "style_autodrive_small_onroad") # Style reference frames
output_folder = os.path.join(script_dir, "output_autodrive_small_onroad") # Edited (generated) output frames

# File extensions (lower-case) that are treated as images
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')


def _list_image_files(folder):
    """Return the names of the image files in `folder`, sorted by name.

    os.listdir() yields entries in arbitrary order; sorting makes the listing
    deterministic so that index i refers to the same frame in every folder
    (assuming the folders use a consistent naming scheme).
    """
    return sorted(f for f in os.listdir(folder) if f.lower().endswith(IMAGE_EXTENSIONS))


# List all image files
image_files = _list_image_files(image_folder)
style_files = _list_image_files(style_folder)
output_files = _list_image_files(output_folder)

Define Similarity Metrics

In [3]:
class SimilarityScores(nn.Module):
    """Embedding-based similarity scores between two images and two captions.

    Images and captions are embedded with the supplied encoders, the embeddings
    are L2-normalised, and two scores are computed:

    * directional similarity: cosine similarity between the image-embedding
      change (image two minus image one) and the text-embedding change
      (caption two minus caption one);
    * cosine similarity between the two image embeddings.

    Inputs are moved to whatever device the corresponding encoder lives on, so
    the module works on CPU as well as on GPU.
    """

    def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
        """Store the tokenizer/processor and the text/image encoders."""
        super().__init__()
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.image_processor = image_processor
        self.image_encoder = image_encoder

    @staticmethod
    def _device_of(module):
        """Return the device of `module`'s first parameter (CPU if it has none)."""
        first_param = next(module.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def preprocess_image(self, image):
        """Run the image processor and place the pixel tensor on the image encoder's device."""
        image = self.image_processor(image, return_tensors="pt")["pixel_values"]
        return {"pixel_values": image.to(self._device_of(self.image_encoder))}

    def tokenize_text(self, text):
        """Tokenize `text`, padded/truncated to the tokenizer's maximum length."""
        inputs = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {"input_ids": inputs.input_ids.to(self._device_of(self.text_encoder))}

    def encode_image(self, image):
        """Return the L2-normalised image embedding of `image`."""
        preprocessed_image = self.preprocess_image(image)
        image_features = self.image_encoder(**preprocessed_image).image_embeds
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        return image_features

    def encode_text(self, text):
        """Return the L2-normalised text embedding of `text`."""
        tokenized_text = self.tokenize_text(text)
        text_features = self.text_encoder(**tokenized_text).text_embeds
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        return text_features

    def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
        """Cosine similarity between the image-feature change and the text-feature change."""
        sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
        return sim_direction

    def compute_cosine_similarity(self, img_feat_one, img_feat_two):
        """Cosine similarity between the two image feature tensors."""
        sim_cosine = F.cosine_similarity(img_feat_two, img_feat_one)
        return sim_cosine

    def forward(self, image_one, image_two, caption_one, caption_two):
        """Compute both scores for an image pair and its caption pair.

        Returns:
            A tuple `(directional_similarity, cosine_similarity)` of tensors.
        """
        img_feat_one = self.encode_image(image_one)
        img_feat_two = self.encode_image(image_two)
        text_feat_one = self.encode_text(caption_one)
        text_feat_two = self.encode_text(caption_two)
        directional_similarity = self.compute_directional_similarity(
            img_feat_one, img_feat_two, text_feat_one, text_feat_two
        )
        cosine_similarity = self.compute_cosine_similarity(
            img_feat_one, img_feat_two
        )
        return directional_similarity, cosine_similarity

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
clip_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the CLIP tokenizer, image processor and both projection encoders from
# the same checkpoint so text and image embeddings share one embedding space.
clip_id = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(clip_device)
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(clip_device)

# Callable metric: (image_one, image_two, caption_one, caption_two) -> scores
get_similarity_scores = SimilarityScores(tokenizer, text_encoder, image_processor, image_encoder)

Define LPIPS Metric

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
lpips_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LPIPS perceptual metric with the AlexNet backbone
lpips_model = lpips.LPIPS(net='alex').to(lpips_device)

# Preprocessing applied to both images before they are passed to LPIPS
lpips_transform = T.Compose([
    T.Resize((256, 256)), T.ToTensor(), # 256x256 tensor
    T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # [-1, 1] range
])

def get_lpips_score(original_image, edited_image):
    """Return the LPIPS score between two images as a Python float.

    Both images go through `lpips_transform`, get a batch dimension, and are
    moved to the device `lpips_model` lives on. Lower values mean the images
    are perceptually closer.

    Args:
        original_image: Reference image (presumably a PIL image, as returned
            by `load_image` — confirm against caller).
        edited_image: Image compared against the reference; same type.

    Returns:
        The mean of the model output, as a float.
    """
    # Follow the model's device rather than assuming CUDA.
    device = next(lpips_model.parameters()).device
    original_batch = lpips_transform(original_image).unsqueeze(0).to(device)
    edited_batch = lpips_transform(edited_image).unsqueeze(0).to(device)
    # Metric evaluation only: no autograd graph is needed.
    with torch.no_grad():
        return lpips_model(original_batch, edited_batch).mean().item()

Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: c:\Users\tsamak\AppData\Local\anaconda3\envs\diffusers\lib\site-packages\lpips\weights\v0.1\alex.pth


  self.load_state_dict(torch.load(model_path, map_location='cpu'), strict=False)


Define Style Difference Metric

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
vgg_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing applied to every image before it is passed to VGG
vgg_transform = T.Compose([
    T.Resize((256, 256)), T.ToTensor(), # 256x256 tensor
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]) # VGG norm range
])

# `pretrained=True` is deprecated in torchvision; `weights="IMAGENET1K_V1"`
# selects the same ImageNet checkpoint through the current API.
vgg = vgg19(weights="IMAGENET1K_V1").to(vgg_device).eval()

# Graph node name -> key under which its output is returned by the extractor
# NOTE(review): torchvision's VGG uses in-place ReLUs; if the ReLU that follows a
# tapped conv is still executed in the traced graph, the returned tensor may hold
# post-activation values — confirm which is intended.
layers = {'features.0': 'conv1_1', 'features.5': 'conv2_1', 'features.10': 'conv3_1'}
extractor = create_feature_extractor(vgg, return_nodes=layers)

def gram_matrix(feat):
    """Return the normalised Gram matrix of a batch of feature maps.

    Args:
        feat: Tensor of shape (b, c, h, w).

    Returns:
        Tensor of shape (b, c, c): for each batch element, the channel-by-channel
        inner products over all spatial positions, divided by c * h * w.
    """
    (b, c, h, w) = feat.size()
    # reshape (rather than view) also accepts non-contiguous inputs, copying
    # only when the spatial dimensions cannot be merged in place.
    flat = feat.reshape(b, c, h * w)
    G = torch.bmm(flat, flat.transpose(1, 2)) / (c * h * w)
    return G

def get_style_difference(original_image, style_image, edited_image):
    """Gram-matrix style differences of the edited and original image to a style image.

    For every layer listed in `layers`, the mean squared difference between the
    Gram matrices of two images is computed; the per-layer values are summed.

    Args:
        original_image: Input image before editing (presumably a PIL image, as
            returned by `load_image` — confirm against caller).
        style_image: Style reference image; same type.
        edited_image: Output image after editing; same type.

    Returns:
        A tuple `(o2s_gram_loss, i2s_gram_loss)`: the output-to-style and the
        input-to-style difference. Lower means closer in style.
    """
    # Follow the extractor's device rather than assuming CUDA.
    device = next(extractor.parameters()).device

    def _gram_per_layer(image):
        """Return {layer name: Gram matrix} for one image."""
        feats = extractor(vgg_transform(image).unsqueeze(0).to(device))
        return {layer: gram_matrix(feats[layer]) for layer in layers.values()}

    # Metric evaluation only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        in_grams = _gram_per_layer(original_image)
        out_grams = _gram_per_layer(edited_image)
        style_grams = _gram_per_layer(style_image)

    o2s_gram_loss = 0
    i2s_gram_loss = 0
    for layer in layers.values():
        G_style = style_grams[layer]
        o2s_gram_loss += torch.mean((out_grams[layer] - G_style) ** 2).item()
        i2s_gram_loss += torch.mean((in_grams[layer] - G_style) ** 2).item()
    return o2s_gram_loss, i2s_gram_loss



Metrics Evaluation on Data

In [6]:
# Per-frame metric values; one entry is appended per input/output pair
clipds_scores = []
cosine_scores = []
lpips_scores = []
o2s_style_scores = []
i2s_style_scores = []

# Fail early with a clear message instead of an IndexError inside the loop or a
# reduction over an empty list in the report.
if not image_files:
    raise ValueError(f"No input images found in {image_folder}")
if not style_files:
    raise ValueError(f"No style images found in {style_folder}")
if len(image_files) != len(output_files):
    raise ValueError(
        f"Cannot pair frames by position: {len(image_files)} input images "
        f"but {len(output_files)} output images"
    )

# Directory listings come back in arbitrary order; sort both lists so that the
# i-th input is paired with the i-th output deterministically.
# NOTE(review): this assumes input and output files share a naming scheme that
# sorts in the same frame order — confirm.
frame_pairs = list(zip(sorted(image_files), sorted(output_files)))

# Loop-invariant inputs: a single style reference (first by name) and fixed captions.
style_image = load_image(os.path.join(style_folder, sorted(style_files)[0]))
original_caption = "a black road on green surface in real world"
modified_caption = "a black road on green surface in simulation"

# Metric evaluation only: disable autograd for every model call in the loop.
with torch.no_grad():
    for image_name, output_name in frame_pairs:
        original_image = load_image(os.path.join(image_folder, image_name))
        edited_image = load_image(os.path.join(output_folder, output_name))
        clipds_score, cosine_score = get_similarity_scores(original_image, edited_image, original_caption, modified_caption)
        lpips_score = get_lpips_score(original_image, edited_image)
        o2s_gram_loss, i2s_gram_loss = get_style_difference(original_image, style_image, edited_image)
        clipds_scores.append(float(clipds_score.detach().cpu()))
        cosine_scores.append(float(cosine_score.detach().cpu()))
        lpips_scores.append(lpips_score)
        o2s_style_scores.append(o2s_gram_loss)
        i2s_style_scores.append(i2s_gram_loss)

# Summary report: the arrow marks whether higher (↑) or lower (↓) is better;
# "best" is the max or min accordingly, followed by mean and standard deviation.
print(f"↑ CLIP Directional Similarity:\t\t best={np.max(clipds_scores):.2e} | μ={np.mean(clipds_scores):.2e} | σ={np.std(clipds_scores):.2e}")
print(f"↑ Cosine Similarity (Input-to-Output):\t best={np.max(cosine_scores):.2e} | μ={np.mean(cosine_scores):.2e} | σ={np.std(cosine_scores):.2e}")
print(f"↓ LPIP Similarity (Input-to-Output):\t best={np.min(lpips_scores):.2e} | μ={np.mean(lpips_scores):.2e} | σ={np.std(lpips_scores):.2e}")
print(f"↓ Style Difference (Output-to-Style):\t best={np.min(o2s_style_scores):.2e} | μ={np.mean(o2s_style_scores):.2e} | σ={np.std(o2s_style_scores):.2e}")
print(f"↓ Style Difference (Input-to-Style):\t best={np.min(i2s_style_scores):.2e} | μ={np.mean(i2s_style_scores):.2e} | σ={np.std(i2s_style_scores):.2e}")
# Relative drop of the mean output-to-style difference versus the mean input-to-style difference
print(f"↑ Reduction in Style Difference:\t {-100*(np.mean(o2s_style_scores)-np.mean(i2s_style_scores))/np.mean(i2s_style_scores):.2f}%")

↑ CLIP Directional Similarity:		 best=2.09e-01 | μ=1.08e-01 | σ=3.47e-02
↑ Cosine Similarity (Input-to-Output):	 best=9.01e-01 | μ=8.15e-01 | σ=4.06e-02
↓ LPIP Similarity (Input-to-Output):	 best=2.42e-01 | μ=4.06e-01 | σ=7.22e-02
↓ Style Difference (Output-to-Style):	 best=6.32e-04 | μ=7.89e-04 | σ=1.15e-04
↓ Style Difference (Input-to-Style):	 best=1.05e-03 | μ=1.32e-03 | σ=1.31e-04
↑ Reduction in Style Difference:	 40.33%
