In [2]:
# Install the notebook's dependencies: PyTorch + torchvision, OpenAI CLIP (straight
# from GitHub), SciPy and gensim.
# NOTE(review): versions are not pinned, and `!pip` runs in a subshell that may not
# target the running kernel's environment -- consider `%pip install` with pinned versions.
!pip install torch torchvision
!pip install git+https://github.com/openai/CLIP.git
!pip install scipy
!pip install gensim

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/cz/d_k7_tfn267938ylcg0bywxr0000gn/T/pip-req-build-b5p36idu
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/cz/d_k7_tfn267938ylcg0bywxr0000gn/T/pip-req-build-b5p36idu
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone


In [21]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import clip

from PIL import Image
import os

import torch
import torch.nn.functional as F

import numpy as np
from gensim.models import Word2Vec
import gensim.downloader as api

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [22]:
# CLIP ViT-B/32 loaded onto `device`: `model` is used below for encode_image/encode_text,
# `preprocess` is the image transform applied before encode_image.
model, preprocess = clip.load("ViT-B/32", device=device)
# word2vec vectors fetched through gensim's downloader under the name "word2vec-google-news-300".
w2v_model = api.load("word2vec-google-news-300")
# torchvision ResNet-50 requested with pretrained weights.
# NOTE(review): unlike CLIP, this model is not moved to `device` anywhere in the visible
# cells, so it runs wherever torchvision creates it (presumably the CPU) -- confirm this is intended.
resnet50_model = models.resnet50(pretrained=True)
# Put the network in evaluation mode; as the cell's last expression this also displays the model repr.
resnet50_model.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [23]:
def get_image_embedding(image):
    """Encode a single image with the CLIP image encoder.

    Args:
        image: Input accepted by the module-level CLIP ``preprocess`` transform
            (callers in this notebook pass an opened PIL image).

    Returns:
        The result of ``model.encode_image`` for a batch containing only this
        image, computed with gradient tracking disabled.
    """
    # Transform the image, add a leading batch axis, then move it to the target device.
    batch = preprocess(image)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        return model.encode_image(batch)

def get_text_embedding(text):
    """Encode a single string with the CLIP text encoder.

    Args:
        text: The string to embed; it is tokenized as a one-element batch.

    Returns:
        The result of ``model.encode_text`` for that one-element batch,
        computed with gradient tracking disabled.
    """
    token_batch = clip.tokenize([text])
    token_batch = token_batch.to(device)

    with torch.no_grad():
        text_features = model.encode_text(token_batch)
    return text_features

def get_text_embedding_word2vec(text):
    """Average the word2vec vectors of the whitespace-separated words in ``text``.

    Words that are not keys of ``w2v_model.key_to_index`` are skipped.

    Args:
        text: A word or phrase; it is split on whitespace.

    Returns:
        A torch tensor holding the element-wise mean of the vectors that were
        found, or ``None`` when none of the words is in the vocabulary.
    """
    vocabulary = w2v_model.key_to_index
    known_vectors = [
        w2v_model.get_vector(token)
        for token in text.split()
        if token in vocabulary
    ]

    # Nothing to average: signal this to the caller with None.
    if not known_vectors:
        return None

    mean_vector = np.array(known_vectors).mean(axis=0)
    return torch.tensor(mean_vector)

def get_images_from_folder(folder_path):
    """Return the CLIP embedding of every image file directly inside ``folder_path``.

    Despite the name, the returned list holds embeddings (the results of
    ``get_image_embedding``), not PIL images.

    Files are selected by extension (.jpg, .jpeg, .png, matched
    case-insensitively) and visited in sorted filename order, so repeated runs
    process the images in the same order.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A list with one embedding per matching file; empty when nothing matches.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    embeddings = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(folder_path, filename)
        # The context manager releases the file handle as soon as the
        # embedding has been computed.
        with Image.open(image_path) as image:
            embeddings.append(get_image_embedding(image))

    return embeddings

[tensor([[-2.8911e-01,  1.4448e-01, -2.8148e-01,  4.9361e-01,  6.0725e-01,
          -3.8749e-01,  8.5073e-02,  2.2460e-01,  8.4439e-01, -2.6256e-01,
          -1.2461e-02,  4.8880e-01,  7.5721e-01, -4.5405e-01, -9.1167e-02,
           1.2125e-01, -4.1563e-01,  1.8238e-01,  4.6077e-01,  8.8293e-02,
          -2.6824e-01, -1.0760e-01,  6.1279e-02, -2.0765e-01,  3.3849e-01,
          -8.9128e-02,  3.4414e-01, -3.5913e-01, -2.9704e-01, -1.7267e-01,
           2.8080e-01,  5.1674e-01,  2.6837e-01,  1.6358e-02,  3.0169e-01,
           5.4760e-02,  1.6023e-01, -5.4611e-02,  9.9827e-02, -9.7591e-02,
           3.0211e-01, -1.4098e-01,  8.8350e-02, -2.4714e-01, -4.4875e-01,
           5.6828e-01, -5.4467e-01,  2.1776e-01, -2.9209e-01,  2.7036e-01,
           1.3923e-01,  3.3794e-01,  3.6831e-01, -9.8649e-02, -6.8640e-01,
          -3.6855e-02, -6.2725e-01,  3.3968e-01,  8.0286e-02, -7.9475e-02,
           5.6304e-02, -6.7567e-01,  4.3627e-01,  9.0249e-02, -2.5391e-01,
          -5.7115e-02, -1

In [29]:
# Resnet image embedding 

# Preprocessing pipeline applied to every image before it is fed to ResNet-50.
preprocess_image = transforms.Compose([
    transforms.Resize(256),                   # Resize so the shorter side is 256 px (int size keeps the aspect ratio; not 256x256)
    transforms.CenterCrop(224),               # Center crop image to 224x224
    transforms.ToTensor(),                    # Convert PIL image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Per-channel normalization (presumably the usual ImageNet statistics)
])

def get_resnet_image_embedding(image_path):
    """Run one image file through ``resnet50_model`` and return the network output.

    The image is loaded as RGB, transformed with ``preprocess_image`` and passed
    through the complete model with gradient tracking disabled.

    NOTE(review): the whole network is applied, so the returned "embedding" is
    the model's final output (presumably the classifier layer for a stock
    torchvision ResNet-50) rather than an intermediate feature vector --
    confirm this is what the analysis intends.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The output of ``resnet50_model`` for a batch containing only this image.
    """
    # Open inside a context manager so the file handle is closed right away;
    # convert() yields an in-memory image that stays valid after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Add the leading batch axis expected by the model.
    preprocessed_image = preprocess_image(image).unsqueeze(0)

    # Forward pass without building an autograd graph.
    with torch.no_grad():
        image_features = resnet50_model(preprocessed_image)

    return image_features

# def get_resnet_images_from_folder(folder_path):
#     images = []

#     for filename in os.listdir(folder_path):
#         # Check if the file is an image file
#         if filename.endswith(".jpg") or filename.endswith(".png") or filename.endswith(".jpeg"):
#             # Open the image using PIL
#             image_path = os.path.join(folder_path, filename)
#             image = Image.open(image_path)
#             # Append the image to the list
#             images.append(get_resnet_image_embedding(image))

#     return images

def get_resnet_images_from_folder(folder_path):
    """Return the ResNet output for every image file directly inside ``folder_path``.

    Despite the name, the returned list holds the results of
    ``get_resnet_image_embedding`` (which opens each file itself), not images
    or paths.

    Files are selected by extension (.jpg, .jpeg, .png, matched
    case-insensitively) and visited in sorted filename order, so repeated runs
    process the images in the same order.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A list with one embedding per matching file; empty when nothing matches.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    embeddings = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if filename.lower().endswith(image_extensions):
            image_path = os.path.join(folder_path, filename)
            embeddings.append(get_resnet_image_embedding(image_path))

    return embeddings

In [46]:
from scipy.stats import spearmanr

def find_image_embedding_arithmetics(pair1,pair2):
    """Score the analogy ``a:b :: c:d`` with averaged CLIP image embeddings.

    For each of the four concepts the embeddings of all images in
    ``new_images/<concept>`` are averaged. The analogy vector ``b - a + c`` is
    then compared with ``d`` twice: once on the raw mean embeddings and once
    after L2-normalizing each mean embedding.

    Args:
        pair1: ``"a:b"`` -- the source pair.
        pair2: ``"c:d"`` -- the target pair.

    Returns:
        A 6-tuple ``(angle_before, rho_before, angle_after, rho_after,
        offset_vector, target)`` where the angles are in degrees (tensors), the
        rhos are Spearman correlation coefficients, and ``offset_vector`` /
        ``target`` are the analogy vector and ``d`` embedding from the
        normalized computation.

    Raises:
        ValueError: If a concept folder yields no image embeddings.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')

    mean_embeddings = []
    for concept in (first_pair[0], first_pair[1], second_pair[0], second_pair[1]):
        folder = os.path.join("new_images", concept)
        per_image = get_images_from_folder(folder)
        if not per_image:
            # torch.stack on an empty list fails with an unhelpful message.
            raise ValueError(f"No image embeddings found for {concept!r} in {folder!r}")
        # Stack the per-image results, drop the singleton batch axis, then average over images.
        stacked = torch.squeeze(torch.stack(per_image, dim=0), dim=1)
        mean_embeddings.append(torch.mean(stacked, dim=0))

    first_image_embeddings, second_image_embeddings, third_image_embeddings, fourth_image_embeddings = mean_embeddings

    # --- Raw (un-normalized) mean embeddings ---
    offset_vector_embedding = second_image_embeddings - first_image_embeddings + third_image_embeddings

    cos_sim = F.cosine_similarity(offset_vector_embedding, fourth_image_embeddings, dim=0)

    # Rounding can push the cosine marginally outside [-1, 1], where acos returns NaN.
    cos_image_before_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim, -1.0, 1.0)))

    print("Angle for Image embedding before normalizing: Averaging ", cos_image_before_normalization)

    rho_image_before_normalization, p_value = spearmanr(offset_vector_embedding.cpu().numpy(), fourth_image_embeddings.cpu().numpy())
    print("Spearman's correlation coefficient for Image embedding before normalizing:", rho_image_before_normalization)

    # --- L2-normalized mean embeddings ---
    first_image_embeddings = F.normalize(first_image_embeddings, p=2, dim=0)
    second_image_embeddings = F.normalize(second_image_embeddings, p=2, dim=0)
    third_image_embeddings = F.normalize(third_image_embeddings, p=2, dim=0)
    fourth_image_embeddings = F.normalize(fourth_image_embeddings, p=2, dim=0)

    offset_vector_embedding = second_image_embeddings - first_image_embeddings + third_image_embeddings

    cos_sim = F.cosine_similarity(offset_vector_embedding, fourth_image_embeddings, dim=0)

    cos_image_after_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim, -1.0, 1.0)))

    print("Angle for Image embedding after normalizing: Averaging ", cos_image_after_normalization)

    rho_image_after_normalization, p_value = spearmanr(offset_vector_embedding.cpu().numpy(), fourth_image_embeddings.cpu().numpy())
    print("Spearman's correlation coefficient for Image embedding after normalizing:", rho_image_after_normalization)

    return cos_image_before_normalization, rho_image_before_normalization,cos_image_after_normalization, rho_image_after_normalization, offset_vector_embedding, fourth_image_embeddings


def find_text_embedding_arithmetics(pair1,pair2):
    """Score the analogy ``a:b :: c:d`` with CLIP text embeddings.

    The analogy vector ``b - a + c`` is compared with ``d`` twice: once on the
    raw embeddings and once after L2-normalizing each embedding.

    Args:
        pair1: ``"a:b"`` -- the source pair.
        pair2: ``"c:d"`` -- the target pair.

    Returns:
        A 7-tuple ``(cos_sim_init, angle_before, rho_before, angle_after,
        rho_after, offset_vector, target)``: the raw cosine similarity before
        normalization, the angles in degrees (tensors), the Spearman
        correlation coefficients, and the analogy vector / ``d`` embedding from
        the normalized computation.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')

    # Each embedding comes back with a leading batch axis of size 1; drop it.
    first = get_text_embedding(first_pair[0]).squeeze(0)
    second = get_text_embedding(first_pair[1]).squeeze(0)
    third = get_text_embedding(second_pair[0]).squeeze(0)
    fourth = get_text_embedding(second_pair[1]).squeeze(0)

    # --- Raw (un-normalized) embeddings ---
    # Analogy arithmetic, e.g. King - Queen + Man = Woman
    offset_vector_embedding = second - first + third

    cos_sim_init = F.cosine_similarity(offset_vector_embedding, fourth, dim=0)

    # Rounding can push the cosine marginally outside [-1, 1], where acos returns NaN.
    # The raw cosine itself is still returned unchanged.
    cos_text_before_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim_init, -1.0, 1.0)))

    print("Angle for text embedding before normalizing: Averaging ", cos_text_before_normalization)
    rho_text_before_normalization , p_value = spearmanr(offset_vector_embedding.cpu().numpy(), fourth.cpu().numpy())
    print("Spearman's correlation coefficient for text embedding before normalizing:", rho_text_before_normalization)

    # --- L2-normalized embeddings ---
    first = F.normalize(first, p=2, dim=0)
    second = F.normalize(second, p=2, dim=0)
    third = F.normalize(third, p=2, dim=0)
    fourth = F.normalize(fourth, p=2, dim=0)

    offset_vector_embedding = second - first + third

    cos_sim = F.cosine_similarity(offset_vector_embedding, fourth, dim=0)

    cos_text_after_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim, -1.0, 1.0)))

    print("Angle for text embedding after normalizing: Averaging ", cos_text_after_normalization)
    rho_text_after_normalization, p_value = spearmanr(offset_vector_embedding.cpu().numpy(), fourth.cpu().numpy())
    print("Spearman's correlation coefficient for text embedding after normalizing:", rho_text_after_normalization)

    return cos_sim_init, cos_text_before_normalization, rho_text_before_normalization, cos_text_after_normalization, rho_text_after_normalization, offset_vector_embedding, fourth

def find_w2v_text_embedding_arithmetics(pair1,pair2):
    """Score the analogy ``a:b :: c:d`` with word2vec text embeddings.

    The analogy vector ``b - a + c`` is compared with ``d`` twice: once on the
    raw embeddings and once after L2-normalizing each embedding.

    Args:
        pair1: ``"a:b"`` -- the source pair.
        pair2: ``"c:d"`` -- the target pair.

    Returns:
        A 7-tuple ``(cos_sim_init, angle_before, rho_before, angle_after,
        rho_after, offset_vector, target)``: the raw cosine similarity before
        normalization, the angles in degrees (tensors), the Spearman
        correlation coefficients, and the analogy vector / ``d`` embedding from
        the normalized computation.

    Raises:
        ValueError: If ``get_text_embedding_word2vec`` returns ``None`` for any
            of the four terms (no word of the term is in the vocabulary).
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')

    terms = (first_pair[0], first_pair[1], second_pair[0], second_pair[1])
    vectors = [get_text_embedding_word2vec(term) for term in terms]

    # A None embedding would otherwise surface as an opaque TypeError in the arithmetic below.
    missing = [term for term, vector in zip(terms, vectors) if vector is None]
    if missing:
        raise ValueError(f"No word2vec embedding available for: {missing}")

    w2v_first, w2v_second, w2v_third, w2v_fourth = vectors

    # --- Raw (un-normalized) embeddings ---
    # Analogy arithmetic, e.g. King - Queen + Man = Woman
    w2v_offset_vector_embedding = w2v_second - w2v_first + w2v_third

    w2v_cos_sim_init = F.cosine_similarity(w2v_offset_vector_embedding, w2v_fourth, dim=0)

    # Rounding can push the cosine marginally outside [-1, 1], where acos returns NaN.
    # The raw cosine itself is still returned unchanged.
    w2v_cos_text_before_normalization = torch.rad2deg(torch.acos(torch.clamp(w2v_cos_sim_init, -1.0, 1.0)))

    print("Word2Vec Angle for text embedding before normalizing: Averaging ", w2v_cos_text_before_normalization)
    w2v_rho_text_before_normalization , w2v_p_value = spearmanr(w2v_offset_vector_embedding.cpu().numpy(), w2v_fourth.cpu().numpy())
    print("Word2Vec Spearman's correlation coefficient for text embedding before normalizing:", w2v_rho_text_before_normalization)

    # --- L2-normalized embeddings ---
    w2v_first = F.normalize(w2v_first, p=2, dim=0)
    w2v_second = F.normalize(w2v_second, p=2, dim=0)
    w2v_third = F.normalize(w2v_third, p=2, dim=0)
    w2v_fourth = F.normalize(w2v_fourth, p=2, dim=0)

    w2v_offset_vector_embedding = w2v_second - w2v_first + w2v_third

    w2v_cos_sim = F.cosine_similarity(w2v_offset_vector_embedding, w2v_fourth, dim=0)

    w2v_cos_text_after_normalization = torch.rad2deg(torch.acos(torch.clamp(w2v_cos_sim, -1.0, 1.0)))

    print("Word2Vec Angle for text embedding after normalizing: Averaging ", w2v_cos_text_after_normalization)
    w2v_rho_text_after_normalization, w2v_p_value = spearmanr(w2v_offset_vector_embedding.cpu().numpy(), w2v_fourth.cpu().numpy())
    print("Word2Vec Spearman's correlation coefficient for text embedding after normalizing:", w2v_rho_text_after_normalization)

    return w2v_cos_sim_init, w2v_cos_text_before_normalization, w2v_rho_text_before_normalization, w2v_cos_text_after_normalization, w2v_rho_text_after_normalization, w2v_offset_vector_embedding, w2v_fourth

def find_resnet_image_embedding_arithmetics(pair1,pair2):
    """Score the analogy ``a:b :: c:d`` with averaged ResNet image embeddings.

    For each of the four concepts the ResNet outputs of all images in
    ``new_images/<concept>`` are averaged. The analogy vector ``b - a + c`` is
    then compared with ``d`` twice: once on the raw mean embeddings and once
    after L2-normalizing each mean embedding.

    Args:
        pair1: ``"a:b"`` -- the source pair.
        pair2: ``"c:d"`` -- the target pair.

    Returns:
        A 6-tuple ``(angle_before, rho_before, angle_after, rho_after,
        offset_vector, target)`` where the angles are in degrees (tensors), the
        rhos are Spearman correlation coefficients, and ``offset_vector`` /
        ``target`` are the analogy vector and ``d`` embedding from the
        normalized computation.

    Raises:
        ValueError: If a concept folder yields no image embeddings.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')

    mean_embeddings = []
    for concept in (first_pair[0], first_pair[1], second_pair[0], second_pair[1]):
        folder = os.path.join("new_images", concept)
        per_image = get_resnet_images_from_folder(folder)
        if not per_image:
            # torch.stack on an empty list fails with an unhelpful message.
            raise ValueError(f"No image embeddings found for {concept!r} in {folder!r}")
        # Stack the per-image results, drop the singleton batch axis, then average over images.
        stacked = torch.squeeze(torch.stack(per_image, dim=0), dim=1)
        mean_embeddings.append(torch.mean(stacked, dim=0))

    first_image_embeddings, second_image_embeddings, third_image_embeddings, resnet_fourth_image_embeddings = mean_embeddings

    # --- Raw (un-normalized) mean embeddings ---
    resnet_offset_vector_embedding = second_image_embeddings - first_image_embeddings + third_image_embeddings

    cos_sim = F.cosine_similarity(resnet_offset_vector_embedding, resnet_fourth_image_embeddings, dim=0)

    # Rounding can push the cosine marginally outside [-1, 1], where acos returns NaN.
    resnet_cos_image_before_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim, -1.0, 1.0)))

    print("Angle for Image embedding before normalizing: Averaging ", resnet_cos_image_before_normalization)

    resnet_rho_image_before_normalization, p_value = spearmanr(resnet_offset_vector_embedding.cpu().numpy(), resnet_fourth_image_embeddings.cpu().numpy())
    print("Spearman's correlation coefficient for Image embedding before normalizing:", resnet_rho_image_before_normalization)

    # --- L2-normalized mean embeddings ---
    first_image_embeddings = F.normalize(first_image_embeddings, p=2, dim=0)
    second_image_embeddings = F.normalize(second_image_embeddings, p=2, dim=0)
    third_image_embeddings = F.normalize(third_image_embeddings, p=2, dim=0)
    resnet_fourth_image_embeddings = F.normalize(resnet_fourth_image_embeddings, p=2, dim=0)

    resnet_offset_vector_embedding = second_image_embeddings - first_image_embeddings + third_image_embeddings

    cos_sim = F.cosine_similarity(resnet_offset_vector_embedding, resnet_fourth_image_embeddings, dim=0)

    resnet_cos_image_after_normalization = torch.rad2deg(torch.acos(torch.clamp(cos_sim, -1.0, 1.0)))

    print("Angle for Image embedding after normalizing: Averaging ", resnet_cos_image_after_normalization)

    resnet_rho_image_after_normalization, p_value = spearmanr(resnet_offset_vector_embedding.cpu().numpy(), resnet_fourth_image_embeddings.cpu().numpy())
    print("Spearman's correlation coefficient for Image embedding after normalizing:", resnet_rho_image_after_normalization)

    return resnet_cos_image_before_normalization, resnet_rho_image_before_normalization, resnet_cos_image_after_normalization, resnet_rho_image_after_normalization, resnet_offset_vector_embedding, resnet_fourth_image_embeddings


In [32]:
# pair1 = "hour:seconds"
# pair2 = "feet:inches"
# find_resnet_image_embedding_arithmetics(pair1,pair2)

In [47]:
import os
import glob
import pandas as pd


def process_file(pair):
    """Evaluate one analogy line with all four embedding back-ends.

    Args:
        pair: Text of the form ``"a:b::c:d"``; the parts before and after
            ``"::"`` are forwarded as ``pair1`` and ``pair2``.

    Returns:
        ``(new_data, embedding_data)``. ``new_data`` maps metric names to plain
        Python scalars (each obtained with ``.item()``) alongside both pairs;
        ``embedding_data`` maps names to the offset/target vectors returned by
        each back-end alongside both pairs.
    """
    halves = pair.split("::")
    pair1, pair2 = halves[0], halves[1]

    # Image back-ends: CLIP first, then ResNet.
    (clip_img_before, clip_img_rho_before,
     clip_img_after, clip_img_rho_after,
     clip_img_offset, clip_img_target) = find_image_embedding_arithmetics(pair1, pair2)
    (resnet_before, resnet_rho_before,
     resnet_after, resnet_rho_after,
     resnet_offset, resnet_target) = find_resnet_image_embedding_arithmetics(pair1, pair2)

    # Text back-ends: CLIP first, then word2vec.
    (clip_cos_init, clip_txt_before, clip_txt_rho_before,
     clip_txt_after, clip_txt_rho_after,
     clip_txt_offset, clip_txt_target) = find_text_embedding_arithmetics(pair1, pair2)
    (w2v_cos_init, w2v_before, w2v_rho_before,
     w2v_after, w2v_rho_after,
     w2v_offset, w2v_target) = find_w2v_text_embedding_arithmetics(pair1, pair2)

    # Scalar metrics; insertion order fixes the column order of the resulting table.
    new_data = {'pair1': pair1, 'pair2': pair2}

    new_data['cos_image_before_normalization'] = clip_img_before.item()
    new_data['rho_image_before_normalization'] = clip_img_rho_before.item()
    new_data['cos_image_after_normalization'] = clip_img_after.item()
    new_data['rho_image_after_normalization'] = clip_img_rho_after.item()

    new_data['resnet_cos_image_before_normalization'] = resnet_before.item()
    new_data['resnet_rho_image_before_normalization'] = resnet_rho_before.item()
    new_data['resnet_cos_image_after_normalization'] = resnet_after.item()
    new_data['resnet_rho_image_after_normalization'] = resnet_rho_after.item()

    new_data['cos_text_before_normalization'] = clip_txt_before.item()
    new_data['rho_text_before_normalization'] = clip_txt_rho_before.item()
    new_data['cos_text_after_normalization'] = clip_txt_after.item()
    new_data['rho_text_after_normalization'] = clip_txt_rho_after.item()

    new_data['w2v_cos_text_before_normalization'] = w2v_before.item()
    new_data['w2v_rho_text_before_normalization'] = w2v_rho_before.item()
    new_data['w2v_cos_text_after_normalization'] = w2v_after.item()
    new_data['w2v_rho_text_after_normalization'] = w2v_rho_after.item()

    new_data['cos_sim_init_CLIP'] = clip_cos_init.item()
    new_data['cos_sim_init_W2V'] = w2v_cos_init.item()

    # Vectors are stored as returned by the back-ends (no conversion).
    embedding_data = {'pair1': pair1, 'pair2': pair2}

    embedding_data['offset_vector_embedding_image'] = clip_img_offset
    embedding_data['fourth_image_embedding'] = clip_img_target

    embedding_data['resnet_offset_vector_embedding_image'] = resnet_offset
    embedding_data['resnet_fourth_image_embedding'] = resnet_target

    embedding_data['offset_vector_embedding_text'] = clip_txt_offset
    embedding_data['fourth_text_embedding'] = clip_txt_target

    embedding_data['w2v_offset_vector_embedding_text'] = w2v_offset
    embedding_data['w2v_fourth_text_embedding'] = w2v_target

    return new_data, embedding_data

In [49]:
results = []
embedding_results = []

# Process Training and Testing folders
# for sub_folder in ['Training', 'Testing']:
#     sub_folder_path = os.path.join(data_folder, sub_folder)
#     process_folder(sub_folder_path,results,embedding_results)

# Read one analogy per line (format "a:b::c:d"), stripping surrounding whitespace.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped:
# process_file would otherwise fail on them with an IndexError.
with open('new_pairs.txt', 'r') as file:
    lines = [line.strip() for line in file if line.strip()]

for pair in lines:
    print("---- Pair ----" ,pair)
    similarity_data, embedding_data = process_file(pair)
    results.append(similarity_data)
    embedding_results.append(embedding_data)

# Scalar metrics: one row per analogy.
df = pd.DataFrame(results)

df.to_csv('data_w2v_cos_sim.csv', index=False) # TODO: new file

# Offset/target vectors: one row per analogy.
df_embedding = pd.DataFrame(embedding_results)

# NOTE(review): the vector columns hold the tensor objects returned by process_file, so
# the CSV presumably ends up with their printed text form rather than full-precision
# numbers -- consider a dedicated format (e.g. torch.save) if they must be reloaded.
df_embedding.to_csv('embeddings_w2v_cos_sim.csv', index=False) # TODO: new file

---- Pair ---- hour:seconds::feet:inches
Angle for Image embedding before normalizing: Averaging  tensor(44.0971)
Spearman's correlation coefficient for Image embedding before normalizing: 0.40121726262002033
Angle for Image embedding after normalizing: Averaging  tensor(35.3036)
Spearman's correlation coefficient for Image embedding after normalizing: 0.4918100223446745
Angle for Image embedding before normalizing: Averaging  tensor(56.2933)
Spearman's correlation coefficient for Image embedding before normalizing: 0.5911476631476632
Angle for Image embedding after normalizing: Averaging  tensor(50.7823)
Spearman's correlation coefficient for Image embedding after normalizing: 0.6504458784458784
Angle for text embedding before normalizing: Averaging  tensor(43.1168)
Spearman's correlation coefficient for text embedding before normalizing: 0.2857623868461107
Angle for text embedding after normalizing: Averaging  tensor(38.9310)
Spearman's correlation coefficient for text embedding afte