In [1]:
!pip install torch torchvision
!pip install git+https://github.com/openai/CLIP.git
!pip install scipy
!pip install gensim

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/cz/d_k7_tfn267938ylcg0bywxr0000gn/T/pip-req-build-lxgn3jxi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/cz/d_k7_tfn267938ylcg0bywxr0000gn/T/pip-req-build-lxgn3jxi
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone


In [4]:
import torch
import clip

from PIL import Image
import os

import torch
import torch.nn.functional as F

import numpy as np
from gensim.models import Word2Vec
import gensim.downloader as api

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load the CLIP "ViT-B/32" model onto `device`. The second return value is the
# callable that get_image_embedding applies to each image before encoding it.
model, preprocess = clip.load("ViT-B/32", device=device)
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
# NOTE(review): presumably a large download on first use — confirm before running.
w2v_model = api.load("word2vec-google-news-300")

In [5]:
def get_image_embedding(image):
    """Encode a single image with the CLIP image encoder.

    The image goes through the module-level ``preprocess`` callable, receives a
    leading batch axis of size one, is moved to ``device`` and is then passed
    to ``model.encode_image`` with gradient tracking disabled.

    Args:
        image: Image object accepted by ``preprocess`` (callers in this
            notebook pass PIL images).

    Returns:
        Whatever ``model.encode_image`` returns for the one-image batch.
    """
    batch = preprocess(image)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        features = model.encode_image(batch)

    return features

def get_text_embedding(text):
    """Encode a single string with the CLIP text encoder.

    Args:
        text: The string to tokenize with ``clip.tokenize`` and encode.

    Returns:
        Whatever ``model.encode_text`` returns for the one-element batch
        ``[text]``, computed with gradient tracking disabled.
    """
    tokens = clip.tokenize([text])
    tokens = tokens.to(device)

    with torch.no_grad():
        encoded = model.encode_text(tokens)

    return encoded

def get_text_embedding_word2vec(text):
    """Average the Word2Vec vectors of the whitespace-separated words in ``text``.

    Words missing from ``w2v_model.key_to_index`` are ignored.

    Args:
        text: Phrase to embed; it is split on whitespace.

    Returns:
        A torch tensor holding the element-wise mean of the vectors of all
        known words, or ``None`` when no word of ``text`` is in the vocabulary.
    """
    known_vectors = [
        w2v_model.get_vector(token)
        for token in text.split()
        if token in w2v_model.key_to_index
    ]

    if not known_vectors:
        return None

    # Stack the per-word vectors row-wise and average over the word axis.
    mean_vector = np.mean(np.array(known_vectors), axis=0)

    return torch.tensor(mean_vector)

def get_images_from_folder(folder_path):
    """Embed every JPEG/PNG file located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one ``get_image_embedding`` result per image file, in
        sorted filename order. The list is empty when the folder contains no
        file ending in ``.jpg``, ``.jpeg`` or ``.png`` (any letter case).

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot be
            listed (for example, when it does not exist).
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    embeddings = []

    # os.listdir returns entries in arbitrary order; sort them so repeated runs
    # process the images in the same sequence.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as "IMG_01.JPG" are not
        # silently left out.
        if not filename.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(folder_path, filename)
        # Close the underlying file as soon as the embedding has been computed
        # instead of leaving one open handle per image.
        with Image.open(image_path) as image:
            embeddings.append(get_image_embedding(image))

    return embeddings

In [6]:
from scipy.stats import spearmanr

def find_image_embedding_arithmetics(pair1, pair2, image_root="new_images"):
    """Evaluate the analogy ``a:b :: c:d`` on averaged CLIP image embeddings.

    Each of the four concepts names a sub-folder of ``image_root``. The
    embeddings of all images in a folder are averaged into one vector per
    concept. The analogy vector ``b - a + c`` is then compared with ``d``
    twice: once on the averaged vectors as they are, and once after
    L2-normalising each of the four averaged vectors.

    Args:
        pair1: First pair, written as ``"a:b"``.
        pair2: Second pair, written as ``"c:d"``.
        image_root: Directory holding one sub-folder of images per concept.
            Defaults to ``"new_images"``.

    Returns:
        A 6-tuple ``(angle_before, rho_before, angle_after, rho_after,
        offset, target)``. The angles are tensors holding the angle in degrees
        between ``b - a + c`` and ``d``. The rho values are Spearman
        correlations between the components of those two vectors. ``offset``
        and ``target`` are the analogy vector and the ``d`` vector from the
        normalised stage.

    Raises:
        ValueError: If one of the concept folders yields no image embeddings.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')
    concepts = (first_pair[0], first_pair[1], second_pair[0], second_pair[1])

    mean_embeddings = []
    for concept in concepts:
        folder = os.path.join(image_root, concept)
        per_image = get_images_from_folder(folder)
        if not per_image:
            # torch.stack on an empty list fails with an unhelpful message;
            # report which folder is the problem instead.
            raise ValueError(f"No images found in {folder!r}; cannot average embeddings.")
        # Every embedding comes from a batch of one image, so drop that axis
        # after stacking and average over the images.
        stacked = torch.squeeze(torch.stack(per_image, dim=0), dim=1)
        mean_embeddings.append(torch.mean(stacked, dim=0))

    stage_inputs = (
        ("before", mean_embeddings),
        ("after", [F.normalize(vector, p=2, dim=0) for vector in mean_embeddings]),
    )

    stage_results = {}
    for stage, (first, second, third, fourth) in stage_inputs:
        offset = second - first + third

        # Rounding can push the cosine marginally outside [-1, 1], where acos
        # returns NaN; clamp before converting to an angle.
        cos_sim = torch.clamp(F.cosine_similarity(offset, fourth, dim=0), -1.0, 1.0)
        angle = torch.rad2deg(torch.acos(cos_sim))
        print(f"Angle for Image embedding {stage} normalizing: Averaging ", angle)

        rho, _p_value = spearmanr(offset.cpu().numpy(), fourth.cpu().numpy())
        print(f"Spearman's correlation coefficient for Image embedding {stage} normalizing:", rho)

        stage_results[stage] = (angle, rho, offset, fourth)

    angle_before, rho_before, _, _ = stage_results["before"]
    angle_after, rho_after, offset_after, target_after = stage_results["after"]

    return angle_before, rho_before, angle_after, rho_after, offset_after, target_after


def find_text_embedding_arithmetics(pair1,pair2):
    """Evaluate the analogy ``a:b :: c:d`` on CLIP text embeddings.

    Each concept string is encoded with ``get_text_embedding``. The analogy
    vector ``b - a + c`` is compared with ``d`` twice: once on the embeddings
    as returned, and once after L2-normalising each of the four embeddings.

    Args:
        pair1: First pair, written as ``"a:b"``.
        pair2: Second pair, written as ``"c:d"``.

    Returns:
        A 6-tuple ``(angle_before, rho_before, angle_after, rho_after,
        offset, target)``. The angles are tensors holding the angle in degrees
        between ``b - a + c`` and ``d``. The rho values are Spearman
        correlations between the components of those two vectors. ``offset``
        and ``target`` are the analogy vector and the ``d`` embedding from the
        normalised stage.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')
    concepts = (first_pair[0], first_pair[1], second_pair[0], second_pair[1])

    # get_text_embedding encodes a batch of one string; drop that batch axis.
    embeddings = [get_text_embedding(concept).squeeze(0) for concept in concepts]

    stage_inputs = (
        ("before", embeddings),
        ("after", [F.normalize(vector, p=2, dim=0) for vector in embeddings]),
    )

    stage_results = {}
    for stage, (first, second, third, fourth) in stage_inputs:
        # For "king:queen" and "man:woman" this is queen - king + man, which
        # the analogy expects to land near woman.
        offset = second - first + third

        # Rounding can push the cosine marginally outside [-1, 1], where acos
        # returns NaN; clamp before converting to an angle.
        cos_sim = torch.clamp(F.cosine_similarity(offset, fourth, dim=0), -1.0, 1.0)
        angle = torch.rad2deg(torch.acos(cos_sim))
        print(f"Angle for text embedding {stage} normalizing: Averaging ", angle)

        rho, _p_value = spearmanr(offset.cpu().numpy(), fourth.cpu().numpy())
        print(f"Spearman's correlation coefficient for text embedding {stage} normalizing:", rho)

        stage_results[stage] = (angle, rho, offset, fourth)

    angle_before, rho_before, _, _ = stage_results["before"]
    angle_after, rho_after, offset_after, target_after = stage_results["after"]

    return angle_before, rho_before, angle_after, rho_after, offset_after, target_after

def find_w2v_text_embedding_arithmetics(pair1,pair2):
    """Evaluate the analogy ``a:b :: c:d`` on Word2Vec text embeddings.

    Each concept string is embedded with ``get_text_embedding_word2vec``. The
    analogy vector ``b - a + c`` is compared with ``d`` twice: once on the
    embeddings as returned, and once after L2-normalising each of the four.

    Args:
        pair1: First pair, written as ``"a:b"``.
        pair2: Second pair, written as ``"c:d"``.

    Returns:
        A 6-tuple ``(angle_before, rho_before, angle_after, rho_after,
        offset, target)``. The angles are tensors holding the angle in degrees
        between ``b - a + c`` and ``d``. The rho values are Spearman
        correlations between the components of those two vectors. ``offset``
        and ``target`` are the analogy vector and the ``d`` embedding from the
        normalised stage.

    Raises:
        ValueError: If ``get_text_embedding_word2vec`` returns ``None`` for one
            of the four concepts, i.e. none of its words is in the vocabulary.
    """
    first_pair = pair1.split(':')
    second_pair = pair2.split(':')
    concepts = (first_pair[0], first_pair[1], second_pair[0], second_pair[1])

    embeddings = []
    for concept in concepts:
        vector = get_text_embedding_word2vec(concept)
        if vector is None:
            # Without this check the vector arithmetic below fails with a
            # TypeError that does not say which concept was the problem.
            raise ValueError(
                f"No Word2Vec vector available for {concept!r}: "
                "none of its words are in the vocabulary."
            )
        embeddings.append(vector)

    stage_inputs = (
        ("before", embeddings),
        ("after", [F.normalize(vector, p=2, dim=0) for vector in embeddings]),
    )

    stage_results = {}
    for stage, (w2v_first, w2v_second, w2v_third, w2v_fourth) in stage_inputs:
        # For "king:queen" and "man:woman" this is queen - king + man, which
        # the analogy expects to land near woman.
        w2v_offset = w2v_second - w2v_first + w2v_third

        # Rounding can push the cosine marginally outside [-1, 1], where acos
        # returns NaN; clamp before converting to an angle.
        w2v_cos_sim = torch.clamp(F.cosine_similarity(w2v_offset, w2v_fourth, dim=0), -1.0, 1.0)
        w2v_angle = torch.rad2deg(torch.acos(w2v_cos_sim))
        print(f"Word2Vec Angle for text embedding {stage} normalizing: Averaging ", w2v_angle)

        w2v_rho, _w2v_p_value = spearmanr(w2v_offset.cpu().numpy(), w2v_fourth.cpu().numpy())
        print(f"Word2Vec Spearman's correlation coefficient for text embedding {stage} normalizing:", w2v_rho)

        stage_results[stage] = (w2v_angle, w2v_rho, w2v_offset, w2v_fourth)

    angle_before, rho_before, _, _ = stage_results["before"]
    angle_after, rho_after, offset_after, target_after = stage_results["after"]

    return angle_before, rho_before, angle_after, rho_after, offset_after, target_after


In [34]:
# pair1 = "hour:seconds"
# pair2 = "feet:inches"
# find_w2v_text_embedding_arithmetics(pair1,pair2)

In [7]:
import os
import glob
import pandas as pd


def process_file(pair):
    """Run the image, CLIP-text and Word2Vec analogy evaluations for one entry.

    Args:
        pair: Analogy written as ``"a:b::c:d"``. It is split on ``"::"``; the
            first piece is used as the first pair and the second piece as the
            second pair.

    Returns:
        A tuple ``(new_data, embedding_data)`` of two dicts. ``new_data`` holds
        the two pair strings plus the scalar angle/rho values of every
        evaluation, converted with ``.item()``. ``embedding_data`` holds the
        two pair strings plus the analogy vector and target vector returned by
        each evaluation.
    """
    halves = pair.split("::")
    pair1, pair2 = halves[0], halves[1]

    (img_angle_raw, img_rho_raw, img_angle_unit, img_rho_unit,
     img_offset, img_target) = find_image_embedding_arithmetics(pair1, pair2)

    (txt_angle_raw, txt_rho_raw, txt_angle_unit, txt_rho_unit,
     txt_offset, txt_target) = find_text_embedding_arithmetics(pair1, pair2)

    (w2v_angle_raw, w2v_rho_raw, w2v_angle_unit, w2v_rho_unit,
     w2v_offset, w2v_target) = find_w2v_text_embedding_arithmetics(pair1, pair2)

    # Scalar similarity measures, one flat record per analogy.
    new_data = dict(
        pair1=pair1,
        pair2=pair2,
        cos_image_before_normalization=img_angle_raw.item(),
        rho_image_before_normalization=img_rho_raw.item(),
        cos_image_after_normalization=img_angle_unit.item(),
        rho_image_after_normalization=img_rho_unit.item(),
        cos_text_before_normalization=txt_angle_raw.item(),
        rho_text_before_normalization=txt_rho_raw.item(),
        cos_text_after_normalization=txt_angle_unit.item(),
        rho_text_after_normalization=txt_rho_unit.item(),
        w2v_cos_text_before_normalization=w2v_angle_raw.item(),
        w2v_rho_text_before_normalization=w2v_rho_raw.item(),
        w2v_cos_text_after_normalization=w2v_angle_unit.item(),
        w2v_rho_text_after_normalization=w2v_rho_unit.item(),
    )

    # The vectors behind those measures, kept as returned by the evaluations.
    embedding_data = dict(
        pair1=pair1,
        pair2=pair2,
        offset_vector_embedding_image=img_offset,
        fourth_image_embedding=img_target,
        offset_vector_embedding_text=txt_offset,
        fourth_text_embedding=txt_target,
        w2v_offset_vector_embedding_text=w2v_offset,
        w2v_fourth_text_embedding=w2v_target,
    )

    return new_data, embedding_data

In [9]:
results = []
embedding_results = []

# One analogy per line, written as "a:b::c:d". Blank lines (for example a
# trailing newline at the end of the file) are skipped, because process_file
# would fail on them when splitting on "::".
with open('test_data.txt', 'r') as file:
    lines = [stripped for stripped in (line.strip() for line in file) if stripped]

for pair in lines:
    print("---- Pair ----" ,pair)
    similarity_data, embedding_data = process_file(pair)
    results.append(similarity_data)
    # Store vectors as plain lists of floats. Writing a tensor to CSV would
    # store its printed repr, which is rounded to torch's print precision and
    # awkward to parse back.
    embedding_results.append({
        key: value.detach().cpu().tolist() if torch.is_tensor(value) else value
        for key, value in embedding_data.items()
    })

df = pd.DataFrame(results)

df.to_csv('data_test.csv', index=False) # TODO: new file

df_embedding = pd.DataFrame(embedding_results)

df_embedding.to_csv('embeddings_test.csv', index=False) # TODO: new file

---- Pair ---- hour:seconds::feet:inches




Angle for Image embedding before normalizing: Averaging  tensor(44.0971)
Spearman's correlation coefficient for Image embedding before normalizing: 0.40121726262002033
Angle for Image embedding after normalizing: Averaging  tensor(35.3036)
Spearman's correlation coefficient for Image embedding after normalizing: 0.4918100223446745
Angle for text embedding before normalizing: Averaging  tensor(43.1168)
Spearman's correlation coefficient for text embedding before normalizing: 0.2857623868461107
Angle for text embedding after normalizing: Averaging  tensor(38.9310)
Spearman's correlation coefficient for text embedding after normalizing: 0.30123405331250497
Word2Vec Angle for text embedding before normalizing: Averaging  tensor(58.0464)
Word2Vec Spearman's correlation coefficient for text embedding before normalizing: 0.5296546773910836
Word2Vec Angle for text embedding after normalizing: Averaging  tensor(59.3113)
Word2Vec Spearman's correlation coefficient for text embedding after normal