In [6]:
import ast
import os

import numpy as np
import pandas as pd

# Root directory of the data files read by this notebook.
base_path = "/data/user_data/sbharad2/SpeechCLIP/data"

# Directory template for the saved audio embeddings; `csv_name` selects the split.
embedding_read_path_pattern = (
    "{base_path}/Flickr8k.{csv_name}.token.txt.audio_embeddings/"
)

# FLICKR
csv_path_pattern = "{base_path}/flickr/{csv_name}.csv"

dev_df = pd.read_csv(
    csv_path_pattern.format(base_path=base_path, csv_name="flickr_dev_sampled")
)

# The "target" column comes back from the CSV as text; parse each cell into a
# Python object (a list, per the original intent).
# ast.literal_eval is used instead of eval(): it only accepts Python literals,
# so content in the CSV can never be executed as code.
# NOTE(review): assumes every cell is a plain literal such as "[1, 2]" — confirm
# against the CSV; a non-literal cell now raises ValueError instead of running.
dev_df["target"] = dev_df["target"].apply(ast.literal_eval)

print("Dev size:", dev_df.shape)

Dev size: (1000, 6)


In [16]:
def _load_embeddings(df, embedding_read_path):
    embeddings = {}
    for i, r in df.iterrows():
        try:
            example_id = r["example_id"]
            embedding = np.load(os.path.join(embedding_read_path, example_id) + ".npy")
            embeddings[example_id] = embedding
        except FileNotFoundError:
            print(f"File not found for {example_id}.npy")

    return embeddings


# Embeddings of the sampled dev split, read from the standard embedding directory.
embeddings = _load_embeddings(
    df=dev_df,
    embedding_read_path=embedding_read_path_pattern.format(
        csv_name="flickr_dev_sampled", base_path=base_path
    ),
)

# Embeddings of the same examples, read from the "noisy." directory.
noisy_embeddings = _load_embeddings(
    df=dev_df,
    embedding_read_path=(
        base_path + "/noisy.Flickr8k.flickr_dev_sampled.token.txt.audio_embeddings/"
    ),
)

In [17]:
def _get_image_id(example_id):
    return example_id.split("#")[0]


def _get_caption_id(example_id):
    return example_id.split("#")[1]


def compute_intra_image_similarities(embeddings, captions_per_image=5):
    """Mean pairwise cosine similarity among the captions of each image.

    Args:
        embeddings: dict mapping ``"<image_id>#<caption_index>"`` to an
            embedding vector (assumed 1-D so that ``np.dot`` yields a scalar —
            TODO confirm). Every image must have entries for caption indices
            ``0 .. captions_per_image - 1``.
        captions_per_image: Number of captions per image. The default of 5
            gives 10 caption pairs per image.

    Returns:
        list with one value per image, in order of first appearance in
        ``embeddings``: the mean cosine similarity over all caption pairs of
        that image.

    Raises:
        ValueError: if ``captions_per_image`` is smaller than 2 (no pair exists).
        KeyError: if an expected caption of some image is missing from
            ``embeddings``.
    """
    if captions_per_image < 2:
        raise ValueError("captions_per_image must be at least 2 to form a pair")

    images_seen = set()
    similarities = []
    for example_id in embeddings:
        image_id = _get_image_id(example_id)
        if image_id in images_seen:
            continue
        images_seen.add(image_id)

        # Fetch each caption vector and its norm once instead of once per pair.
        vectors = [
            embeddings[image_id + "#" + str(cid)] for cid in range(captions_per_image)
        ]
        norms = [np.linalg.norm(vector) for vector in vectors]

        within_image_similarities = []
        for first in range(captions_per_image):
            for second in range(first + 1, captions_per_image):
                sim = np.dot(vectors[first], vectors[second]) / (
                    norms[first] * norms[second]
                )
                within_image_similarities.append(sim)
        # add mean similarity
        similarities.append(np.mean(within_image_similarities))
    return similarities


def compute_inter_image_similarities(embeddings):
    """Mean cosine similarity between each image and every other image.

    Each image is represented by the embedding stored under
    ``"<image_id>#0"``.

    Args:
        embeddings: dict mapping ``"<image_id>#<caption_index>"`` to an
            embedding vector (assumed 1-D so that ``np.dot`` yields a scalar —
            TODO confirm). Every image must have a ``#0`` entry.

    Returns:
        list with one value per image, in order of first appearance in
        ``embeddings``: the mean cosine similarity between that image's ``#0``
        embedding and the ``#0`` embedding of every other image. With a single
        image there is nothing to average and the entry is ``nan``.

    Raises:
        KeyError: if some image has no ``#0`` entry in ``embeddings``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # make the output order depend on string hashing, i.e. differ between runs.
    images = list(
        dict.fromkeys(_get_image_id(example_id) for example_id in embeddings)
    )

    # Look up each image's "#0" embedding and its norm once, not once per pair.
    anchors = [embeddings[image_id + "#0"] for image_id in images]
    norms = [np.linalg.norm(anchor) for anchor in anchors]

    inter_image_similarities = []
    for i, (anchor, anchor_norm) in enumerate(zip(anchors, norms)):
        outside_image_similarities = [
            np.dot(anchor, other) / (anchor_norm * other_norm)
            for j, (other, other_norm) in enumerate(zip(anchors, norms))
            if j != i
        ]
        inter_image_similarities.append(np.mean(outside_image_similarities))
    return inter_image_similarities

In [18]:
intra_image_similarities = compute_intra_image_similarities(embeddings)
inter_image_similarities = compute_inter_image_similarities(embeddings)

# Print the mean of the per-image scores and the number of images behind it.
for _label, _scores in (
    ("Intra image similarities:", intra_image_similarities),
    ("Inter image similarities:", inter_image_similarities),
):
    print(_label, np.mean(_scores), len(_scores))

Intra image similarities: 0.5382721 200
Inter image similarities: 0.025960488 200


In [None]:
# Same intra-/inter-image statistics, computed on the noisy embeddings.
noisy_intra_image_similarities = compute_intra_image_similarities(noisy_embeddings)
noisy_inter_image_similarities = compute_inter_image_similarities(noisy_embeddings)

# Print the mean of the per-image scores and the number of images behind it.
for _label, _scores in (
    ("Noisy Intra image similarities:", noisy_intra_image_similarities),
    ("Noisy Inter image similarities:", noisy_inter_image_similarities),
):
    print(_label, np.mean(_scores), len(_scores))

Noisy Intra image similarities: 0.40748093 200
Noisy Inter image similarities: 0.06338069 200


In [None]:
# Compute similarity between clean and noisy embeddings
def compute_similarity_between_clean_and_noisy(embeddings, noisy_embeddings):
    """Cosine similarity between each embedding and its noisy counterpart.

    Args:
        embeddings: dict mapping example id to an embedding vector.
        noisy_embeddings: dict with an entry for every key of ``embeddings``.

    Returns:
        list of cosine similarities, one per key of ``embeddings``, in that
        dict's iteration order.

    Raises:
        KeyError: if a key of ``embeddings`` is absent from ``noisy_embeddings``.
    """

    def _cosine(clean_vec, noisy_vec):
        # Dot product scaled by the product of the two vector norms.
        return np.dot(clean_vec, noisy_vec) / (
            np.linalg.norm(clean_vec) * np.linalg.norm(noisy_vec)
        )

    return [
        _cosine(clean_vec, noisy_embeddings[example_id])
        for example_id, clean_vec in embeddings.items()
    ]


# Compute the per-example similarities once and reuse the list for both the
# mean and the count (the full pass was previously executed twice).
clean_noisy_similarities = compute_similarity_between_clean_and_noisy(
    embeddings, noisy_embeddings
)
print(
    "Similarity between clean and noisy embeddings:",
    np.mean(clean_noisy_similarities),
    len(clean_noisy_similarities),
)

Similarity between clean and noisy embeddings: 0.7409807 1000
