In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def load_embeddings(folder_path):
    """Load the embedding array stored as a ``.npy`` file in ``folder_path``.

    Only one file is used. When the folder holds several ``.npy`` files the
    alphabetically first one is picked, so the result does not depend on the
    order in which the filesystem happens to list them.

    Args:
        folder_path: Directory (str or path-like) searched for ``*.npy`` files.

    Returns:
        A float32 ``torch.Tensor`` moved to the module-level ``device``.

    Raises:
        FileNotFoundError: If ``folder_path`` contains no ``.npy`` file.
    """
    npy_files = sorted(Path(folder_path).glob("*.npy"))
    if not npy_files:
        raise FileNotFoundError(f"No .npy embedding file found in {folder_path!r}")

    # Read only the file that is actually returned instead of loading every
    # array in the folder and then discarding all but one.
    embedding = np.load(str(npy_files[0])).astype(np.float32)
    return torch.tensor(embedding).to(device)

def load_embeddings_map(file_path):
    """Load the header-less CSV found in the directory ``file_path``.

    Only one file is used. When the directory holds several ``.csv`` files
    the alphabetically first one is picked, so the result does not depend on
    the order in which the filesystem happens to list them.

    Args:
        file_path: Directory (str or path-like) searched for ``*.csv`` files.
            Despite the name, this is a folder, not a single file.

    Returns:
        A ``pd.Series`` with one entry per CSV row; each entry is the list of
        that row's values.

    Raises:
        FileNotFoundError: If the directory contains no ``.csv`` file.
    """
    csv_files = sorted(Path(file_path).glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No .csv embeddings map found in {file_path!r}")

    # Parse only the file that is actually returned.
    rows = pd.read_csv(csv_files[0], header=None).values.tolist()
    return pd.Series(rows)

# The embedding array (.npy) and its accompanying map (.csv) are both read
# from this folder.
folder_path = '../data/embeddings-public-test-clip'
embeddings = load_embeddings(folder_path)

# Every map entry is loaded as the list of one CSV row's values; unwrap
# non-empty lists to their first value and leave anything else untouched.
embeddings_map = load_embeddings_map(folder_path).apply(
    lambda entry: entry if not (isinstance(entry, list) and len(entry) > 0) else entry[0]
)

# Quick sanity report of what was loaded.
print(
    f"Embeddings shape: {embeddings.shape}",
    f"Embeddings device: {embeddings.device}",
    f"Embeddings map length: {len(embeddings_map)}",
    sep="\n",
)

Embeddings shape: torch.Size([1413, 512])
Embeddings device: cuda:0
Embeddings map length: 1413


In [2]:
import json
import numpy as np
from PIL import Image

def load_annotation(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (the decoded JSON value).
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)

# Location of the JSON annotation file (named for the public test set).
annotation_path = "../data/vimmsd-public-test.json"
# Parsed annotations; the prediction cell iterates over its keys and reads
# each entry's 'image' and 'label' fields.
data = load_annotation(annotation_path)

In [3]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load the 'clip-ViT-B-32' sentence-transformers checkpoint and move it to
# `device`; get_clip_cos_sim_score uses it to encode text labels.
text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32').to(device)

def get_clip_cos_sim_score(image_embedding, label):
    """Return the cosine similarity between a text label and an image embedding.

    The label is encoded with the module-level ``text_model``; both vectors
    are placed on ``device`` before being compared with ``util.cos_sim``.

    Args:
        image_embedding: Tensor holding the image embedding.
        label: Text to encode and compare against the image.

    Returns:
        The first row of the similarity matrix converted with ``.item()``.
        NOTE(review): ``.item()`` needs that row to hold exactly one value,
        i.e. a single image embedding is assumed -- confirm against callers.
    """
    label_vector = text_model.encode(label, convert_to_tensor=True).to(device)
    similarity = util.cos_sim(label_vector, image_embedding.to(device))
    return similarity[0].item()

def get_label_by_score(prompt, image_embedding, threshold):
    """Label an image by thresholding its similarity to ``prompt``.

    Args:
        prompt: Text compared against the image.
        image_embedding: Embedding of the image to label.
        threshold: Minimum similarity for the image to count as "sarcasm".

    Returns:
        A two-item list ``[label, score]`` where ``label`` is "sarcasm" when
        ``score >= threshold`` and "not-sarcasm" otherwise.
    """
    score = get_clip_cos_sim_score(image_embedding=image_embedding, label=prompt)
    predicted = "sarcasm" if score >= threshold else "not-sarcasm"
    return [predicted, score]
    
def map_test_label(label):
    """Collapse a label to two classes: keep "not-sarcasm", map the rest to "sarcasm"."""
    if label == "not-sarcasm":
        return label
    return "sarcasm"

def map_label(label):
    """Encode a label as an integer: 1 for "sarcasm", 0 for anything else."""
    if label == "sarcasm":
        return 1
    return 0

  from tqdm.autonotebook import tqdm, trange







In [10]:
from sklearn import metrics
import matplotlib.pyplot as plt

# CLIP model: an image is labelled "sarcasm" when the cosine similarity between
# its embedding and the encoded prompt text is at least `threshold`.
threshold = 0.1995
prompt = "sarcasm"
export = []

# Build the image-name -> embedding-row lookup once instead of scanning the
# whole Series for every sample. setdefault keeps the first row if a name
# appears more than once in the map.
embedding_row_by_image = {}
for row, name in enumerate(embeddings_map):
    embedding_row_by_image.setdefault(name, row)

# `sample_id` (rather than `id`) avoids shadowing the builtin for later cells.
for sample_id, sample in data.items():
    image_name = sample['image']

    row = embedding_row_by_image.get(image_name)
    if row is None:
        # Fail with the offending image instead of an opaque tensor error.
        raise KeyError(
            f"No embedding found for image {image_name!r} (sample {sample_id!r})"
        )

    # Slice rather than integer-index so the embedding keeps its leading
    # dimension, matching what the index-based lookup used to pass along.
    label_predicted, score = get_label_by_score(
        prompt, embeddings[row:row + 1], threshold
    )
    export.append([sample_id, image_name, label_predicted, np.round(score, 3)])

In [None]:
# Collect the per-sample predictions into a table, one row per sample.
export_df = pd.DataFrame(export, columns=["key", "image", "label", "score"])

# to_json does not create missing folders, so make sure the target directory
# exists before writing the records.
export_path = Path('../data/exports/image_labels_ver0.1.json')
export_path.parent.mkdir(parents=True, exist_ok=True)
export_df.to_json(export_path, orient='records')