In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch.nn as nn
import pandas as pd

class CLIPFineTuner(nn.Module):
    """CLIP backbone followed by a linear classification head.

    Args:
        clip_model: Backbone exposing ``config.projection_dim`` and returning an
            object with an ``image_embeds`` attribute when called with
            ``pixel_values`` and ``input_ids``.
        num_classes: Number of logits produced by the head.
    """

    def __init__(self, clip_model, num_classes):
        super().__init__()
        self.clip_model = clip_model
        # The head consumes the backbone's projected image embedding.
        embed_dim = clip_model.config.projection_dim
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, images, input_ids):
        """Compute class logits from the backbone's image embedding.

        Args:
            images: Forwarded to the backbone as ``pixel_values``.
            input_ids: Forwarded to the backbone as ``input_ids``; the head
                itself only reads ``image_embeds`` from the backbone output.

        Returns:
            The output of the linear head applied to ``image_embeds``.
        """
        clip_out = self.clip_model(pixel_values=images, input_ids=input_ids)
        return self.fc(clip_out.image_embeds)
    
num_classes = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained CLIP backbone wrapped with the linear classification head.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
model_ft = CLIPFineTuner(model, num_classes).to(device)

# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state dict needs, instead of allowing arbitrary pickled objects.
model_ft.load_state_dict(
    torch.load(
        '../data/models/clip-vit-base-patch32-finetuned.pth',
        map_location=device,
        weights_only=True,
    )
)
model_ft.eval()  # switch the module to evaluation mode for inference

# Class names indexed by the position of the corresponding logit.
# NOTE(review): index 0 is "sarcasm" here, while map_label() in this notebook
# encodes "sarcasm" as 1 — confirm this order matches the training encoding.
labels = ["sarcasm", "not-sarcasm"]
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

  model_ft.load_state_dict(torch.load('../data/models/clip-vit-base-patch32-finetuned.pth', map_location=device))


In [2]:
def predict_sarcasm(image_path):
    """Classify one image with the fine-tuned model.

    Args:
        image_path: Location of the image, passed to ``PIL.Image.open``.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is the entry of the
        module-level ``labels`` list with the highest softmax probability and
        ``confidence`` is that probability as a Python float.
    """
    # Open via a context manager so the underlying file handle is released
    # deterministically; convert() returns a decoded copy that outlives it.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    image_input = processor(images=image, return_tensors="pt").to(device)
    # The label prompts are tokenized because the wrapped model's forward takes
    # input_ids; CLIPFineTuner's head itself only reads image_embeds.
    text_inputs = processor(text=[f"{label}." for label in labels], return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model_ft(image_input.pixel_values, text_inputs.input_ids)
        probabilities = torch.softmax(outputs, dim=-1)
        # .item() requires a single prediction, i.e. a batch of one image.
        predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
    return labels[predicted_class_idx], probabilities[0][predicted_class_idx].item()

In [3]:
import json
import numpy as np
from PIL import Image

def load_annotation(file_path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(file_path, 'r', encoding="utf-8") as handle:
        return json.load(handle)

# Parse the annotation file once; later cells iterate over `data` by key.
annotation_path = "../data/vimmsd-public-test.json"
data = load_annotation(file_path=annotation_path)

In [4]:
def map_test_label(label):
    """Collapse an annotation label to the binary label set.

    Returns ``label`` unchanged when it equals "not-sarcasm"; any other value
    is mapped to "sarcasm".
    """
    if label == "not-sarcasm":
        return label
    return "sarcasm"

def map_label(label):
    """Encode a label as an integer: 1 for exactly "sarcasm", otherwise 0."""
    if label == "sarcasm":
        return 1
    return 0

# Rows of [key, image file name, predicted label, rounded confidence].
export = []

# NOTE(review): the annotations are loaded from the public-test file while the
# images are read from "dev-images" — confirm this directory holds the
# matching images.
# Iterating over items() avoids a second lookup per record, and the loop
# variable no longer shadows the builtin `id` for the rest of the session.
for sample_id, record in data.items():
    image_name = record['image']
    image_path = f"../data/dev-images/{image_name}"
    predicted_label, confidence = predict_sarcasm(image_path)

    export.append([sample_id, image_name, predicted_label, np.round(confidence, 3)])

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [6]:
# Tabulate the collected prediction rows and write them out as a JSON array
# with one object per row.
export_df = pd.DataFrame(data=export, columns=["key", "image", "label", "score"])
export_df.to_json(path_or_buf='../data/exports/image_labels_ver0.2.json', orient='records')