In [None]:
import csv
import sys
import pandas as pd
import ast
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer
# Shared sentence-embedding model; sim_scores() calls its encode() and
# similarity() methods.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from evaluation_a import evaluation_single

In [2]:
# Load the pickled frame holding the GPT-generated meaning descriptions.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
dataA = pd.read_pickle("dataA_bert-base-uncased_without_CLS_SEP.pkl")

# Trim each GPT answer to the piece that directly follows its marker:
#   idiomatic -> the segment after the first "is a metaphor for "
#   literal   -> the segment after the first "literal", whitespace-stripped
new_i = []
new_l = []
for idiomatic_raw, literal_raw in zip(dataA["gpt_idiomatic_meaning"],
                                      dataA["gpt_literal_meaning"]):
    new_i.append(idiomatic_raw.split("is a metaphor for ")[1])
    new_l.append(literal_raw.split("literal")[1].strip())

dataA["gpt_literal_meaning_cut"] = new_l
dataA["gpt_idiomatic_meaning_cut"] = new_i

# Everything below works on the rows whose "subset" value is "Test".
is_test_row = dataA["subset"] == "Test"
data = dataA[is_test_row]

In [3]:
def sim_scores(current, sentences):
    """Score the five image captions against a reference text.

    Args:
        current: row-like object that can be indexed with
            "image1_name" ... "image5_name".
        sentences: sequence whose item 0 is the reference text and whose
            items 1-5 are the captions of images 1-5, in that order.

    Returns:
        dict mapping each image name to the similarity between the reference
        embedding and that image's caption embedding.
    """
    encoded = model.encode(sentences)

    # The reference embedding is compared with every embedding (itself
    # included, at position 0). The result carries an extra outer layer,
    # so take row 0 once and index into it below.
    reference_row = model.similarity(encoded[0], encoded)[0]

    scores = {}
    for position in range(1, 6):
        scores[current[f"image{position}_name"]] = reference_row[position].item()
    return scores

def rank_images(scores, top_k=5):
    """Order image names from highest to lowest similarity score.

    Args:
        scores: dict mapping image name -> similarity score. It is left
            unmodified.
        top_k: maximum number of names to return (default 5). Pass None to
            return every key.

    Returns:
        list of image names sorted by descending score. Names with equal
        scores keep their dict insertion order. If `scores` has fewer than
        `top_k` entries, all of them are returned instead of raising.
    """
    # sorted() is stable, also with reverse=True, so ties come out in
    # insertion order -- the same order repeated max()+del would give.
    ordered = sorted(scores, key=scores.get, reverse=True)
    return ordered if top_k is None else ordered[:top_k]

In [4]:
# Ranking run guided by the stored BERT prediction: the trimmed GPT literal
# meaning is the reference when the prediction is "literal", otherwise the
# trimmed GPT idiomatic meaning is used.

submission = []

total_acc = 0
total_spearman = 0

for _, current in data.iterrows():

    predicted_literal = (
        current["pred_compound_embedding_sentence_compared_to_compound_embedding_gpt_sentence_meanSecondToLast"]
        == "literal"
    )
    if predicted_literal:
        reference = current["gpt_literal_meaning_cut"]
    else:
        reference = current["gpt_idiomatic_meaning_cut"]

    # Reference first, then the five captions in image order.
    sentences = [reference] + [current[f"image{n}_caption"] for n in range(1, 6)]

    ranking = rank_images(sim_scores(current, sentences))

    submission.append({"compound": current["compound"], "expected_order": ranking})

submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

In [5]:
# preprocessing function by Victoria

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


def prepare_text(raw_text):
    """Reduce a text to its lemmatized, non-stopword nouns and verbs.

    Steps: lowercase, delete every character that is not a letter or
    whitespace, tokenize, keep only tokens tagged NOUN or VERB (universal
    tagset), drop English stopwords, lemmatize.

    Args:
        raw_text: the string to preprocess.

    Returns:
        The surviving lemmas joined by single spaces (possibly "").
    """
    # Lowercase, then delete every character that is not a letter or whitespace.
    normalized_text = re.sub(r"[^a-zA-Z\s]", "", raw_text.lower())

    tokens = word_tokenize(normalized_text)

    # Universal tag -> WordNet POS code. Only these two tags are retained.
    wordnet_pos = {"NOUN": "n", "VERB": "v"}
    tagged_tokens = [
        (word, tag)
        for word, tag in nltk.pos_tag(tokens, tagset='universal')
        if tag in wordnet_pos
    ]

    # Tokens are already lowercase, so they can be compared directly.
    stop_words = set(stopwords.words('english'))
    kept_tokens = [(word, tag) for word, tag in tagged_tokens if word not in stop_words]

    # Lemmatize with the token's own POS. The lemmatizer defaults to noun,
    # which would leave verb forms such as "running" unreduced.
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [
        lemmatizer.lemmatize(word, pos=wordnet_pos[tag]) for word, tag in kept_tokens
    ]

    return " ".join(lemmatized_text)

In [6]:
# Ranking run that uses the preprocessed compound itself (not a whole
# sentence) as the reference; the captions go through the same
# preprocessing and no idiomatic/literal prediction is consulted.

submission = []

for _, current in data.iterrows():

    # Compound first, then the five captions in image order.
    text_columns = ["compound"] + [f"image{n}_caption" for n in range(1, 6)]
    sentences = [prepare_text(current[column]) for column in text_columns]

    ranking = rank_images(sim_scores(current, sentences))

    submission.append({"compound": current["compound"], "expected_order": ranking})

submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv
