Same approach as in subtaskA_1, adapted for the dev data,
with Ann's code added to save the results in the correct format for submission.

In [1]:
import csv
import sys
import pandas as pd
import ast
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by the whole notebook; the scoring functions
# defined further down read this global to encode texts and compare embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from evaluation_a import evaluation_single

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the Subtask A dev split (loading code by Wiebke)

fileName = "subtask_a_dev.tsv"
fileDirectory = "AdMIRe Subtask A Dev/dev"

# The file is tab-separated; its path is "<fileDirectory>/<fileName>".
data = pd.read_csv("/".join((fileDirectory, fileName)), sep='\t')

In [3]:
# Preprocessing function by Victoria

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# The stop-word set and the lemmatizer are the same for every call, so they are
# built once on first use and kept here instead of being rebuilt for each text.
_PREPARE_TEXT_RESOURCES = {}


def _prepare_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPARE_TEXT_RESOURCES:
        _PREPARE_TEXT_RESOURCES["stop_words"] = set(stopwords.words('english'))
        _PREPARE_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return (_PREPARE_TEXT_RESOURCES["stop_words"],
            _PREPARE_TEXT_RESOURCES["lemmatizer"])


def prepare_text(raw_text):
    """Reduce a text to its lemmatized nouns and verbs.

    Steps: lowercase, drop every character that is not a letter or
    whitespace, tokenize, keep only tokens tagged NOUN or VERB (universal
    tagset), drop English stop words, lemmatize what is left.

    Args:
        raw_text: The text to preprocess. Anything that is not a ``str``
            (for example ``None`` or a NaN from an empty table cell) is
            treated as empty text.

    Returns:
        The remaining lemmas joined by single spaces; an empty string when
        nothing remains or when the input is not a string.
    """
    # Missing values would otherwise crash on .lower()
    if not isinstance(raw_text, str):
        return ""

    # Convert text to lowercase and remove punctuation / digits
    normalized_text = raw_text.lower()
    normalized_text = re.sub(r"[^a-zA-Z\s]", "", normalized_text)

    # Tokenize the normalized text
    tokens = word_tokenize(normalized_text)

    # Apply POS tagging and retain only nouns and verbs
    pos_tags = nltk.pos_tag(tokens, tagset='universal')
    pos_tags_to_keep = {"NOUN", "VERB"}
    filtered_tokens = [word for word, pos in pos_tags if pos in pos_tags_to_keep]

    stop_words, lemmatizer = _prepare_text_resources()

    # Remove stopwords
    filtered_text = [word for word in filtered_tokens if word.lower() not in stop_words]

    # Lemmatize the remaining tokens
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]

    return " ".join(lemmatized_text)

In [4]:
def sim_scores(current, sentences):
    """Score each image caption against a reference text.

    Args:
        current: The data row of the current example; read for its
            ``image<k>_name`` entries (k = 1, 2, ...).
        sentences: Texts to embed. ``sentences[0]`` is the reference text
            (e.g. the context sentence or the compound) and ``sentences[k]``
            is the caption of image ``k``. Any number of captions is
            accepted as long as ``current`` has a matching name entry.

    Returns:
        dict mapping each image name to the similarity, as a Python float,
        between the reference embedding and that image's caption embedding.
    """
    embeddings = model.encode(sentences)
    # Compares the reference embedding with every embedding, including its
    # own at position 0; that self-comparison is skipped below.
    similarities = model.similarity(embeddings[0], embeddings)

    scores = {}
    for image_number in range(1, len(sentences)):
        # [0][x] required because the similarities tensor has an additional layer
        scores[current[f"image{image_number}_name"]] = similarities[0][image_number].item()

    return scores

In [5]:
def rank_images(scores):
    """Order image names from highest to lowest score.

    Args:
        scores: dict whose keys are image names and whose values are the
            similarity scores of those images. It is left unmodified.

    Returns:
        A new list with every image name in ``scores``, best score first.
        Names with equal scores keep the order in which they appear in
        ``scores``. An empty dict yields an empty list.
    """
    # A stable descending sort yields the same order as repeatedly taking the
    # maximum and deleting it, without emptying the caller's dict and without
    # failing when there are fewer than five entries.
    return sorted(scores, key=scores.get, reverse=True)

In [6]:
# Variant: context sentence (which contains the compound) vs. image captions, raw text

# Column 0 is the reference text; the remaining five are the captions to rank.
text_columns = ("sentence",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [current[column] for column in text_columns]

    scores = sim_scores(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv


In [7]:
# Variant: context sentence (which contains the compound) vs. image captions,
# every text passed through prepare_text first

# Column 0 is the reference text; the remaining five are the captions to rank.
text_columns = ("sentence",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [prepare_text(current[column]) for column in text_columns]

    scores = sim_scores(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv


In [8]:
# Variant: compound alone (not the whole sentence) vs. image captions, raw text

# Column 0 is the reference text; the remaining five are the captions to rank.
text_columns = ("compound",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [current[column] for column in text_columns]

    scores = sim_scores(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv


In [9]:
# Variant: compound alone (not the whole sentence) vs. image captions,
# every text passed through prepare_text first

# Column 0 is the reference text; the remaining five are the captions to rank.
text_columns = ("compound",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [prepare_text(current[column]) for column in text_columns]

    scores = sim_scores(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv


In [10]:
# combining "sentence" and "compound" embeddings (average)

def sim_scores_combined(current, sentences):
    """Score each image caption against the averaged compound/sentence embedding.

    Args:
        current: The data row of the current example; read for its
            ``image<k>_name`` entries (k = 1, 2, ...).
        sentences: Texts to embed. ``sentences[0]`` and ``sentences[1]`` are
            the two reference texts whose embeddings are averaged (the
            callers pass the compound and the context sentence);
            ``sentences[k + 1]`` is the caption of image ``k``. Any number of
            captions is accepted as long as ``current`` has a matching name
            entry.

    Returns:
        dict mapping each image name to the similarity, as a Python float,
        between the averaged reference embedding and that image's caption
        embedding.
    """
    embeddings = model.encode(sentences)

    # Combine the two reference embeddings by taking their element-wise mean
    sent_comp = (embeddings[0] + embeddings[1]) / 2

    # Only the caption embeddings are compared here, so position 0 of the
    # result belongs to image 1 (there is no self-comparison entry).
    similarities = model.similarity(sent_comp, embeddings[2:])

    scores = {}
    for caption_idx in range(len(sentences) - 2):
        # [0][x] required because the similarities tensor has an additional layer
        scores[current[f"image{caption_idx + 1}_name"]] = similarities[0][caption_idx].item()

    return scores

In [None]:
# Variant: averaged compound + sentence embedding vs. image captions, raw text

# Columns 0 and 1 are the two reference texts that get averaged;
# the remaining five are the captions to rank.
text_columns = ("compound",
                "sentence",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [current[column] for column in text_columns]

    scores = sim_scores_combined(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv


In [12]:
# Variant: averaged compound + sentence embedding vs. image captions,
# every text passed through prepare_text first

# Columns 0 and 1 are the two reference texts that get averaged;
# the remaining five are the captions to rank.
text_columns = ("compound",
                "sentence",
                "image1_caption",
                "image2_caption",
                "image3_caption",
                "image4_caption",
                "image5_caption")

submission = []

for _, current in data.iterrows():
    sentences = [prepare_text(current[column]) for column in text_columns]

    scores = sim_scores_combined(current, sentences)
    ranking = rank_images(scores)

    submission.append({"compound": current["compound"], "expected_order": ranking})

# Write one row per example as a tab-separated file.
# NOTE: every experiment cell in this notebook writes to this same file name,
# so running this cell replaces the result of whichever variant ran before.
submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
print("File saved as submission_EN.tsv")

File saved as submission_EN.tsv
