approaching subtask A using sentence embeddings

- random baseline
- ranking based on similarity of sentence embeddings for the compound / sentence and image captions

In [34]:
# read tsv file
import csv
import sys
import ast
import numpy as np

import pandas as pd
dataDirectory = "./data/"

# read in competition data
dataA_train = pd.read_csv(dataDirectory + "subtask_a_train.tsv", sep='\t')
dataA_train['expected_order'] = dataA_train['expected_order'].apply(ast.literal_eval)
dataA_dev = pd.read_csv(dataDirectory + "subtask_a_dev.tsv", sep='\t')
dataA_test = pd.read_csv(dataDirectory +"subtask_a_test.tsv", sep='\t')

dataA = pd.concat([dataA_train,dataA_dev,dataA_test])
# reset index
dataA = dataA.reset_index(drop=True)

# read in chatGPT data from csv
data_chatGPT_train = pd.read_csv(dataDirectory + "chatGPTNew_train.csv")
data_chatGPT_dev = pd.read_csv(dataDirectory + "chatGPTNew_dev.csv")
data_chatGPT_test = pd.read_csv(dataDirectory + "chatGPTNew_test.csv")
data_chatGPT = pd.concat([data_chatGPT_train,data_chatGPT_dev,data_chatGPT_test])

data_chatGPT = data_chatGPT.reset_index(drop=True)

# rename each column with "gpt_" in front of the column name
data_chatGPT.rename(columns=lambda x: 'gpt_' + x, inplace=True)

# inserting the missing compound column
data_chatGPT["compound"] = [None for i in range(len(data_chatGPT))]
for i in range(len(data_chatGPT)):
    data_chatGPT["compound"][i] = data_chatGPT["gpt_idiomatic_meaning"][i].split(" is")[0].strip().lower()

# read in gpt image description data
data_gpt_image = pd.read_csv(dataDirectory  + "gpt_image_descriptions_all.csv", sep=',')

# merge data into one dataframe
dataA = pd.merge(dataA, data_chatGPT, on='compound')
dataA = pd.merge(dataA, data_gpt_image, on='compound')

sentence_type_columns = ['sentence', 
                         'image1_caption', 'image2_caption', 'image3_caption', 'image4_caption', 'image5_caption', 
                         'gpt_idiomatic_meaning', 'gpt_literal_meaning', 
                         'gpt_idiomatic_sentence', 'gpt_literal_sentence',
                         'gpt_idiomatic_image', 'gpt_literal_image']


# cleanup data
# replace ’ with ' in all columns
for column in sentence_type_columns:
    dataA[column] = dataA[column].str.replace("’","'")


preprocessed = True

In [35]:
# returns the dataframe sample and train (data items that have literal/idiomatic information given)
def only_train(dataA): # returns the dataframe sample and train (data items that have literal/idiomatic information given)
    return pd.concat([dataA[dataA["subset"] == "Sample"],dataA[dataA["subset"]== "Train"]])

In [54]:
# returns the dataframe of subset
def only_subset(dataA, subset): # returns the dataframe sample and train (data items that have literal/idiomatic information given)
    return dataA[dataA["subset"] == subset]

In [36]:
# read in predicted idiomatic/literal prediction

dataA["binary_pred"] = pd.read_pickle("binary_pred.pkl")


from sklearn.metrics import accuracy_score
dataA_train = only_train(dataA)
accuracy_score(dataA_train["binary_pred"],dataA_train["sentence_type"])

0.9

In [37]:
# preprocessing of text (from Victoria)
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

def prepare_text(raw_text):
    
    # Convert text to lowercase and remove punctuation
    normalized_text = raw_text.lower()
    normalized_text = re.sub(r"[^a-zA-Z\s]", "", normalized_text)

    # Tokenize the normalized text
    tokens = word_tokenize(normalized_text)

    # Apply POS tagging and retain only nouns, verbs
    pos_tags = nltk.pos_tag(tokens, tagset='universal')
    pos_tags_to_keep = {"NOUN", "VERB"}
    filtered_tokens = [word for word, pos in pos_tags if pos in pos_tags_to_keep]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in filtered_tokens if word.lower() not in stop_words]

    # Lemmatize the remaining tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    
    return " ".join(lemmatized_text)


[nltk_data] Downloading package stopwords to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Wiebke Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [38]:

if preprocessed == True:
    dataA["compound"] = dataA["compound"].apply(prepare_text)
    for column in sentence_type_columns:
        dataA[column] = dataA[column].apply(prepare_text) 
 

In [39]:
# functions to display images

from PIL import Image
from IPython.display import display


fileDirectory = 'D:\\Wiebke Petersen\\Downloads\\AdMIRe Subtask A Train\\train'

# Open the image file
def display_image(compound, fn):
    img = Image.open(fileDirectory + "\\" + compound + "\\" + fn)
    new_size = (150, 150)  # Width, Height
    img_resized = img.resize(new_size)  
    # Display the image
    display(img_resized)

# returns list of image names sorted from image1 to image5
def get_image_names(n,mydata):
    names = []
    for i in [1,2,3,4,5]:
         names.append(mydata['image' + str(i) + '_name'][n])
    return names

# print information of 1 item:

def print_item(n, mydata):
    # print  'sentence_type', 'sentence'
    compound = mydata['compound'][n]
    print(compound)
    print(mydata['sentence_type'][n])
    print(mydata['sentence'][n])
    print('---------------------------------------------------------------------')


    # for image_names in 'expected_order' print image_captions
    names  =  get_image_names(n,mydata)
    expected_order = mydata['expected_order'][n]
    print(expected_order)

    for image_name in expected_order:
        display_image(compound, image_name)
        # get index of image_name in names
        index = names.index(image_name) + 1
        print(mydata['image'+str(index)+'_caption'][n])
    

In [40]:
# pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [41]:
for type in sentence_type_columns:
    dataA[type + "_sbert_embedding"] = dataA[type].apply(lambda x: model.encode(x))

In [42]:
import pickle
if preprocessed == True:
    prep = "_preprocessed_"
else:
    prep = "_"

dataA.to_pickle("dataA_sbert"+ prep  + ".pkl")


In [43]:
preprocessed = False

if preprocessed == True:
    prep = "_preprocessed_"
else:
    prep = "_"

dataA = pd.read_pickle("dataA_sbert"+ prep + ".pkl")


In [44]:
from scipy.stats import spearmanr
import numpy as np


# evaluation functions for ranked orders
def top1accuracy(pred_rankings,expected_order):
    correct = 0
    for i in range(len(pred_rankings)):
        if pred_rankings[i][0] == expected_order[i][0]:
            correct += 1
    return round(correct/len(pred_rankings),3)


def spearman_correlation(pred_rankings,expected_order):
    corr = []
    for i in range(len(pred_rankings)):
        corr.append(spearmanr(pred_rankings[i],expected_order[i]).correlation)
    return round(np.mean(corr),3)


In [45]:
def sim_scores(current, comparator):
    # input = current line(example) & embeddings for sentence + captions
    
    scores = {}
    # keys = image names
    # values = scores
    embeddings = [current[comparator+ "_sbert_embedding"], 
                 current["image1_caption_sbert_embedding"],
                 current["image2_caption_sbert_embedding"],
                 current["image3_caption_sbert_embedding"],
                 current["image4_caption_sbert_embedding"],
                 current["image5_caption_sbert_embedding"]]

    #embeddings = model.encode(sentences)
    similarities = model.similarity(embeddings[0], embeddings)
    # compares the embedding for the sentence including the compound 
    # with each of the embeddings, including itself and all the captions

    # [0][x] required because similarities tensor has additional layer
    score1 = similarities[0][1].item()
    scores[current["image1_name"]] = score1

    score2 = similarities[0][2].item()
    scores[current["image2_name"]] = score2

    score3 = similarities[0][3].item()
    scores[current["image3_name"]] = score3

    score4 = similarities[0][4].item()
    scores[current["image4_name"]] = score4

    score5 = similarities[0][5].item()
    scores[current["image5_name"]] = score5
    
    return scores

In [46]:
def rank_images(scores):
    ranking = []
    # scores = dictionary containing the cos similarity scores
    # comparing the sentence with the captions of the five images
    # keys = image names
    # values = scores

    for i in range(5):
        # find key which corresponds to the highest value
        m = max(scores, key=scores.get)
        # add the key (image name) to the ranking
        ranking.append(m)
        # delete the entry in the dictionary
        del scores[m]

    return ranking

In [47]:
def dependent_preds(current, comparator):
    pred = current["binary_pred"]
    return rank_images(sim_scores(current, "gpt_" + pred + "_" + comparator ))

In [48]:
dataA["pred_order"] = dataA.apply(lambda x: rank_images(sim_scores(x, "sentence")), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images by similarity to sentence):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order"]))

Evaluation on training data (rank images by similarity to sentence):
top1 accuracy 0.4
spearman correlation 0.201


In [49]:
for sent_type in ['sentence', 'meaning','image']:
    print("\nDepending on binary prediction (literal/idiomatic) rank images by similarity to gpt_" + sent_type)
    dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds(x,sent_type), axis=1)

    dataA_train = only_train(dataA)
    print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
    print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))


Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_sentence
top1 accuracy 0.329
spearman correlation 0.054

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_meaning
top1 accuracy 0.371
spearman correlation 0.183

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_image
top1 accuracy 0.557
spearman correlation 0.086


In [50]:
dataA.columns

Index(['compound', 'subset', 'sentence_type', 'sentence', 'expected_order',
       'image1_name', 'image1_caption', 'image2_name', 'image2_caption',
       'image3_name', 'image3_caption', 'image4_name', 'image4_caption',
       'image5_name', 'image5_caption', 'gpt_idiomatic_meaning',
       'gpt_literal_meaning', 'gpt_idiomatic_sentence', 'gpt_literal_sentence',
       'gpt_idiomatic_image', 'gpt_literal_image', 'binary_pred',
       'sentence_sbert_embedding', 'image1_caption_sbert_embedding',
       'image2_caption_sbert_embedding', 'image3_caption_sbert_embedding',
       'image4_caption_sbert_embedding', 'image5_caption_sbert_embedding',
       'gpt_idiomatic_meaning_sbert_embedding',
       'gpt_literal_meaning_sbert_embedding',
       'gpt_idiomatic_sentence_sbert_embedding',
       'gpt_literal_sentence_sbert_embedding',
       'gpt_idiomatic_image_sbert_embedding',
       'gpt_literal_image_sbert_embedding', 'pred_order',
       'pred_order_dependent'],
      dtype='object')

In [51]:
# sim scores

def dependent_preds(current, sent_type):
       preds = [0 for i in range(5)]
       scores_lit = sim_scores(current, "gpt_literal_" + sent_type)
       scores_id = sim_scores(current, "gpt_idiomatic_" + sent_type)
       image_names = list(scores_lit.keys())
       # get highest scoring image for literal and idiomatic
       m_lit = max(scores_lit,key=scores_lit.get)
       preds[0] = m_lit

       del scores_id[m_lit]
       m_id = max(scores_id, key=scores_id.get)
       preds[3] = m_id
    
       m_lit_index = image_names.index(m_lit)
       m_id_index = image_names.index(m_id)
       scores_images_lit = sim_scores(current, "image" + str(m_lit_index + 1) + "_caption")
       scores_images_id = sim_scores(current, "image" + str(m_id_index +1) + "_caption")
       del scores_images_lit[m_lit]
       del scores_images_lit[m_id]
       del scores_images_id[m_lit]
       del scores_images_id[m_id]

       sim_max_lit = max(scores_images_lit, key=scores_images_lit.get)
       preds[1] = sim_max_lit
    
       del scores_images_id[sim_max_lit]
       sim_max_id = max(scores_images_id, key=scores_images_id.get)
       preds[2] = sim_max_id
       preds[4] = list(set(image_names).difference(set([m_lit,m_id,sim_max_lit,sim_max_id])))[0]
       if not(set(preds) == set(image_names)):
           print("there is some serious problem") 
       if current["binary_pred"] == "idiomatic":
          preds_new = [preds[i] for i in [3,2,1,0,4]]
          preds = preds_new
       return preds


In [52]:
dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds(x,"image"), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))

Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):
top1 accuracy 0.543
spearman correlation 0.286


In [57]:
def make_submission(dataA, column, subset):
    subset_data = only_subset(dataA,subset)
    submission_df = pd.DataFrame()
    submission_df["compound"] = subset_data["compound"]
    submission_df["expected_order"] = subset_data[column]
    submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
    print("File saved as submission_EN.tsv")

make_submission(dataA,"pred_order_dependent", "Dev")

File saved as submission_EN.tsv


In [26]:
# sim scores

def dependent_preds_no_pairs(current, sent_type):
    preds = [0 for i in range(5)]
    scores_lit = sim_scores(current, "gpt_literal_" + sent_type)
    scores_id = sim_scores(current, "gpt_idiomatic_" + sent_type)
    image_names = list(scores_lit.keys())
    # get highest scoring image for literal and idiomatic
    m_lit = max(scores_lit,key=scores_lit.get)
    preds[0] = m_lit

    del scores_id[m_lit]
    m_id = max(scores_id, key=scores_id.get)
    preds[3] = m_id

    del scores_lit[m_lit]
    del scores_lit[m_id]

    m_lit_second = max(scores_lit, key=scores_lit.get)
    preds[1] = m_lit_second

    del scores_id[m_id]
    del scores_id[m_lit_second]
    m_id_second = max(scores_id, key=scores_id.get)
    preds[2] = m_id_second
    preds[4] = list(set(image_names).difference(set([m_lit,m_id,m_lit_second,m_id_second])))[0]
    if not(set(preds) == set(image_names)):
           print("there is some serious problem") 
    if current["binary_pred"] == "idiomatic":
          preds_new = [preds[i] for i in [3,2,1,0,4]]
          preds = preds_new
    return preds


In [27]:
dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds_no_pairs(x,"image"), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))

Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):
top1 accuracy 0.543
spearman correlation 0.283


## Reste

In [40]:
# sentence

# entire subtask A training data (70):
# final_acc = 0.4
# final_spearman = 0.20142857142857135

# only literal (31):
# final_acc = 0.6774193548387096
# final_spearman = 0.3064516129032258

# only idiomatic (39):
# final_acc = 0.1794871794871795
# final_spearman = 0.11794871794871792

In [28]:
# comparing embeddings for captions with that of compound (not whole sentence)

data = dataA
#data = dataA[dataA["sentence_type"]=="literal"]
#data = dataA[dataA["sentence_type"]=="idiomatic"]

total_acc = 0
total_spearman = 0

for i in range(len(data)):

    scores = sim_scores(current, sentences)
    ranking = rank_images(scores)

    exp_order = current["expected_order"]
    evaluation = evaluation_single(ranking,exp_order)
    total_acc += evaluation[0]
    total_spearman += evaluation[1]

final_acc = total_acc / len(data)
print(final_acc)
final_spearman = total_spearman / len(data)
print(final_spearman)

TypeError: can only concatenate list (not "str") to list

In [15]:
# compound

#  entire subtask A training data (70):
# final_acc = 0.4
# final_spearman = 0.16714285714285712

# only literal (31):
# final_acc = 0.8064516129032258
# final_spearman = 0.39354838709677425

# only idiomatic (39):
# final_acc = 0.07692307692307693
# final_spearman = -0.012820512820512832

In [16]:
# combining "sentence" and "compound" embeddings (average)

def sim_scores_combined(current, sentences):
    # input = current line(example) & embeddings for sentence + captions
    
    scores = {}
    # keys = image names
    # values = scores

    embeddings = model.encode(sentences)

    # combining compound & sentence embeddings
    sent_comp = (embeddings[0] + embeddings[1]) / 2

    similarities = model.similarity(sent_comp, embeddings[2:])
    # compares the embedding for the sentence and compound combined
    # with each of the embeddings, including itself and all the captions

    # [0][x] required because similarities tensor has additional layer
    score1 = similarities[0][0].item()
    scores[current["image1_name"]] = score1

    score2 = similarities[0][1].item()
    scores[current["image2_name"]] = score2

    score3 = similarities[0][2].item()
    scores[current["image3_name"]] = score3

    score4 = similarities[0][3].item()
    scores[current["image4_name"]] = score4

    score5 = similarities[0][4].item()
    scores[current["image5_name"]] = score5
    
    return scores

In [17]:
# comparing embeddings for captions with that of compound and sentence (averaged)

data = dataA
#data = dataA[dataA["sentence_type"]=="literal"]
#data = dataA[dataA["sentence_type"]=="idiomatic"]

total_acc = 0
total_spearman = 0

for i in range(len(data)):

    current = data.iloc[i]
    sentences = [current["compound"], 
                 current["sentence"], 
                 current["image1_caption"],
                 current["image2_caption"],
                 current["image3_caption"],
                 current["image4_caption"],
                 current["image5_caption"]]

    scores = sim_scores_combined(current, sentences)
    ranking = rank_images(scores)

    exp_order = current["expected_order"]
    evaluation = evaluation_single(ranking,exp_order)
    total_acc += evaluation[0]
    total_spearman += evaluation[1]

final_acc = total_acc / len(data)
print(final_acc)
final_spearman = total_spearman / len(data)
print(final_spearman)

0.34285714285714286
0.2271428571428571


In [18]:
# sentence and compound combined

# entire subtask A training data:
# final_acc = 0.34285714285714286 -> worse than compound / sentence individually
# final_spearman = 0.2271428571428571 -> better than either individually

# only literal: -> worse than compound, better than sentence
# final_acc = 0.7096774193548387 
# final_spearman = 0.36774193548387096

# only idiomatic:
# final_acc = 0.05128205128205128 -> worse than either individually
# final_spearman = 0.11538461538461536 -> better than compound, about as good as sentence

In [19]:
# random baseline

import random
random.seed(2)

data = dataA

total_acc = 0
total_spearman = 0

for i in range(len(data)):

    current = data.iloc[i]

    ranking = []
    images = [current["image1_name"],
              current["image2_name"],
              current["image3_name"],
              current["image4_name"],
              current["image5_name"]]

    for i in range(5):
        rand_img = random.choice(images)
        ranking.append(rand_img)
        images.remove(rand_img)

    exp_order = current["expected_order"]
    evaluation = evaluation_single(ranking,exp_order)
    total_acc += evaluation[0]
    total_spearman += evaluation[1]

final_acc = total_acc / len(data)
print(final_acc)
final_spearman = total_spearman / len(data)
print(final_spearman)

0.17142857142857143
0.03428571428571425


In [20]:
# random

# final_acc = 0.17142857142857143
# final_spearman = 0.03428571428571425