approaching subtask A using sentence embeddings

- random baseline
- ranking based on similarity of sentence embeddings for the compound / sentence and image captions

**Note:** needs binar_pred.pkl file generated in subtaskA_predictions_fromBERT.ipynb

## Make embeddings

In [27]:
# read tsv file
import csv
import sys
import ast
import numpy as np

import pandas as pd
dataDirectory = "./data/"

# read in competition data
dataA_train = pd.read_csv(dataDirectory + "subtask_a_train.tsv", sep='\t')
dataA_train['expected_order'] = dataA_train['expected_order'].apply(ast.literal_eval)
dataA_dev = pd.read_csv(dataDirectory + "subtask_a_dev.tsv", sep='\t')
dataA_xe = pd.read_csv(dataDirectory + "subtask_a_xe.tsv", sep='\t')
dataA_test = pd.read_csv(dataDirectory +"subtask_a_test.tsv", sep='\t')

dataA = pd.concat([dataA_train,dataA_dev,dataA_test,dataA_xe])
# reset index
dataA = dataA.reset_index(drop=True)

# read in chatGPT data from csv
data_chatGPT_train = pd.read_csv(dataDirectory + "chatGPTNew_train.csv")
data_chatGPT_dev = pd.read_csv(dataDirectory + "chatGPTNew_dev.csv")
data_chatGPT_test = pd.read_csv(dataDirectory + "chatGPTNew_test.csv")
data_chatGPT = pd.concat([data_chatGPT_train,data_chatGPT_dev,data_chatGPT_test])

data_chatGPT = data_chatGPT.reset_index(drop=True)

# rename each column with "gpt_" in front of the column name
data_chatGPT.rename(columns=lambda x: 'gpt_' + x, inplace=True)

# inserting the missing compound column
data_chatGPT["compound"] = [None for i in range(len(data_chatGPT))]
for i in range(len(data_chatGPT)):
    data_chatGPT["compound"][i] = data_chatGPT["gpt_idiomatic_meaning"][i].split(" is")[0].strip().lower()

# read in gpt image description data
data_gpt_image = pd.read_csv(dataDirectory  + "gpt_image_descriptions_all.csv", sep=',')

# merge data into one dataframe
dataA = pd.merge(dataA, data_chatGPT, on='compound')
dataA = pd.merge(dataA, data_gpt_image, on='compound')

sentence_type_columns = ['sentence', 
                         'image1_caption', 'image2_caption', 'image3_caption', 'image4_caption', 'image5_caption', 
                         'gpt_idiomatic_meaning', 'gpt_literal_meaning', 
                         'gpt_idiomatic_sentence', 'gpt_literal_sentence',
                         'gpt_idiomatic_image', 'gpt_literal_image']


# cleanup data
# replace ’ with ' in all columns
for column in sentence_type_columns:
    dataA[column] = dataA[column].str.replace("’","'")


preprocessed = False
#preprocessed = True

In [28]:
# returns the dataframe sample and train (data items that have literal/idiomatic information given)
def only_train(dataA): # returns the dataframe sample and train (data items that have literal/idiomatic information given)
    return pd.concat([dataA[dataA["subset"] == "Sample"],dataA[dataA["subset"]== "Train"]])

# returns the dataframe of subset
def only_subset(dataA, subset): # returns the dataframe sample and train (data items that have literal/idiomatic information given)
    return dataA[dataA["subset"] == subset]

In [29]:
# minor cheating: add opposite sentence_type for each compound in extended data that is also in train/sample

dataA_train = only_train(dataA)
dataA_train = dataA_train.reset_index(drop=True)
train_compounds = [(dataA_train["compound"][i], dataA_train["sentence_type"][i], list(dataA_train["expected_order"][i])) for i in range(len(dataA_train))]
def cheat_items(item):
    new_order = [3,2,1,0,4]
    if item["subset"] == "Extended Evaluation":
        compound = item["compound"]
        t = [triple for triple in train_compounds if triple[0] == compound]
        if t != []:
            (c,sent_type,order) = t[0]
            if sent_type == "literal":
                item["sentence_type"] = "idiomatic"
            else: 
                item["sentence_type"] = "literal"
            item["expected_order"] = [order[i] for i in new_order]
        else: 
            item["sentence_type"] = None
            item["expected_order"] = None
    return item

dataA = dataA.apply(lambda x: cheat_items(x), axis=1)    

In [31]:
# read in predicted idiomatic/literal prediction
# best prediction gained from BERT-embeddings

dataA["binary_pred"] = pd.read_pickle("binary_pred.pkl")


from sklearn.metrics import accuracy_score

dataA_train = only_train(dataA)
print(accuracy_score(dataA_train["binary_pred"].tolist(),dataA_train["sentence_type"].tolist()))
dataA_trainext = dataA[dataA["sentence_type"].notnull()] 
print(accuracy_score(dataA_trainext["binary_pred"].tolist(),dataA_trainext["sentence_type"].tolist()))



0.9857142857142858
0.9357142857142857


In [32]:
# preprocessing of text (from Victoria)
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

def prepare_text(raw_text):
    
    # Convert text to lowercase and remove punctuation
    normalized_text = raw_text.lower()
    normalized_text = re.sub(r"[^a-zA-Z\s]", "", normalized_text)

    # Tokenize the normalized text
    tokens = word_tokenize(normalized_text)

    # Apply POS tagging and retain only nouns, verbs
    pos_tags = nltk.pos_tag(tokens, tagset='universal')
    pos_tags_to_keep = {"NOUN", "VERB"}
    filtered_tokens = [word for word, pos in pos_tags if pos in pos_tags_to_keep]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in filtered_tokens if word.lower() not in stop_words]

    # Lemmatize the remaining tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    
    return " ".join(lemmatized_text)


[nltk_data] Downloading package stopwords to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Wiebke
[nltk_data]     Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Wiebke Petersen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [33]:

if preprocessed == True:
    for column in sentence_type_columns:
        dataA[column] = dataA[column].apply(prepare_text) 
 

In [34]:
# functions to display images

from PIL import Image
from IPython.display import display


fileDirectory = 'D:\\Wiebke Petersen\\Downloads\\AdMIRe Subtask A Train\\train'

# Open the image file
def display_image(compound, fn):
    img = Image.open(fileDirectory + "\\" + compound + "\\" + fn)
    new_size = (150, 150)  # Width, Height
    img_resized = img.resize(new_size)  
    # Display the image
    display(img_resized)

# returns list of image names sorted from image1 to image5
def get_image_names(n,mydata):
    names = []
    for i in [1,2,3,4,5]:
         names.append(mydata['image' + str(i) + '_name'][n])
    return names

# print information of 1 item:

def print_item(n, mydata):
    # print  'sentence_type', 'sentence'
    compound = mydata['compound'][n]
    print(compound)
    print(mydata['sentence_type'][n])
    print(mydata['sentence'][n])
    print('---------------------------------------------------------------------')


    # for image_names in 'expected_order' print image_captions
    names  =  get_image_names(n,mydata)
    expected_order = mydata['expected_order'][n]
    print(expected_order)

    for image_name in expected_order:
        display_image(compound, image_name)
        # get index of image_name in names
        index = names.index(image_name) + 1
        print(mydata['image'+str(index)+'_caption'][n])
    

In [35]:
# pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [36]:
#  SBert embeddings are generated  for all sentence like columns
for type in sentence_type_columns:
    dataA[type + "_sbert_embedding"] = dataA[type].apply(lambda x: model.encode(x))

In [37]:
import pickle
if preprocessed == True:
    prep = "_preprocessed_"
else:
    prep = "_"

dataA.to_pickle("dataA_sbert"+ prep  + ".pkl")


## Make Predictions 

In [38]:
import numpy as np
import pandas as pd

#preprocessed = False
preprocessed = True # better results

if preprocessed == True:
    prep = "_preprocessed_"
else:
    prep = "_"

dataA = pd.read_pickle("dataA_sbert"+ prep + ".pkl")

f = open('results_rankings.txt', 'a')
f.write("\n"+ "=====================================================================")
f.write("\n" + "Ranking results")
if prep == "_preprocessed_":
    f.write("\n" +"preprocessed: True " + "noun, verb")
else: 
    f.write("\n" +"preprocessed: False")
f.close()



In [39]:
# generates submission file from column for subset
from zipfile import ZipFile
def make_submission(dataA, column, subset):
    subset_data = only_subset(dataA,subset)
    submission_df = pd.DataFrame()
    submission_df["compound"] = subset_data["compound"]
    submission_df["expected_order"] = subset_data[column]
    submission_df.to_csv("submission_EN.tsv", sep="\t", index=False)
    ZipFile('submission_EN.zip', 'w').write('submission_EN.tsv')
    print("File zipped and saved as submission_EN.zip")




In [13]:
from scipy.stats import spearmanr
import numpy as np


# evaluation functions for ranked orders
def top1accuracy(pred_rankings,expected_order):
    pred_rankings = pred_rankings.to_list()
    expected_order = expected_order.to_list()
    correct = 0
    for i in range(len(pred_rankings)):
        if pred_rankings[i][0] == expected_order[i][0]:
            correct += 1
    return round(correct/len(pred_rankings),3)


def spearman_correlation(pred_rankings,expected_order):
    pred_rankings = pred_rankings.to_list()
    expected_order = expected_order.to_list()
    corr = []
    for i in range(len(pred_rankings)):
        corr.append(spearmanr(pred_rankings[i],expected_order[i]).correlation)
    return round(np.mean(corr),3)


In [14]:
def sim_scores(current, comparator):
    # input = current line(example) & embeddings for sentence + captions
    
    scores = {}
    # keys = image names
    # values = scores
    embeddings = [current[comparator+ "_sbert_embedding"], 
                 current["image1_caption_sbert_embedding"],
                 current["image2_caption_sbert_embedding"],
                 current["image3_caption_sbert_embedding"],
                 current["image4_caption_sbert_embedding"],
                 current["image5_caption_sbert_embedding"]]

    #embeddings = model.encode(sentences)
    similarities = model.similarity(embeddings[0], embeddings)
    # compares the embedding for the sentence including the compound 
    # with each of the embeddings, including itself and all the captions

    # [0][x] required because similarities tensor has additional layer
    score1 = similarities[0][1].item()
    scores[current["image1_name"]] = score1

    score2 = similarities[0][2].item()
    scores[current["image2_name"]] = score2

    score3 = similarities[0][3].item()
    scores[current["image3_name"]] = score3

    score4 = similarities[0][4].item()
    scores[current["image4_name"]] = score4

    score5 = similarities[0][5].item()
    scores[current["image5_name"]] = score5
    
    return scores

In [15]:
def rank_images(scores):
    ranking = []
    # scores = dictionary containing the cos similarity scores
    # comparing the sentence with the captions of the five images
    # keys = image names
    # values = scores

    for i in range(5):
        # find key which corresponds to the highest value
        m = max(scores, key=scores.get)
        # add the key (image name) to the ranking
        ranking.append(m)
        # delete the entry in the dictionary
        del scores[m]

    return ranking

In [16]:
def dependent_preds(current, comparator):
    pred = current["binary_pred"]
    return rank_images(sim_scores(current, "gpt_" + pred + "_" + comparator ))

In [17]:
dataA["pred_order"] = dataA.apply(lambda x: rank_images(sim_scores(x, "sentence")), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images by similarity to original sentence):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order"]))

make_submission(dataA,"pred_order", "Dev")

f = open('results_rankings.txt', 'a')
f.write("\n" "-------------------------------------------------")
f.write("\n" + "Evaluation on training data (rank image captions by similarity to original sentence):")
f.write("\n" + "top1 accuracy " +  str(top1accuracy(dataA_train["expected_order"],  dataA_train["pred_order"])))
f.write("\n" +"spearman correlation " + str(spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order"])))
f.close()


Evaluation on training data (rank images by similarity to original sentence):
top1 accuracy 0.414
spearman correlation 0.164
File zipped and saved as submission_EN.zip


  a = torch.tensor(a)


In [18]:
for sent_type in ['sentence', 'meaning','image']:
    print("\nDepending on binary prediction (literal/idiomatic) rank images by similarity to gpt_" + sent_type)
    dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds(x,sent_type), axis=1)

    dataA_train = only_train(dataA)
    print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
    print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
    f = open('results_rankings.txt', 'a')
    f.write("\n" "-------------------------------------------------")
    f.write("\n" + "\nDepending on binary prediction (literal/idiomatic) rank images by similarity to gpt_" + sent_type)
    f.write("\n" + "top1 accuracy " +  str(top1accuracy(dataA_train["expected_order"],  dataA_train["pred_order_dependent"])))
    f.write("\n" +"spearman correlation " + str(spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"])))
    f.close()



Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_sentence
top1 accuracy 0.257
spearman correlation 0.11

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_meaning
top1 accuracy 0.386
spearman correlation 0.183

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_image
top1 accuracy 0.514
spearman correlation 0.143


In [19]:
sent_type = "image"
dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds(x,sent_type), axis=1)
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
make_submission(dataA,"pred_order", "Dev")

top1 accuracy 0.514
spearman correlation 0.143
File zipped and saved as submission_EN.zip


In [20]:
# sim scores

def dependent_preds_compare_pairs(current, sent_type):
       preds = [0 for i in range(5)]
       scores_lit = sim_scores(current, "gpt_literal_" + sent_type)
       scores_id = sim_scores(current, "gpt_idiomatic_" + sent_type)
       image_names = list(scores_lit.keys())
       # get highest scoring image for literal and idiomatic
       m_lit = max(scores_lit,key=scores_lit.get)
       preds[0] = m_lit

       del scores_id[m_lit]
       m_id = max(scores_id, key=scores_id.get)
       preds[3] = m_id
    
       m_lit_index = image_names.index(m_lit)
       m_id_index = image_names.index(m_id)
       scores_images_lit = sim_scores(current, "image" + str(m_lit_index + 1) + "_caption")
       scores_images_id = sim_scores(current, "image" + str(m_id_index +1) + "_caption")
       del scores_images_lit[m_lit]
       del scores_images_lit[m_id]
       del scores_images_id[m_lit]
       del scores_images_id[m_id]

       sim_max_lit = max(scores_images_lit, key=scores_images_lit.get)
       preds[1] = sim_max_lit
    
       del scores_images_id[sim_max_lit]
       sim_max_id = max(scores_images_id, key=scores_images_id.get)
       preds[2] = sim_max_id
       preds[4] = list(set(image_names).difference(set([m_lit,m_id,sim_max_lit,sim_max_id])))[0]
       if not(set(preds) == set(image_names)):
           print("there is some serious problem") 
       if current["binary_pred"] == "idiomatic":
          preds_new = [preds[i] for i in [3,2,1,0,4]]
          preds = preds_new
       return preds


In [21]:
dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds_compare_pairs(x,"image"), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
make_submission(dataA,"pred_order_dependent", "Dev")

f = open('results_rankings.txt', 'a')
f.write("\n" "-------------------------------------------------")
f.write("\n" + "Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):")
f.write("\n" + "top1 accuracy " +  str(top1accuracy(dataA_train["expected_order"],  dataA_train["pred_order_dependent"])))
f.write("\n" +"spearman correlation " + str(spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"])))
f.close()


dataA_trainext = dataA[dataA["sentence_type"].notnull()] 
print("on extended data")
print("top1 accuracy", top1accuracy(dataA_trainext["expected_order"], dataA_trainext["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_trainext["expected_order"], dataA_trainext["pred_order_dependent"]))


Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):
top1 accuracy 0.543
spearman correlation 0.381
File zipped and saved as submission_EN.zip
on extended data
top1 accuracy 0.271
spearman correlation 0.182


In [22]:
# sim scores

def dependent_preds_no_pairs(current, sent_type):
    preds = [0 for i in range(5)]
    scores_lit = sim_scores(current, "gpt_literal_" + sent_type)
    scores_id = sim_scores(current, "gpt_idiomatic_" + sent_type)
    image_names = list(scores_lit.keys())
    # get highest scoring image for literal and idiomatic
    m_lit = max(scores_lit,key=scores_lit.get)
    preds[0] = m_lit

    del scores_id[m_lit]
    m_id = max(scores_id, key=scores_id.get)
    preds[3] = m_id

    del scores_lit[m_lit]
    del scores_lit[m_id]

    m_lit_second = max(scores_lit, key=scores_lit.get)
    preds[1] = m_lit_second

    del scores_id[m_id]
    del scores_id[m_lit_second]
    m_id_second = max(scores_id, key=scores_id.get)
    preds[2] = m_id_second
    preds[4] = list(set(image_names).difference(set([m_lit,m_id,m_lit_second,m_id_second])))[0]
    if not(set(preds) == set(image_names)):
           print("there is some serious problem") 
    if current["binary_pred"] == "idiomatic":
          preds_new = [preds[i] for i in [3,2,1,0,4]]
          preds = preds_new
    return preds


In [23]:
dataA["pred_order_dependent"] = dataA.apply(lambda x: dependent_preds_no_pairs(x,"image"), axis=1)

dataA_train = only_train(dataA)
print("Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):")
print("top1 accuracy", top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]))
make_submission(dataA,"pred_order_dependent", "Dev")

f = open('results_rankings.txt', 'a')
f.write("\n" "-------------------------------------------------")
f.write("\n" + "Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):")
f.write("\n" + "top1 accuracy " +  str(top1accuracy(dataA_train["expected_order"],  dataA_train["pred_order_dependent"])))
f.write("\n" +"spearman correlation " + str(spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"])))
f.close()

print("on extended data")
print("top1 accuracy", top1accuracy(dataA_trainext["expected_order"], dataA_trainext["pred_order_dependent"]))
print("spearman correlation", spearman_correlation(dataA_trainext["expected_order"], dataA_trainext["pred_order_dependent"]))



Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):
top1 accuracy 0.543
spearman correlation 0.354
File zipped and saved as submission_EN.zip
on extended data
top1 accuracy 0.271
spearman correlation 0.182


In [24]:
def idiomatic_literal_prediction(current, sent_type):
    sims = model.similarity(current["sentence_sbert_embedding"], [current["gpt_idiomatic_" + sent_type + "_sbert_embedding"], current["gpt_literal_" + sent_type +  "_sbert_embedding"]])
    sims = sims.numpy()
    if np.argmax(sims[0]) == 0:
        return "idiomatic"
    else:
        return "literal"


In [25]:
f = open('results_rankings.txt', 'a')
f.write("\n" "-------------------------------------------------")
f.write("\n" + "binary classification literal/idiomatic with SBERT embeddings:")
for sent_type in ["image", "sentence", "meaning"]:
    dataA["binary_pred_sbert"] = dataA.apply(lambda x: idiomatic_literal_prediction(x, sent_type), axis = 1)
    dataA_train = only_train(dataA)
    dataA_trainext = dataA[dataA["sentence_type"].notnull()] 
    print("accuracy",sent_type,accuracy_score(dataA_train["binary_pred_sbert"], dataA_train["sentence_type"]))
    f.write("\n" + sent_type)
    f.write("\n" + "top1 accuracy " +  str(top1accuracy(dataA_train["binary_pred_sbert"],  dataA_train["sentence_type"])))
    print("on extended accuracy",sent_type,accuracy_score(dataA_trainext["binary_pred_sbert"], dataA_trainext["sentence_type"]))

f.close()

accuracy image 0.5428571428571428
on extended accuracy image 0.5857142857142857
accuracy sentence 0.7428571428571429
on extended accuracy sentence 0.7642857142857142
accuracy meaning 0.6285714285714286
on extended accuracy meaning 0.7


In [26]:
dataA["subset"].value_counts()

Extended Evaluation    100
Train                   60
Dev                     15
Test                    15
Sample                  10
Name: subset, dtype: int64