# Make predictions

Requires the pickled data (`.pkl` files) produced by the notebook `subtaskA_compute_embeddings.ipynb`.

## Load data and investigate

In [1]:
import csv
import sys
import ast
import pandas as pd
import numpy as np

# Parameter setting
# These module-level names select which pickled embedding file the first
# load_pkl() call below reads; later cells reassign `preprocessed` and reload.

# model
# Two checkpoint identifiers; judging by the names, "ptlm" is the pre-trained
# and "ftlm" a fine-tuned language model -- TODO confirm.
checkpoint_ptlm = 'bert-base-uncased' 
checkpoint_ftlm ='jlsalim/bert-uncased-idiomatic-literal-recognizer'

# Checkpoint used for the initial load; it becomes part of the pkl file name.
checkpoint = checkpoint_ptlm

# full sentences or preprocessed (filtered to content words of specific POS)
preprocessed = False 
#preprocessed = True

# True selects the pkl file whose name carries the "_without_CLS_SEP" suffix.
#remove_CLS_SEP = True 
remove_CLS_SEP = False


# load pkl-file
import pickle
def load_pkl(preprocessed, checkpoint, remove_CLS_SEP):
    """Load one pickled embedding DataFrame and normalise ``expected_order``.

    The file name is built as
    ``"dataA" + ("_preprocessed_" | "_") + <checkpoint> + ("_without_CLS_SEP" | "") + ".pkl"``,
    where slashes and backslashes in ``checkpoint`` are replaced by ``_``.
    The file is read relative to the current working directory.

    Parameters
    ----------
    preprocessed : bool
        ``True`` selects the ``_preprocessed_`` variant of the file.
    checkpoint : str
        Model checkpoint name that is part of the file name.
    remove_CLS_SEP : bool
        ``True`` selects the ``_without_CLS_SEP`` variant of the file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame; every ``expected_order`` cell that was stored as a
        string is parsed into a Python object with ``ast.literal_eval``.
    """
    prep = "_preprocessed_" if preprocessed == True else "_"
    cls_sep = "_without_CLS_SEP" if remove_CLS_SEP == True else ""

    checkpoint_write = checkpoint.replace("/", "_").replace("\\", "_")
    # NOTE: unpickling executes arbitrary code -- only load files you created yourself.
    dataA = pd.read_pickle("dataA" + prep + checkpoint_write + cls_sep + ".pkl")

    # some data cleaning
    # Parse string-encoded rankings in one column-wise assignment. This avoids
    # chained indexing (dataA[col][i] = ...), which warns, does nothing under
    # pandas Copy-on-Write, and fails when the index is not 0..n-1.
    dataA["expected_order"] = dataA["expected_order"].map(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    return dataA

# Initial load with the parameter settings chosen at the top of this cell.
dataA = load_pkl(preprocessed, checkpoint, remove_CLS_SEP)

### Data analysis

In [2]:
# Number of data items per subset.
dataA["subset"].value_counts()

Extended Evaluation    100
Train                   60
Dev                     15
Test                    15
Sample                  10
Name: subset, dtype: int64

In [3]:
# Number of data items per sentence type (idiomatic vs. literal).
dataA['sentence_type'].value_counts()

idiomatic    100
literal      100
Name: sentence_type, dtype: int64

### Data handling functions

In [4]:
def only_train(dataA):
    """Return the rows of the "Sample" subset followed by those of the "Train" subset.

    These are the data items that have literal/idiomatic information given.
    """
    parts = [dataA[dataA["subset"] == label] for label in ("Sample", "Train")]
    return pd.concat(parts)

def only_subset(dataA, subset):
    """Return the rows of ``dataA`` whose "subset" column equals ``subset``."""
    in_subset = dataA["subset"] == subset
    return dataA.loc[in_subset]

In [5]:
# generates submission file from column for subset
from zipfile import ZipFile
def make_submission(dataA, column, subset):
    """Write the rankings stored in ``column`` for one subset as a zipped TSV.

    Creates ``submission_<tag>.tsv`` with the columns ``compound`` and
    ``expected_order`` in the working directory and packs it into
    ``submission_<tag>.zip``. "Dev" and "Test" share the tag ``EN``, so they
    write to the same file names and overwrite each other.

    Parameters
    ----------
    dataA : pandas.DataFrame
        Frame with the columns ``subset``, ``compound`` and ``column``.
    column : str
        Name of the column whose values are written as ``expected_order``.
    subset : str
        One of "Dev", "Extended Evaluation" or "Test".

    Raises
    ------
    ValueError
        If ``subset`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``name``).
    """
    file_tags = {"Dev": "EN", "Extended Evaluation": "xe", "Test": "EN"}
    if subset not in file_tags:
        raise ValueError(
            "Unknown subset for submission: {!r} (expected one of {})".format(
                subset, sorted(file_tags)))

    full_name = "submission_" + file_tags[subset]
    subset_data = only_subset(dataA,subset)
    submission_df = pd.DataFrame()
    submission_df["compound"] = subset_data["compound"]
    submission_df["expected_order"] = subset_data[column]
    submission_df.to_csv(full_name + ".tsv", sep="\t", index=False)
    # The context manager closes the archive, so the zip is complete on disk
    # as soon as the function returns.
    with ZipFile(full_name + '.zip', 'w') as archive:
        archive.write(full_name + '.tsv')
    print("File zipped and saved as " +  full_name + ".zip")




In [6]:
# functions to display images

from PIL import Image
from IPython.display import display


fileDirectory = 'D:\\Wiebke Petersen\\Downloads\\AdMIRe Subtask A Train\\train' # set to your directory with the images

def display_image(compound, fn):
    """Show the image ``fn`` of ``compound`` as a 150x150 thumbnail.

    The file is read from ``<fileDirectory>/<compound>/<fn>``.

    Parameters
    ----------
    compound : str
        Name of the compound; used as the sub-directory name.
    fn : str
        File name of the image inside that sub-directory.
    """
    import os  # local import: the notebook does not import os at the top

    # os.path.join uses the separator of the running platform instead of a
    # hard-coded backslash.
    image_path = os.path.join(fileDirectory, compound, fn)
    # The context manager closes the file handle once the resized copy exists.
    with Image.open(image_path) as img:
        img_resized = img.resize((150, 150))  # (width, height)
    # Display the image
    display(img_resized)

def get_image_names(n,mydata):
    """Return the image names of item ``n``, ordered from image1 to image5."""
    return [mydata['image' + str(slot) + '_name'][n] for slot in range(1, 6)]


In [7]:
# print information of 1 item:

def print_item(n, mydata):
    """Print and display everything known about item ``n`` of ``mydata``.

    Output order: compound, sentence type, sentence, a separator line, the
    expected image order, and then each image of that order (thumbnail
    followed by its caption).
    """
    compound = mydata['compound'][n]
    print(compound)
    for field in ('sentence_type', 'sentence'):
        print(mydata[field][n])
    print('---------------------------------------------------------------------')

    # image names in slot order (image1..image5), used to find each caption
    slot_names = get_image_names(n,mydata)
    expected_order = mydata['expected_order'][n]
    print(expected_order)

    for image_name in expected_order:
        display_image(compound, image_name)
        # slots are 1-based in the column names
        slot = slot_names.index(image_name) + 1
        caption_column = 'image'+str(slot)+'_caption'
        print(mydata[caption_column][n])
    

In [8]:
#dataA_train = only_train(dataA)
#print_item(10,dataA_train)

## Functions


### Evaluation functions

In [9]:
from scipy.stats import spearmanr
import numpy as np


# evaluation functions for ranked orders
def top1accuracy(pred_rankings,expected_order):
    """Share of items whose first predicted entry equals the first expected entry.

    Both arguments are pandas Series of rankings (one list per item); the
    result is rounded to four decimals.
    """
    predicted = pred_rankings.to_list()
    gold = expected_order.to_list()
    hits = sum(1 for i in range(len(predicted)) if predicted[i][0] == gold[i][0])
    return round(hits/len(predicted),4)


def spearman_correlation(pred_rankings,expected_order):
    """Mean Spearman correlation between predicted and expected rankings.

    Both arguments are pandas Series of rankings (one list per item). Each
    pair of lists is handed to ``scipy.stats.spearmanr`` as is, and the mean
    of the per-item correlations is rounded to four decimals.

    NOTE(review): the list elements themselves are correlated. When the lists
    hold image names rather than rank positions, the value depends on how the
    names sort -- confirm this is the intended metric.
    """
    predicted = pred_rankings.to_list()
    gold = expected_order.to_list()
    per_item = [spearmanr(predicted[i], gold[i]).correlation for i in range(len(predicted))]
    return round(np.mean(per_item),4)


### Similarity functions

In [10]:
# similarity functions
# Name of the measure that similarity() dispatches on;
# supported values: "cosine_similarity" and "manhattan".
sim_function = "cosine_similarity" # "manhattan" "cosine_similarity"
#sim_function = "manhattan" # "cosine_similarity"


def cosine_similarity(u, v):
    """Cosine of the angle between ``u`` and ``v`` (dot product over the product of norms)."""
    a, b = np.array(u), np.array(v)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

def manhattan(u, v):
    """Negated Manhattan (L1) distance, so that larger values mean "more similar"."""
    difference = np.array(u) - np.array(v)
    return -np.abs(difference).sum()

def similarity(u,v):
    """Similarity of two embeddings under the measure named by ``sim_function``.

    Raises ``ValueError`` when ``sim_function`` names no known measure.
    """
    if sim_function == "cosine_similarity":
        return cosine_similarity(u,v)
    if sim_function == "manhattan":
        return manhattan(u,v)
    raise ValueError("Unknown similarity function: {}".format(sim_function))


def compare(emb,emb0,emb1):
    """Return 0 if ``emb`` is strictly more similar to ``emb0`` than to ``emb1``, else 1.

    Ties therefore resolve to 1.
    """
    return 0 if similarity(emb,emb0) > similarity(emb,emb1) else 1

def binary2values(list,val0,val1):
    """Map every element equal to 1 to ``val1`` and every other element to ``val0``."""
    return [val1 if flag == 1 else val0 for flag in list]


In [11]:
def _caption_scores(current, reference_embedding, method):
    """Score the five image captions of one row against ``reference_embedding``.

    Returns a dictionary in image1..image5 order whose keys are the image
    names and whose values are the similarity scores as plain Python numbers.
    """
    scores = {}
    for slot in range(1, 6):
        caption_embedding = current["image" + str(slot) + "_caption_embedding_" + method]
        # .item() converts the numpy scalar returned by similarity() to a Python number
        scores[current["image" + str(slot) + "_name"]] = similarity(reference_embedding, caption_embedding).item()
    return scores


def sim_scores(current, comparator,method):
    """Compare the image-caption embeddings of one row with a comparator embedding.

    Parameters
    ----------
    current : one data row (example) holding the embeddings for the sentence and the captions
    comparator : prefix of the comparator column; the embedding is read from
        ``current[comparator + "_embedding_" + method]``
    method : name of the embedding variant (suffix of the embedding columns)

    Returns
    -------
    dict
        keys = image names, values = similarity scores
    """
    return _caption_scores(current, current[comparator+ "_embedding_" + method], method)


def sim_scores_compound(current, comparator,method):
    """Compare the image-caption embeddings of one row with a compound embedding.

    Same as ``sim_scores`` except that the comparator is the embedding of the
    compound inside the comparator text, read from
    ``current["compound_embedding_" + comparator + "_" + method]``.

    Returns
    -------
    dict
        keys = image names, values = similarity scores
    """
    return _caption_scores(current, current["compound_embedding_" + comparator + "_" +  method], method)

### Ranking functions

In [12]:
# basic ranking by similarity scores (most similar first)
# used as baseline ranker in paper 
def rank_images(scores):
    """Return the image names sorted by similarity score, most similar first.

    Parameters
    ----------
    scores : dict
        keys = image names, values = similarity scores of the image captions
        to some comparator embedding. The dictionary is left unchanged.

    Returns
    -------
    list
        All keys of ``scores`` in descending score order. Images with equal
        scores keep the order in which they appear in ``scores``.
    """
    # sorted() is stable (also with reverse=True), which reproduces the
    # tie-break of repeatedly taking max() and deleting the winner, without
    # emptying the caller's dictionary or assuming exactly five entries.
    return sorted(scores, key=scores.get, reverse=True)

## Experiment 1 (only given data, no GPT-generated additional data used)

In [13]:
# Experiment 1, first part: full sentences (no preprocessing), pkl of
# checkpoint_ptlm, variant without the "_without_CLS_SEP" suffix.
preprocessed = False 
dataA = load_pkl(preprocessed, checkpoint_ptlm, False)

In [14]:
# Text whose embedding the image captions are compared with.
comparator = "sentence"

# Embedding variants evaluated for the sentence embedding.
methods = [
    'meanSecondToLast',
    'meanLast4',
    'meanLast',
    #'meanFirst',
    #'firstCLS',
    'lastCLS',
    'sbert']
# Embedding variants evaluated for the compound embedding.
methods_compound = [
    'meanSecondToLast',
    'meanLast4',
    'meanLast',
    #'meanFirst'
    ]
# "train" evaluates on Sample + Train (only_train); any other value is
# passed to only_subset() as the subset name.
subset = "train"

for heading, score_fn, method_names in [
        ("Rank images by similarity to original sentence:", sim_scores, methods),
        # compare to compound embedding
        ("\nRank images by similarity to compound embedding:", sim_scores_compound, methods_compound)]:
    print(heading)
    for method in method_names:
        # rank the images of every row, then evaluate on the chosen subset only
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(score_fn(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)

        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))

Rank images by similarity to original sentence:
Method:  meanSecondToLast on train
top1 0.3714  spearman 0.0686
Method:  meanLast4 on train
top1 0.3571  spearman 0.0971
Method:  meanLast on train
top1 0.4  spearman 0.0071
Method:  lastCLS on train
top1 0.2143  spearman -0.0129
Method:  sbert on train
top1 0.4  spearman 0.2014

Rank images by similarity to compound embedding:
Method:  meanSecondToLast on train
top1 0.2571  spearman 0.0871
Method:  meanLast4 on train
top1 0.2286  spearman 0.0743
Method:  meanLast on train
top1 0.2571  spearman 0.0314


In [15]:
# Same ranking as above (sentence embedding), evaluated separately on the
# literal and on the idiomatic sentences of the chosen subset.
for heading, wanted_type in [("only on literal sentences", "literal"),
                             ("\nonly on idiomatic sentences", "idiomatic")]:
    print(heading)
    for method in methods:
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(sim_scores(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)
        mydata = mydata[mydata["sentence_type"] == wanted_type]
        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))

only on literal sentences
Method:  meanSecondToLast on train
top1 0.6129  spearman 0.1968
Method:  meanLast4 on train
top1 0.5484  spearman 0.1903
Method:  meanLast on train
top1 0.5484  spearman 0.1452
Method:  lastCLS on train
top1 0.2903  spearman 0.1161
Method:  sbert on train
top1 0.6774  spearman 0.3065

only on idiomatic sentences
Method:  meanSecondToLast on train
top1 0.1795  spearman -0.0333
Method:  meanLast4 on train
top1 0.2051  spearman 0.0231
Method:  meanLast on train
top1 0.2821  spearman -0.1026
Method:  lastCLS on train
top1 0.1538  spearman -0.1154
Method:  sbert on train
top1 0.1795  spearman 0.1179


In [16]:
# Ranking by similarity to the compound embedding, evaluated separately on
# the literal and on the idiomatic sentences of the chosen subset.
for heading, wanted_type in [("compound embedding only on literal sentences", "literal"),
                             ("\ncompound embedding only on idiomatic sentences", "idiomatic")]:
    print(heading)
    for method in methods_compound:
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(sim_scores_compound(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)
        mydata = mydata[mydata["sentence_type"] == wanted_type]
        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))

compound embedding only on literal sentences
Method:  meanSecondToLast on train
top1 0.5161  spearman 0.1774
Method:  meanLast4 on train
top1 0.4839  spearman 0.2387
Method:  meanLast on train
top1 0.4839  spearman 0.2097

compound embedding only on idiomatic sentences
Method:  meanSecondToLast on train
top1 0.0513  spearman 0.0154
Method:  meanLast4 on train
top1 0.0256  spearman -0.0564
Method:  meanLast on train
top1 0.0769  spearman -0.1103


**with preprocessing**

In [17]:
# Experiment 1, second part: switch to the "_preprocessed_" pkl of checkpoint_ptlm.
preprocessed = True
dataA = load_pkl(preprocessed, checkpoint_ptlm, False)

In [18]:

# Embedding variants evaluated for the compound embedding.
methods_compound = [
    'meanSecondToLast',
    'meanLast4',
    'meanLast',
    #'meanFirst'
    ]

for heading, score_fn, method_names in [
        ("Rank images by similarity to original sentence:", sim_scores, methods),
        # compare to compound embedding
        ("\nRank images by similarity to compound embedding:", sim_scores_compound, methods_compound)]:
    print(heading)
    for method in method_names:
        # rank the images of every row, then evaluate on the chosen subset only
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(score_fn(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)

        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))

Rank images by similarity to original sentence:
Method:  meanSecondToLast on train
top1 0.3286  spearman 0.0771
Method:  meanLast4 on train
top1 0.3143  spearman 0.0943
Method:  meanLast on train
top1 0.3571  spearman 0.0971
Method:  lastCLS on train
top1 0.2714  spearman 0.0543
Method:  sbert on train
top1 0.4571  spearman 0.2057

Rank images by similarity to compound embedding:
Method:  meanSecondToLast on train
top1 0.3143  spearman 0.1729
Method:  meanLast4 on train
top1 0.3143  spearman 0.1114
Method:  meanLast on train
top1 0.2857  spearman 0.0857


In [19]:
# Evaluate only on the idiomatic sentences: first the sentence embedding,
# then the compound embedding.
for heading, score_fn, method_names in [
        ("\nonly on idiomatic sentences", sim_scores, methods),
        ("\ncompound embedding only on idiomatic sentences", sim_scores_compound, methods_compound)]:
    print(heading)
    for method in method_names:
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(score_fn(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)
        mydata = mydata[mydata["sentence_type"] == "idiomatic"]
        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))


only on idiomatic sentences
Method:  meanSecondToLast on train
top1 0.2051  spearman 0.1026
Method:  meanLast4 on train
top1 0.2051  spearman 0.1051
Method:  meanLast on train
top1 0.2564  spearman 0.0538
Method:  lastCLS on train
top1 0.1795  spearman 0.0205
Method:  sbert on train
top1 0.2051  spearman 0.1333

compound embedding only on idiomatic sentences
Method:  meanSecondToLast on train
top1 0.1026  spearman 0.0949
Method:  meanLast4 on train
top1 0.1026  spearman 0.0359
Method:  meanLast on train
top1 0.1282  spearman -0.0487


In [20]:
# Evaluate only on the literal sentences: first the sentence embedding,
# then the compound embedding.
for heading, score_fn, method_names in [
        ("\nonly on literal sentences", sim_scores, methods),
        ("\ncompound embedding only on literal sentences", sim_scores_compound, methods_compound)]:
    print(heading)
    for method in method_names:
        dataA["pred_order"] = dataA.apply(lambda row: rank_images(score_fn(row, comparator, method)), axis=1)
        mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)
        mydata = mydata[mydata["sentence_type"] == "literal"]
        print("Method: ", method, "on", subset)
        # metric signatures are (pred_rankings, expected_order)
        print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
              " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))


only on literal sentences
Method:  meanSecondToLast on train
top1 0.4839  spearman 0.0452
Method:  meanLast4 on train
top1 0.4516  spearman 0.0806
Method:  meanLast on train
top1 0.4839  spearman 0.1516
Method:  lastCLS on train
top1 0.3871  spearman 0.0968
Method:  sbert on train
top1 0.7742  spearman 0.2968

compound embedding only on literal sentences
Method:  meanSecondToLast on train
top1 0.5806  spearman 0.271
Method:  meanLast4 on train
top1 0.5806  spearman 0.2065
Method:  meanLast on train
top1 0.4839  spearman 0.2548


## Experiment 2 (using GPT data and idiomatic/literal gold label)

In [21]:
# Experiment 2 uses the "_preprocessed_" pkl of checkpoint_ptlm.
preprocessed = True
dataA = load_pkl(preprocessed, checkpoint_ptlm, False)


In [22]:
def get_comparator_prefix(sentence_type, gpt_type):
    """Build the column prefix ``gpt_<sentence_type>_<gpt_type>`` of a GPT-generated text."""
    return "_".join(("gpt", sentence_type, gpt_type))

# Kinds of GPT-generated text that can serve as comparator.
gpt_types = ["image","meaning","meaning_cutted","sentence"]

# For every row the comparator is the GPT text that matches the row's gold
# sentence_type (literal/idiomatic). First the text embedding is used, then
# the embedding of the compound inside that text.
for heading, score_fn, type_names, method_names in [
        ("Rank images by similarity (based on gold label):", sim_scores, gpt_types, ["sbert","meanLast"]),
        # compare to compound embedding
        ("\nRank images by similarity to compound embedding:", sim_scores_compound, ["meaning","sentence"], ["meanLast4"])]:
    print(heading)
    for gpt_type in type_names:
        print("\n",gpt_type)
        for method in method_names:
            dataA["pred_order"] = dataA.apply(
                lambda row: rank_images(
                    score_fn(row, get_comparator_prefix(row["sentence_type"],gpt_type), method)),
                axis=1)
            mydata = only_train(dataA) if subset == "train" else only_subset(dataA,subset)

            print("Method: ", method, "on", subset)
            # metric signatures are (pred_rankings, expected_order)
            print("top1", top1accuracy(mydata["pred_order"], mydata["expected_order"]),
                  " spearman", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))

Rank images by similarity (based on gold label):

 image
Method:  sbert on train
top1 0.6143  spearman 0.1586
Method:  meanLast on train
top1 0.4857  spearman 0.1371

 meaning
Method:  sbert on train
top1 0.4429  spearman 0.2914
Method:  meanLast on train
top1 0.4714  spearman 0.1743

 meaning_cutted
Method:  sbert on train
top1 0.5286  spearman 0.1857
Method:  meanLast on train
top1 0.5143  spearman 0.1257

 sentence
Method:  sbert on train
top1 0.3857  spearman 0.1471
Method:  meanLast on train
top1 0.3571  spearman 0.1386

Rank images by similarity to compound embedding:

 meaning
Method:  meanLast4 on train
top1 0.3429  spearman 0.0743

 sentence
Method:  meanLast4 on train
top1 0.2571  spearman 0.1329


## Experiment 3 (Binary classification idiomatic/literal)

##### sentence embedding versus gpt sentence/meaning embedding

In [23]:
# Experiment 3 goes back to the non-preprocessed pkl of checkpoint_ptlm.
preprocessed = False
dataA = load_pkl(preprocessed, checkpoint_ptlm, False)

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# The classifier is evaluated on Sample + Train (see only_train).
dataA_train = only_train(dataA)

# embedding variants available for the compound embedding ('meanFirst' is left out)
compound_methods = ['meanSecondToLast', 'meanLast4', 'meanLast']
# embedding variants available for whole texts ('meanFirst' is left out)
methods = ['meanSecondToLast', 'meanLast4', 'meanLast', 'lastCLS', 'sbert']

# systematic analysis sentence compared to gpt sentence embeddings
gpt_sents = ["sentence", "meaning", "meaning_cutted", "image"]

print("sentence compared to gpt sentence embeddings (classification accuracy)")

for sent in gpt_sents:
    for m in methods:
        variant = sent + "_embedding_" + m
        pred_column = "pred_sentence_embedding_compared_to_gpt_" + variant
        # 0 = closer to the literal GPT text, 1 = closer to the idiomatic one
        pred = dataA.apply(lambda row: compare(row["sentence_embedding_" + m],
                                               row["gpt_literal_" + variant],
                                               row["gpt_idiomatic_" + variant]), axis=1)
        pred = binary2values(pred,"literal","idiomatic")
        dataA[pred_column] = pred

        # evaluate
        dataA_train = only_train(dataA)
        pred = dataA_train[pred_column]
        y = dataA_train["sentence_type"]
        print(round(accuracy_score(y, pred),3),variant)
    print("")



sentence compared to gpt sentence embeddings (classification accuracy)
0.829 sentence_embedding_meanSecondToLast
0.814 sentence_embedding_meanLast4
0.857 sentence_embedding_meanLast
0.829 sentence_embedding_lastCLS
0.786 sentence_embedding_sbert

0.729 meaning_embedding_meanSecondToLast
0.714 meaning_embedding_meanLast4
0.757 meaning_embedding_meanLast
0.657 meaning_embedding_lastCLS
0.814 meaning_embedding_sbert

0.743 meaning_cutted_embedding_meanSecondToLast
0.7 meaning_cutted_embedding_meanLast4
0.757 meaning_cutted_embedding_meanLast
0.586 meaning_cutted_embedding_lastCLS
0.571 meaning_cutted_embedding_sbert

0.757 image_embedding_meanSecondToLast
0.771 image_embedding_meanLast4
0.786 image_embedding_meanLast
0.614 image_embedding_lastCLS
0.6 image_embedding_sbert



In [25]:
# classification report for winning system
sent = "sentence"
m = "meanLast"
variant = sent + "_embedding_" + m
pred_column = "pred_sentence_embedding_compared_to_gpt_" + variant

# 0 = closer to the literal GPT text, 1 = closer to the idiomatic one
pred = dataA.apply(lambda row: compare(row["sentence_embedding_" + m],
                                       row["gpt_literal_" + variant],
                                       row["gpt_idiomatic_" + variant]), axis=1)
pred = binary2values(pred,"literal","idiomatic")
dataA[pred_column] = pred

# evaluate
dataA_train = only_train(dataA)
pred = dataA_train[pred_column]
print(variant)
y = dataA_train["sentence_type"]
print(classification_report(y, pred))


sentence_embedding_meanLast
              precision    recall  f1-score   support

   idiomatic       0.87      0.87      0.87        39
     literal       0.84      0.84      0.84        31

    accuracy                           0.86        70
   macro avg       0.86      0.86      0.86        70
weighted avg       0.86      0.86      0.86        70



##### compound in sentence embedding versus compound in gpt sentence/meaning embedding

In [26]:
# compare compound embeddings
gpt_sents = ["sentence", "meaning"]

for sent in gpt_sents:
    for m in compound_methods:
        suffix = sent + "_" + m
        pred_column = "pred_compound_embedding_sentence_compared_to_compound_embedding_gpt_" + suffix
        # 0 = closer to the compound in the literal GPT text, 1 = in the idiomatic one
        pred = dataA.apply(lambda row: compare(row["compound_embedding_sentence_" + m],
                                               row["compound_embedding_gpt_literal_" + suffix],
                                               row["compound_embedding_gpt_idiomatic_" + suffix]), axis=1)
        pred = binary2values(pred,"literal","idiomatic")
        dataA[pred_column] = pred

        # evaluate
        dataA_train = only_train(dataA)
        pred = dataA_train[pred_column]
        y = dataA_train["sentence_type"]
        print(round(accuracy_score(y, pred),3),sent + "_embedding_" + m)

        




0.9 sentence_embedding_meanSecondToLast
0.9 sentence_embedding_meanLast4
0.857 sentence_embedding_meanLast
0.743 meaning_embedding_meanSecondToLast
0.743 meaning_embedding_meanLast4
0.671 meaning_embedding_meanLast


In [27]:
# systematic analysis compound embeddings of winning systems
gpt_sents = ["sentence"]

print("compound in sentence compared to gpt compound in sentence embeddings")
for sent in gpt_sents:
    m = "meanLast4"
    # reuse the prediction column written by the compound comparison cell
    pred_column = "pred_compound_embedding_sentence_compared_to_compound_embedding_gpt_" + sent + "_" + m

    dataA_train = only_train(dataA)
    pred = dataA_train[pred_column]

    # evaluate
    print(sent + "_embedding_" + m)
    y = dataA_train["sentence_type"]
    print(round(accuracy_score(y, pred),3))
    print(classification_report(y, pred,zero_division=0))


compound in sentence compared to gpt compound in sentence embeddings
sentence_embedding_meanLast4
0.9
              precision    recall  f1-score   support

   idiomatic       0.92      0.90      0.91        39
     literal       0.88      0.90      0.89        31

    accuracy                           0.90        70
   macro avg       0.90      0.90      0.90        70
weighted avg       0.90      0.90      0.90        70



## Experiment 4 (ranking strategies) 

### Baseline ranker

In [28]:
def dependent_preds(current, comparator, method):
    """Rank the images of one row against the GPT text chosen by its binary prediction.

    The comparator column prefix is ``"gpt_" + current["binary_pred"] + "_" + comparator``.
    """
    gpt_comparator = "gpt_" + current["binary_pred"] + "_" + comparator
    scores = sim_scores(current, gpt_comparator, method)
    return rank_images(scores)

In [29]:
# select best classifier:
# the predictions of the compound-embedding comparison (GPT sentence,
# meanLast4) are copied into "binary_pred", the column dependent_preds() reads.

pred_all = dataA["pred_compound_embedding_sentence_compared_to_compound_embedding_gpt_sentence_meanLast4"] 
dataA["binary_pred"] = pred_all


#### Rank images by similarity to original sentence (baseline ranker, no GPT)

In [30]:
method = "sbert" # "lastCLS"
# baseline: rank the images of every row by similarity to the original sentence
dataA["pred_order"] = dataA.apply(lambda row: rank_images(sim_scores(row, "sentence", method)), axis=1)

dataA_train = only_train(dataA)
print("baseline ranker (rank images by similarity to original sentence):")
print("train")
# metric signatures are (pred_rankings, expected_order)
print("top1 accuracy", top1accuracy(dataA_train["pred_order"], dataA_train["expected_order"]))
print("spearman correlation", spearman_correlation(dataA_train["pred_order"], dataA_train["expected_order"]))


for s in ['Test', 'Extended Evaluation']:
    print(s)
    mydata = only_subset(dataA,s)
    print("top1 accuracy", top1accuracy(mydata["pred_order"], mydata["expected_order"]))
    print("spearman correlation", spearman_correlation(mydata["pred_order"], mydata["expected_order"]))


baseline ranker (rank images by similarity to original sentence):
train
top1 accuracy 0.4
spearman correlation 0.2014
Test
top1 accuracy 0.2667
spearman correlation 0.0533
Extended Evaluation
top1 accuracy 0.41
spearman correlation 0.169


#### baseline ranker + use GPT-data and idiomaticity classifier

In [31]:
method = "sbert" #"meanLast4" # "meanLast" #
# Try each kind of GPT text as comparator; which text (literal/idiomatic) is
# used for a row follows from its "binary_pred" value (see dependent_preds).
for sent_type in ['sentence', 'meaning','image']:
    print("\nDepending on binary prediction (literal/idiomatic) rank images by similarity to gpt_" + sent_type)
    dataA["pred_order_dependent"] = dataA.apply(lambda row: dependent_preds(row,sent_type, method), axis=1)

    dataA_train = only_train(dataA)
    # metric signatures are (pred_rankings, expected_order)
    print("top1 accuracy", top1accuracy(dataA_train["pred_order_dependent"], dataA_train["expected_order"]))
    print("spearman correlation", spearman_correlation(dataA_train["pred_order_dependent"], dataA_train["expected_order"]))



Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_sentence
top1 accuracy 0.3429
spearman correlation 0.0557

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_meaning
top1 accuracy 0.3714
spearman correlation 0.1829

Depending on binary prediction (literal/idiomatic) rank images by similarity to gpt_image
top1 accuracy 0.5571
spearman correlation 0.0857


In [32]:
method = "sbert" 
sent_type  = 'image'
# rank by similarity to the GPT image description selected by "binary_pred"
dataA["pred_order_dependent"] = dataA.apply(lambda row: dependent_preds(row,sent_type, method), axis=1)

dataA_train = only_train(dataA)
print("baseline ranker to gpt-caption")
print("train")
# metric signatures are (pred_rankings, expected_order)
print("top1 accuracy", top1accuracy(dataA_train["pred_order_dependent"], dataA_train["expected_order"]))
print("spearman correlation", spearman_correlation(dataA_train["pred_order_dependent"], dataA_train["expected_order"]))

for s in ['Test', 'Extended Evaluation']:
    print(s)
    mydata = only_subset(dataA,s)
    print("top1 accuracy", top1accuracy(mydata["pred_order_dependent"], mydata["expected_order"]))
    print("spearman correlation", spearman_correlation(mydata["pred_order_dependent"], mydata["expected_order"]))



baseline ranker to gpt-caption
train
top1 accuracy 0.5571
spearman correlation 0.0857
Test
top1 accuracy 0.4667
spearman correlation 0.0267
Extended Evaluation
top1 accuracy 0.48
spearman correlation 0.14


### Ranking images by similarity to gpt material -- pair ranker

In [33]:
# pair of most literal images, pair of most idiomatic image, unrelated

def dependent_preds_compare_pairs(current, sent_type,method):
    """Pair ranker: order the five images of one data item.

    The ranking is built from two "anchor" images and their most similar
    partners:

    * rank 1 -- image whose caption is most similar to the GPT text of the
      predicted class (``current["binary_pred"]``),
    * rank 4 -- image (other than rank 1) most similar to the GPT text of the
      other class,
    * rank 2 -- remaining image whose caption is most similar to the caption
      of the rank-1 image,
    * rank 3 -- remaining image whose caption is most similar to the caption
      of the rank-4 image,
    * rank 5 -- the image that is left over.

    Parameters
    ----------
    current
        One row of ``dataA`` (the function is used with
        ``dataA.apply(..., axis=1)``). ``current["binary_pred"]`` must be
        ``"idiomatic"`` or ``"literal"``.
    sent_type : str
        Suffix selecting the GPT columns ``"gpt_literal_" + sent_type`` and
        ``"gpt_idiomatic_" + sent_type``.
    method
        Passed through unchanged to ``sim_scores``.

    Returns
    -------
    list
        Five image names, best rank first.

    Raises
    ------
    KeyError
        If ``current["binary_pred"]`` is neither ``"idiomatic"`` nor
        ``"literal"``.
    """

    def best(candidates, exclude=()):
        # Highest-scoring key that has not been used yet. Filtering instead of
        # deleting keeps the dicts returned by sim_scores untouched; ties are
        # still resolved in favour of the first key in dict order.
        return max(
            (name for name in candidates if name not in exclude),
            key=candidates.get,
        )

    # Similarities of all captions to the GPT literal / idiomatic text:
    # dictionaries with keys = image names and values = similarity scores.
    scores = {
        "literal": sim_scores(current, "gpt_literal_" + sent_type, method),
        "idiomatic": sim_scores(current, "gpt_idiomatic_" + sent_type, method),
    }
    image_names = list(scores["literal"].keys())

    predicted = current["binary_pred"]  # idiomatic/literal classification
    opposite = "literal" if predicted == "idiomatic" else "idiomatic"

    # Anchor images: best match for the predicted class, then the best match
    # for the other class among the remaining images.
    anchor = best(scores[predicted])
    opposite_anchor = best(scores[opposite], (anchor,))

    # Score images by caption similarity to each anchor image.
    # NOTE(review): assumes the key order of the sim_scores dict corresponds
    # to the columns image1_caption, image2_caption, ... -- confirm against
    # sim_scores.
    anchor_column = "image" + str(image_names.index(anchor) + 1) + "_caption"
    opposite_column = "image" + str(image_names.index(opposite_anchor) + 1) + "_caption"
    near_anchor = sim_scores(current, anchor_column, method)
    near_opposite = sim_scores(current, opposite_column, method)

    # Partner of each anchor, never re-using an image that is already ranked.
    anchor_partner = best(near_anchor, (anchor, opposite_anchor))
    opposite_partner = best(
        near_opposite, (anchor, opposite_anchor, anchor_partner)
    )

    # The remaining image is the unrelated image. Picking it from the ordered
    # name list (not from a set) keeps the choice deterministic.
    ranked = (anchor, opposite_anchor, anchor_partner, opposite_partner)
    unrelated = [name for name in image_names if name not in ranked][0]

    return [anchor, anchor_partner, opposite_partner, opposite_anchor, unrelated]


In [34]:
# Pair ranker evaluation.
method = "sbert"     # alternative: "meanLast"
sent_type = "image"  # alternatives: "meaning", "sentence"
dataA["pred_order_dependent"] = dataA.apply(
    dependent_preds_compare_pairs, axis=1, args=(sent_type, method)
)

# Scores on the rows selected by only_train.
dataA_train = only_train(dataA)
print("pair ranker: Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):")
print(
    "top1 accuracy",
    top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]),
)
print(
    "spearman correlation",
    spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]),
)

# Same two metrics on the 'Test' and 'Extended Evaluation' subsets.
for s in ('Test', 'Extended Evaluation'):
    print(s)
    mydata = only_subset(dataA, s)
    print(
        "top1 accuracy",
        top1accuracy(mydata["expected_order"], mydata["pred_order_dependent"]),
    )
    print(
        "spearman correlation",
        spearman_correlation(mydata["expected_order"], mydata["pred_order_dependent"]),
    )


pair ranker: Evaluation on training data (rank images dependent on binary classification. For order use inter-image similarity):
top1 accuracy 0.5571
spearman correlation 0.3243
Test
top1 accuracy 0.4667
spearman correlation 0.18
Extended Evaluation
top1 accuracy 0.48
spearman correlation 0.298


### Ranking images by similarity to gpt material -- extreme ranker

In [35]:
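# Extreme ranker: order the images using only their similarity scores to the
# gpt_literal_* and gpt_idiomatic_* texts (no image-to-image comparison)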

def dependent_preds_no_pairs(current, sent_type, method):
    """Extreme ranker: order the five images of one data item.

    Only the similarities of the image captions to the two GPT texts are
    used:

    * rank 1 -- image most similar to the GPT text of the predicted class
      (``current["binary_pred"]``),
    * rank 4 -- image (other than rank 1) most similar to the GPT text of the
      other class,
    * rank 2 -- best remaining image for the predicted class,
    * rank 3 -- best remaining image for the other class,
    * rank 5 -- the image that is left over.

    Parameters
    ----------
    current
        One row of ``dataA`` (the function is used with
        ``dataA.apply(..., axis=1)``). ``current["binary_pred"]`` must be
        ``"idiomatic"`` or ``"literal"``.
    sent_type : str
        Suffix selecting the GPT columns ``"gpt_literal_" + sent_type`` and
        ``"gpt_idiomatic_" + sent_type``.
    method
        Passed through unchanged to ``sim_scores``.

    Returns
    -------
    list
        Five image names, best rank first.

    Raises
    ------
    KeyError
        If ``current["binary_pred"]`` is neither ``"idiomatic"`` nor
        ``"literal"``.
    """

    def best(candidates, exclude=()):
        # Highest-scoring key that has not been used yet. Filtering instead of
        # deleting keeps the dicts returned by sim_scores untouched; ties are
        # still resolved in favour of the first key in dict order.
        return max(
            (name for name in candidates if name not in exclude),
            key=candidates.get,
        )

    # Similarities of all captions to the GPT literal / idiomatic text:
    # dictionaries with keys = image names and values = similarity scores.
    scores = {
        "literal": sim_scores(current, "gpt_literal_" + sent_type, method),
        "idiomatic": sim_scores(current, "gpt_idiomatic_" + sent_type, method),
    }
    image_names = list(scores["literal"].keys())

    predicted = current["binary_pred"]  # idiomatic/literal classification
    opposite = "literal" if predicted == "idiomatic" else "idiomatic"

    # Highest-scoring image for the predicted class, then for the other class.
    first = best(scores[predicted])
    opposite_first = best(scores[opposite], (first,))

    # Second-highest-scoring image for each class among the unused images.
    second = best(scores[predicted], (first, opposite_first))
    opposite_second = best(scores[opposite], (first, opposite_first, second))

    # The remaining image is ranked last. Picking it from the ordered name
    # list (not from a set) keeps the choice deterministic.
    ranked = (first, opposite_first, second, opposite_second)
    leftover = [name for name in image_names if name not in ranked][0]

    return [first, second, opposite_second, opposite_first, leftover]


In [36]:
# Extreme ranker evaluation (always uses the gpt_*_image columns).
method = "sbert"  # alternative: "meanLast"
dataA["pred_order_dependent"] = dataA.apply(
    dependent_preds_no_pairs, axis=1, args=("image", method)
)

# Scores on the rows selected by only_train.
dataA_train = only_train(dataA)
print("extreme ranker: Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):")
print(
    "top1 accuracy",
    top1accuracy(dataA_train["expected_order"], dataA_train["pred_order_dependent"]),
)
print(
    "spearman correlation",
    spearman_correlation(dataA_train["expected_order"], dataA_train["pred_order_dependent"]),
)
# Disabled call, kept for reference:
#make_submission(dataA,"pred_order_dependent", "Dev")

# Same two metrics on the 'Test' and 'Extended Evaluation' subsets.
for s in ('Test', 'Extended Evaluation'):
    print(s)
    mydata = only_subset(dataA, s)
    print(
        "top1 accuracy",
        top1accuracy(mydata["expected_order"], mydata["pred_order_dependent"]),
    )
    print(
        "spearman correlation",
        spearman_correlation(mydata["expected_order"], mydata["pred_order_dependent"]),
    )


extreme ranker: Evaluation on training data (rank images dependent on binary classification. For order use similarity to gpt_image only):
top1 accuracy 0.5571
spearman correlation 0.2457
Test
top1 accuracy 0.4667
spearman correlation 0.0867
Extended Evaluation
top1 accuracy 0.48
spearman correlation 0.334
