## Load data

In [1]:
# read tsv file
import csv
import sys
import ast

# file name: subtask_a_sample.tsv

import pandas as pd


# Read the three tab-separated splits of the subtask A data.
dataA_train = pd.read_csv("subtask_a_train.tsv", sep='\t')
dataA_dev = pd.read_csv("subtask_a_dev.tsv", sep='\t')
dataA_test = pd.read_csv("subtask_a_test.tsv", sep='\t')

# Only the training split gets its 'expected_order' strings parsed into
# Python objects here; the other two splits are left as read.
dataA_train['expected_order'] = dataA_train['expected_order'].apply(ast.literal_eval)

# Stack the splits into a single frame with a fresh 0..n-1 index.
dataA = pd.concat([dataA_train, dataA_dev, dataA_test], ignore_index=True)




In [2]:
def only_train(dataA):
    """Return the rows of ``dataA`` whose "subset" is "Sample" or "Train".

    All "Sample" rows come first, followed by all "Train" rows. Within each
    group the original row order and the original index labels are kept.
    """
    wanted_subsets = ("Sample", "Train")
    parts = [dataA[dataA["subset"] == label] for label in wanted_subsets]
    return pd.concat(parts)

## BERT model 

### without fine-tuning, classification of idiomatic/literal

Idea: we use a pretrained BERT model to generate embeddings of sentences and of the compound in the context of the sentence.

In [3]:
from transformers import AutoTokenizer

# model is selected from https://huggingface.co/models
# `checkpoint` is reused further down to load the model with the same name.
checkpoint = 'bert-base-uncased' #"distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Display the maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

512

In [4]:
def tokenize(batch):
    """Tokenize ``batch`` with the notebook-level ``tokenizer``.

    Padding and truncation are both switched on in the tokenizer call, and
    the tokenizer's result is returned unchanged.
    """
    encoded = tokenizer(batch, padding=True, truncation=True)
    return encoded

In [5]:
# Show token ids, attention mask and raw text for the first three sentences.
first_sentences = dataA["sentence"][:3].tolist()
demonstrate = tokenize(first_sentences)
for token_ids, mask, sentence in zip(demonstrate["input_ids"],
                                     demonstrate["attention_mask"],
                                     first_sentences):
    print(token_ids)
    print(mask)
    print(sentence)
    print()


[101, 2009, 2165, 1037, 2843, 1997, 8999, 21956, 2000, 2131, 1996, 2214, 3194, 2770, 2153, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
It took a lot of elbow grease to get the old engine running again.

[101, 2009, 1005, 1055, 1037, 5377, 2645, 2005, 2149, 1010, 2004, 2002, 2003, 1037, 2851, 2711, 1998, 1045, 2572, 1037, 2305, 13547, 1010, 2061, 1045, 2424, 2008, 2183, 2000, 3637, 2012, 1023, 1012, 2382, 2428, 7659, 2041, 1996, 2190, 2847, 1012, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
It's a constant battle for us, as he is a morning person and I am a night owl, so I find that going to sleep at 9.30 really cuts out the best hours.

[101, 2130, 1996, 5399, 6534, 2100, 3478, 2797, 3239, 2038, 1037, 2540, 1997, 2751, 1006, 1998, 103

"sentence"-column is tokenized and passed to the model.

In [6]:
# Tokenize every sentence and every compound. Each call pads within its own
# batch, so the two results generally have different padded lengths.
dataA_sentence_tokenized = tokenize(dataA["sentence"].tolist())
dataA_compound_tokenized = tokenize(dataA["compound"].tolist())

# Print the padded length of the first compound and of the first sentence,
# then display the token ids of the first three compounds.
print(len(dataA_compound_tokenized["input_ids"][0]), len(dataA_sentence_tokenized["input_ids"][0]))
dataA_compound_tokenized["input_ids"][:3]

6 42


[[101, 8999, 21956, 102, 0, 0],
 [101, 2305, 13547, 102, 0, 0],
 [101, 2540, 1997, 2751, 102, 0]]

In [23]:
import torch
from transformers import AutoModel

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# output_hidden_states=True is what makes `output.hidden_states` available
# in the cells below.
model = AutoModel.from_pretrained(checkpoint, output_hidden_states=True).to(device)
# Put the model into evaluation mode.
model = model.eval()


In [24]:
# Convert the tokenized sentences (ids and attention mask) to tensors on `device`.
input_ids_sentence = torch.tensor(dataA_sentence_tokenized["input_ids"]).to(device)
attention_mask_sentence = torch.tensor(dataA_sentence_tokenized["attention_mask"]).to(device)

# Run the model once over all sentences, without tracking gradients.
# NOTE(review): the whole dataset goes through in a single call; for larger
# data this may need to be split into mini-batches - confirm memory budget.
with torch.no_grad():
    output = model(input_ids_sentence, attention_mask_sentence)



In [None]:
# Shape of the last hidden layer:
output.last_hidden_state.shape
# (number of items, length of input_ids, number of hidden units)

In [26]:
hidden_states_sentence = output.hidden_states

In [None]:
print ("Number of layers:", len(hidden_states_sentence), "  (including initial embeddings)")
layer_i = 0

print ("Number of batches:", len(hidden_states_sentence[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states_sentence[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states_sentence[layer_i][batch_i][token_i]))


In [28]:
import numpy as np

# Different pooling methods that turn per-token hidden states into one
# embedding per input sequence.
# Padding tokens are excluded from the mean-pooled variants only when an
# attention mask is passed; without one, every token position is averaged.

def get_sentence_embedding(hidden_states, method, attention_mask=None):
    """Pool per-token hidden states into one embedding per sequence.

    Parameters
    ----------
    hidden_states : sequence of torch.Tensor
        One tensor per layer, each indexed as [item, token, hidden unit].
        Index 0 is the first entry and index -1 the last one.
    method : str
        'meanLast4'        mean over tokens of the mean of the last 4 layers
        'meanSecondToLast' mean over tokens of the second-to-last layer
        'meanLast'         mean over tokens of the last layer
        'meanFirst'        mean over tokens of the first entry
        'lastCLS'          vector at token position 0 of the last layer
        'firstCLS'         vector at token position 0 of the first entry
    attention_mask : array-like of shape (items, tokens), optional
        1 for real tokens, 0 for padding. When given, the mean-pooled
        methods average over the real tokens only. Ignored by the CLS
        methods. Default ``None`` averages over every token position.

    Returns
    -------
    list of 1-D torch.Tensor for the mean-pooled methods (one per item), or
    a 2-D torch.Tensor of shape (items, hidden units) for the CLS methods.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    # CLS variants: just pick token position 0 of the requested layer.
    if method == 'lastCLS':
        return hidden_states[-1][:, 0, :]
    if method == 'firstCLS':
        return hidden_states[0][:, 0, :]

    # Select the per-token vectors, shape (items, tokens, hidden units).
    if method == 'meanLast4':
        last_four = torch.stack(
            [hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]],
            dim=0,
        )
        token_vecs = torch.mean(last_four, dim=0)
    elif method == 'meanSecondToLast':
        token_vecs = hidden_states[-2]
    elif method == 'meanLast':
        token_vecs = hidden_states[-1]
    elif method == 'meanFirst':
        token_vecs = hidden_states[0]
    else:
        raise ValueError("unknown pooling method: " + repr(method))

    if attention_mask is None:
        pooled = torch.mean(token_vecs, dim=1)
    else:
        # Weighted mean: padding positions get weight 0.
        weights = torch.as_tensor(attention_mask, device=token_vecs.device)
        weights = weights.unsqueeze(-1).to(token_vecs.dtype)
        # clamp avoids a division by zero for a row that is all padding
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = (token_vecs * weights).sum(dim=1) / token_counts

    # Callers iterate over the result, so hand back one tensor per item.
    return list(pooled.unbind(dim=0))

In [29]:
# Compute the sentence embedding with every pooling method and store each
# result in its own dataA column named 'sentence_embedding_<method>'.

methods = ['meanSecondToLast','meanLast4','meanLast','meanFirst','firstCLS','lastCLS']

for method in methods:
    X = get_sentence_embedding(hidden_states_sentence,method)
    # move each embedding to the CPU and convert to plain nested lists
    X = np.array([x.cpu().numpy() for x in X]).tolist()
    # add a new column to dataA
    dataA['sentence_embedding_'+ method] = X
    



Die Ergebnisse sind erstaunlich gut. Allerdings sind sie auch sehr abhängig von random_state (teste 0,10,13,42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# mlp
from sklearn.neural_network import MLPClassifier


# evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Rows used for fitting and evaluation: those whose "subset" is "Sample" or
# "Train". Neither the rows nor the labels depend on the pooling method, so
# both are prepared once before the loop.
dataA_train = only_train(dataA)
y = dataA_train["sentence_type"]

for method in methods:
    X = dataA_train['sentence_embedding_'+ method].tolist()

    # hold out 20% of the rows for evaluation (fixed random_state)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    print('=========================================================')
    print(method)
    # Linear SVM. Alternatives that were tried in this place:
    #    LogisticRegression(random_state=0)
    #    RandomForestClassifier(max_depth=2, random_state=0)
    #    MLPClassifier(random_state=0, max_iter=300)
    clf = SVC(kernel='linear').fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))

    # classification report
    print(classification_report(y_test, y_pred,zero_division=0))



In [None]:
# dummy classifier most frequent
from sklearn.dummy import DummyClassifier


# Baseline: always predict the most frequent class.
# X_train / X_test / y_train / y_test are not created in this cell; they are
# whatever the most recently executed train_test_split call left behind.
clf = DummyClassifier(strategy="most_frequent")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("dummy classifier most frequent")
print(accuracy_score(y_test, y_pred))

# classification report
print(classification_report(y_test, y_pred,zero_division=0))


In [None]:
dataA.columns

### Compare compound embeddings in sentence and in chatGPT definitions.
#### 1) get contextualized compound embeddings for compounds in sentences

In [24]:
##!pip install inflect


In [25]:
# sometimes the compound occurs in plural form in the sentence

# add a new column to dataA with the plural form of the compound 


from re import *
import inflect

engine = inflect.engine()

# Pluralise every compound and assign the whole column in one step.
# Writing single cells through chained indexing (dataA[col][i] = ...) goes
# through an intermediate Series and is not guaranteed to update dataA.
dataA["compound_plural"] = [engine.plural(compound) for compound in dataA["compound"]]

dataA_compound_plural_tokenized = tokenize(dataA["compound_plural"].tolist())

In [None]:
engine = inflect.engine()
t = engine.plural("color in the grass")
tt = engine.plural(t)
t, tt


In [None]:
# Frequency of each plural form from row 70 onwards (value_counts must be
# called; the bare attribute only displays the bound method).
dataA["compound_plural"][70:].value_counts()

Sometimes the compound occurs in the sentence only in plural form. So both forms are needed.

In [None]:
tokenizer.tokenize("rotten apples and oranges are in the woodbasket")

In [29]:
# Returns the token positions that the compound covers inside the sentence.
def get_idx(compound_tokens, compound_plural_tokens, sentence_tokens, pad_token_id=0):
    """Locate the compound (singular or plural form) inside a token sequence.

    Parameters
    ----------
    compound_tokens, compound_plural_tokens : list of int
        Token ids of the compound and of its plural form. Padding ids are
        dropped first, then the first and last remaining id (the [CLS] and
        [SEP] markers) are stripped.
    sentence_tokens : list of int
        Token ids of the sentence to search in.
    pad_token_id : int, optional
        Id that marks padding in the two compound sequences (default 0).

    Returns
    -------
    list of int
        Ascending, duplicate-free positions in ``sentence_tokens`` covered
        by any occurrence of either form. Empty when neither form occurs.
    """
    def _content_ids(token_ids):
        # drop padding, then the leading [CLS] and the trailing [SEP]
        return [tok for tok in token_ids if tok != pad_token_id][1:-1]

    sentence_tokens = list(sentence_tokens)
    positions = set()
    for pattern in (_content_ids(compound_tokens), _content_ids(compound_plural_tokens)):
        width = len(pattern)
        if width == 0:
            # nothing to search for
            continue
        # every window of `width` tokens is compared, so all occurrences
        # (not only the first one) contribute their positions
        for start in range(len(sentence_tokens) - width + 1):
            if sentence_tokens[start:start + width] == pattern:
                positions.update(range(start, start + width))
    # sorted() gives a deterministic order; iterating a set does not
    return sorted(positions)



In [None]:
#testen
get_idx([99,1,5,100,0,0],[99,1,5,2,100,0,0],[1,5,3,7,4,1,5,2,1,9,0,0,0])

In [None]:
hidden_states_sentence[-1].shape

In [32]:
# returns the embeddings of the tokens in idxList. 
# The embeddings are combined to a single embedding by different averaging methods
import numpy as np
def get_idxList_embedding(hidden_states,idxLists,method):
    """Average the hidden states at selected token positions, per item.

    Parameters
    ----------
    hidden_states : sequence of torch.Tensor
        One tensor per layer, each indexed as [item, token, hidden unit].
    idxLists : sequence of list of int
        For every item the token positions to pool. Read by position, so a
        pandas Series with any index works as well as a plain list.
    method : str
        'meanLast4' (mean of the last 4 layers), 'meanSecondToLast',
        'meanLast' or 'meanFirst'; selects the layer(s) to read from.

    Returns
    -------
    list of list of float
        One embedding per item. An item with an empty position list gets an
        all-NaN vector, so a compound that was not found stays visible
        downstream.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    if method == 'meanLast4':
        layers = [hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]]
    elif method == 'meanSecondToLast':
        layers = [hidden_states[-2]]
    elif method == 'meanLast':
        layers = [hidden_states[-1]]
    elif method == 'meanFirst':
        layers = [hidden_states[0]]
    else:
        raise ValueError("unknown pooling method: " + repr(method))

    # positional access, independent of any pandas index labels
    idxLists = list(idxLists)
    hidden_size = layers[0].shape[-1]

    embedding = []
    for i in range(len(layers[0])):
        idxList = list(idxLists[i])
        if not idxList:
            embedding.append([float('nan')] * hidden_size)
            continue
        # average over the selected layers first, then over the tokens
        token_tensor = torch.stack([layer[i][idxList] for layer in layers], dim=0)
        token_vecs = torch.mean(token_tensor, dim=0)
        embedding.append(torch.mean(token_vecs, dim=0).tolist())
    return embedding

In [33]:
# add column to dataA with the token positions of the compound in the sentence
dataA["compound_idx"] = [
    get_idx(compound_ids, plural_ids, sentence_ids)
    for compound_ids, plural_ids, sentence_ids in zip(
        dataA_compound_tokenized["input_ids"],
        dataA_compound_plural_tokenized["input_ids"],
        dataA_sentence_tokenized["input_ids"],
    )
]



In [34]:
# List every compound/sentence pair for which no compound position was found
# (ideally nothing is printed).
for compound, sentence, positions in zip(dataA["compound"], dataA["sentence"], dataA["compound_idx"]):
    if not positions:
        print(compound)
        print(sentence)

In [35]:

# Add one 'compound_embedding_<method>' column to dataA for every method in
# compound_methods, pooling the sentence hidden states at the compound's
# token positions.
compound_methods = ['meanSecondToLast','meanLast4','meanLast','meanFirst']
for method in compound_methods:
    dataA['compound_embedding_'+ method] = get_idxList_embedding(hidden_states_sentence,dataA["compound_idx"],method) 

In [None]:
dataA.columns

compound_embedding_ enthält die Embedding des Compounds im Sentence

In [None]:

# Rows whose "subset" is "Sample" or "Train".
dataA_train = only_train(dataA)


# Fit and evaluate one logistic regression per compound pooling method.
for method in compound_methods:
    X = dataA_train['compound_embedding_'+ method].tolist()
    y = dataA_train["sentence_type"]

    # split in train and test set (20% held out, fixed random_state)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    print('=========================================================')
    print(method)
    # logistic regression
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))

    # classification report
    print(classification_report(y_test, y_pred,zero_division=0))

#### 2) Get contextualized compound embeddings in gpt_sentences and gpt_meaning and gpt_pic embeddings


In [None]:
# read chatGPT data from csv (one file per split)
split_files = ["chatGPTNew_train.csv", "chatGPTNew_dev.csv", "chatGPTNew_test.csv"]
data_chatGPT_train, data_chatGPT_dev, data_chatGPT_test = (
    pd.read_csv(path) for path in split_files
)

# stack the splits with a fresh 0..n-1 index
data_chatGPT = pd.concat(
    [data_chatGPT_train, data_chatGPT_dev, data_chatGPT_test],
    ignore_index=True,
)

# prefix every column name with "gpt_"
data_chatGPT = data_chatGPT.rename(columns=lambda name: 'gpt_' + name)
data_chatGPT


In [None]:
data_chatGPT.columns

In [None]:
data_chatGPT["gpt_idiomatic_meaning"][3]

In [41]:
# Derive the compound from the idiomatic meaning text: everything before the
# first " is", stripped and lower-cased. The column is assigned in one step;
# writing single cells through chained indexing (df[col][i] = ...) is not
# guaranteed to update the frame.
data_chatGPT["compound"] = [
    meaning.split(" is")[0].strip().lower()
    for meaning in data_chatGPT["gpt_idiomatic_meaning"]
]

In [42]:
# gpt data needs some cleanup:

# Replace the typographic apostrophe ’ with the plain ' in every column.
# NOTE(review): the .str accessor assumes every column holds strings -
# confirm that no numeric column is present in the CSV files.
for column in data_chatGPT.columns:
    data_chatGPT[column] = data_chatGPT[column].str.replace("’","'")



In [43]:
# Combine dataA and data_chatGPT on the shared 'compound' column.
# pd.merge defaults to an inner join: rows without a matching compound on the
# other side are dropped, and a compound that occurs several times on one side
# multiplies the matching rows (compare the shapes printed in the next cell).
merged_df = pd.merge(dataA, data_chatGPT, on='compound')

In [None]:
merged_df.shape, data_chatGPT.shape, dataA.shape

In [None]:
dataA = merged_df
dataA.columns

In [46]:
# Sentence embeddings and in-context compound embeddings for the four
# ChatGPT text columns.

types = ["literal_sentence","idiomatic_sentence","literal_meaning","idiomatic_meaning"]

# Compound token ids are recomputed here because dataA was replaced by the
# merged frame and its rows may no longer line up with the earlier tokenization.
compound_tokenized = tokenize(dataA["compound"].tolist())
compound_plural_tokenized = tokenize(dataA["compound_plural"].tolist())    

for t in types:
    # tokenize the column "gpt_<t>"
    tokenized = tokenize(dataA["gpt_"+t].tolist())

    # hidden states for the current column
    input_ids = torch.tensor(tokenized["input_ids"]).to(device)
    attention_mask = torch.tensor(tokenized["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)

    hidden_states = output.hidden_states

    # add one column "gpt_<t>_embedding_<method>" to dataA for each method in methods
    for method in methods:
        X = get_sentence_embedding(hidden_states,method)
        X = np.array([x.cpu().numpy() for x in X]).tolist()
        dataA["gpt_"+ t + "_embedding_"+ method] = X
    # add gpt_compound_embeddings

    # identify the token positions of the compound in the text and use them to get the embeddings of the compound
    dataA["gpt_compound_idx_"+t] = [get_idx(compound_tokenized["input_ids"][i], compound_plural_tokenized["input_ids"][i], tokenized["input_ids"][i]) for i in range(len(dataA))]
    # apply the methods in compound_methods to get the embeddings of the compound
    for method in compound_methods:
        dataA['gpt_compound_embedding_'+ t + "_"+ method] = get_idxList_embedding(hidden_states,dataA["gpt_compound_idx_" + t],method) 


In [None]:
dataA['gpt_compound_embedding_literal_sentence_meanLast'][0]

#### 3) Use cosine similarity between compound_embedding in sentence and in gpt sentence/meaning to decide idiomatic/literal

In [48]:
def cosine_similarity(u, v):
    """Return the cosine of the angle between the vectors ``u`` and ``v``.

    Computed as the dot product divided by the product of the two Euclidean
    norms; no special handling is applied when a norm is zero.
    """
    numerator = np.dot(u, v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    return numerator / denominator


def compare(emb,emb0,emb1):
    """Tell which of two reference embeddings ``emb`` is closer to.

    Returns 0 when ``emb`` has a strictly higher cosine similarity to
    ``emb0`` than to ``emb1``, and 1 in every other case (including ties).
    """
    closeness_to_first = cosine_similarity(emb, emb0)
    closeness_to_second = cosine_similarity(emb, emb1)
    return 0 if closeness_to_first > closeness_to_second else 1

In [None]:
print(dataA.columns)

In [50]:
# Predict literal/idiomatic by checking whether the compound embedding from
# the sentence is closer to the compound embedding from the literal or from
# the idiomatic GPT text (once for the GPT sentences, once for the meanings).
m = "meanLast4"
for t in ["sentence", "meaning"]:
    gpt_compound_embedding_idiomatic = dataA["gpt_compound_embedding_" + "idiomatic_" + t + "_" + m]
    gpt_compound_embedding_literal = dataA["gpt_compound_embedding_" + "literal_" + t + "_" + m]

    # compare() returns 0 when the literal reference is closer, otherwise 1
    preds = [
        ["literal","idiomatic"][compare(emb, emb_literal, emb_idiomatic)]
        for emb, emb_literal, emb_idiomatic in zip(
            dataA["compound_embedding_" + m],
            gpt_compound_embedding_literal,
            gpt_compound_embedding_idiomatic,
        )
    ]

    dataA["pred_compound_sentence_and_compound_" + t] = preds


In [None]:
dataA["pred_compound_sentence_and_compound_sentence"].value_counts(), dataA["pred_compound_sentence_and_compound_meaning"].value_counts()

In [None]:
dataA_train = only_train(dataA)

# check accuracy of the predictions: idiomaticity predicted by compound embedding in sentence compared to 
# compound embedding in gpt_sentence 
y = dataA_train["sentence_type"]
y_pred = dataA_train["pred_compound_sentence_and_compound_sentence"]
print("compound sentence")
print(accuracy_score(y, y_pred))
print(classification_report(y, y_pred,zero_division=0))

y_pred = dataA_train["pred_compound_sentence_and_compound_meaning"]
print("compound meaning")
print(accuracy_score(y, y_pred))
print(classification_report(y, y_pred,zero_division=0))


## Cosine similarity between sentences and gpt sentences/meanings

In [53]:
# Predict literal/idiomatic by checking whether the sentence embedding is
# closer to the literal or to the idiomatic GPT text embedding, for both GPT
# text kinds and for two pooling methods.
for t in ["sentence", "meaning"]:
    for m in ["lastCLS", "meanLast4"]:
        gpt_embedding_idiomatic = dataA["gpt_idiomatic_" + t + "_embedding_" + m]
        gpt_embedding_literal = dataA["gpt_literal_" + t + "_embedding_" + m]

        # compare() returns 0 when the literal reference is closer, otherwise 1
        preds = [
            ["literal","idiomatic"][compare(emb, emb_literal, emb_idiomatic)]
            for emb, emb_literal, emb_idiomatic in zip(
                dataA["sentence_embedding_" + m],
                gpt_embedding_literal,
                gpt_embedding_idiomatic,
            )
        ]

        dataA["pred_sentence_and_" + t + "_" + m] = preds


In [None]:
dataA_train = only_train(dataA)

for m in  ["lastCLS", "meanLast4"]:
    # check accuracy of the predictions: idiomaticity predicted by sentence embedding compared to 
    # gpt embedding 
    y = dataA_train["sentence_type"]
    y_pred = dataA_train["pred_sentence_and_sentence_" + m]
    print(m)
    print("sentence")
    print(accuracy_score(y, y_pred))
    print(classification_report(y, y_pred,zero_division=0))
    print("-----------------")

    y_pred = dataA_train["pred_sentence_and_meaning_" + m]
    print("meaning")
    print(accuracy_score(y, y_pred))
    print(classification_report(y, y_pred,zero_division=0))
    print("==========================================")

## Sorting images by cosine similarity of embeddings

In [55]:
# Tokenize the caption column of each of the five images. Each call pads
# within its own batch, so the five results can have different padded lengths.
dataA_image1_caption_tokenized = tokenize(dataA["image1_caption"].tolist())
dataA_image2_caption_tokenized = tokenize(dataA["image2_caption"].tolist())
dataA_image3_caption_tokenized = tokenize(dataA["image3_caption"].tolist())
dataA_image4_caption_tokenized = tokenize(dataA["image4_caption"].tolist())
dataA_image5_caption_tokenized = tokenize(dataA["image5_caption"].tolist())


In [None]:
dataA["image1_caption"][0]

In [57]:
# get embeddings of the image captions
# NOTE: this rebinds the notebook-level `methods` list to two entries; cells
# that read `methods` after this point only see these two.
methods = ['meanLast4','lastCLS']

# tokenized captions keyed by image number
caption_tokenized_by_image = {
    1: dataA_image1_caption_tokenized,
    2: dataA_image2_caption_tokenized,
    3: dataA_image3_caption_tokenized,
    4: dataA_image4_caption_tokenized,
    5: dataA_image5_caption_tokenized,
}
hidden_states_by_image = {}

for image_no, caption_tokenized in caption_tokenized_by_image.items():
    # hidden states for the captions of this image
    input_ids = torch.tensor(caption_tokenized["input_ids"]).to(device)
    attention_mask = torch.tensor(caption_tokenized["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)

    hidden_states_by_image[image_no] = output.hidden_states

    # add one column 'image<k>_caption_embedding_<method>' per pooling method
    for method in methods:
        X = get_sentence_embedding(hidden_states_by_image[image_no],method)
        X = np.array([x.cpu().numpy() for x in X]).tolist()
        dataA['image' + str(image_no) + '_caption_embedding_'+ method] = X

# keep the per-image names available for any cell that reads them
hidden_states_gpt_image1 = hidden_states_by_image[1]
hidden_states_gpt_image2 = hidden_states_by_image[2]
hidden_states_gpt_image3 = hidden_states_by_image[3]
hidden_states_gpt_image4 = hidden_states_by_image[4]
hidden_states_gpt_image5 = hidden_states_by_image[5]
    

In [None]:
dataA.columns

In [59]:
# Rank the five images of every row by the cosine similarity between the
# in-sentence compound embedding and each image caption embedding
# (both pooled with 'meanLast4').
pred_rankings = []
for i in range(len(dataA)):
    compound_embedding = dataA['compound_embedding_meanLast4'][i]
    # similarities[k] belongs to image k+1
    similarities = [
        cosine_similarity(compound_embedding,
                          dataA['image' + str(image_no) + '_caption_embedding_meanLast4'][i])
        for image_no in range(1, 6)
    ]
    # positions sorted by ascending similarity, then reversed: best match first
    idx = np.argsort(similarities)
    idx = idx[::-1]
    pred_rankings.append(idx)
dataA['pred_rankings'] = pred_rankings



In [None]:
similarities, idx

In [None]:
dataA["pred_rankings"][0]

In [62]:
# Turn the ranked positions in 'pred_rankings' into ranked image names.
# The column is rebuilt in one assignment; writing single cells through
# chained indexing (dataA[col][i] = ...) is not guaranteed to update dataA.
# NOTE: this cell expects 'pred_rankings' to still hold positions, so it must
# not be run a second time without recomputing that column first.
ranked_names = []
for i in range(len(dataA)):
    idxs = dataA['pred_rankings'][i]
    picture_names = get_image_names(i,dataA)
    # reorder picture_names according to idxs
    ranked_names.append([picture_names[j] for j in idxs])
dataA["pred_rankings"] = pd.Series(ranked_names, index=dataA.index, dtype=object)


In [None]:
get_image_names(69,dataA)

In [None]:
dataA["pred_rankings"][69], dataA["expected_order"][69]

In [None]:
dataA['expected_order'][0],dataA['pred_rankings'][0]

In [66]:
def top1accuracy(pred_rankings,expected_order):
    """Fraction of items whose top-ranked prediction matches the expected one.

    Parameters
    ----------
    pred_rankings, expected_order : sequence of sequences
        Predicted and expected ranking per item. Both are read by position,
        so pandas Series with any index (for example a filtered frame's
        column) work as well as plain lists.

    Returns
    -------
    float
        Number of items with ``pred[0] == expected[0]`` divided by the number
        of items.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    ZeroDivisionError
        If the inputs are empty.
    """
    if len(pred_rankings) != len(expected_order):
        raise ValueError("pred_rankings and expected_order differ in length")
    correct = 0
    # zip iterates by position and never looks up index labels
    for pred, expected in zip(pred_rankings, expected_order):
        if pred[0] == expected[0]:
            correct += 1
    return correct/len(pred_rankings)

from scipy.stats import spearmanr

def spearman_correlation(pred_rankings,expected_order):
    """Mean Spearman rank correlation between predicted and expected rankings.

    Each ranking is a sequence of item labels ordered from first to last.
    For every item pair of rankings the labels are converted to rank
    positions: the items are taken in expected order (positions 0, 1, 2, ...)
    and looked up in the predicted ranking. The correlation is computed on
    these two position lists, so it measures how far each item moved, not how
    the label strings themselves sort.

    Parameters
    ----------
    pred_rankings, expected_order : sequence of sequences
        Predicted and expected ranking per item, read by position (pandas
        Series with any index work as well as plain lists).

    Returns
    -------
    float
        Mean of the per-item correlations.

    Raises
    ------
    ValueError
        If an expected label does not occur in the matching predicted ranking.
    """
    corr = []
    # zip iterates by position and never looks up index labels
    for pred, expected in zip(pred_rankings, expected_order):
        pred = list(pred)
        expected = list(expected)
        expected_positions = list(range(len(expected)))
        pred_positions = [pred.index(item) for item in expected]
        corr.append(spearmanr(pred_positions, expected_positions).correlation)
    return np.mean(corr)


In [None]:
dataA_train = only_train(dataA)
print("compare image captions with sentence")
print("top1 accuracy")
print(top1accuracy(dataA_train['pred_rankings'],dataA_train['expected_order']))

print("spearman rank correlation")
print(spearman_correlation(dataA_train['pred_rankings'],dataA_train['expected_order']))



## use gpt sentence/meaning (compound) embeddings to classify image captions 

Ziel: sortiere die 5 Bilder wie folgt
1: literal, 2: ähnlich wie literal, 3: ähnlich wie idiomatic, 4: idiomatic, 5: unrelated 

In [None]:
dataA.columns

In [69]:
# For one row n: rank the five image captions once by similarity to the
# literal GPT text embedding and once by similarity to the idiomatic one.
sim_to_literal = []
sim_to_idiomatic = []
m = "meanLast4"

# row to inspect
n = 40

# Alternative reference embeddings (compound-in-meaning embeddings):
#emb_lit = dataA["gpt_compound_embedding_literal_meaning_" +m][n]
#emb_id = dataA["gpt_compound_embedding_idiomatic_meaning_" +m][n]


# Reference embeddings currently in use: the GPT example sentences.
emb_lit = dataA["gpt_literal_sentence_embedding_" +m][n]
emb_id = dataA["gpt_idiomatic_sentence_embedding_" +m][n]


# Alternative reference embeddings (GPT meaning texts):
#emb_lit = dataA["gpt_literal_meaning_embedding_" +m][n]
#emb_id = dataA["gpt_idiomatic_meaning_embedding_" +m][n]

for i in [1,2,3,4,5]:
    emb_image = dataA["image" + str(i) + "_caption_embedding_" + m][n]
    sim_to_literal.append(cosine_similarity(emb_lit,emb_image))
    sim_to_idiomatic.append(cosine_similarity(emb_id,emb_image))

# argsort gives ascending order
idx_lit = np.argsort(sim_to_literal )
idx_id = np.argsort(sim_to_idiomatic )
# reverse the indices so the most similar caption comes first
idx_lit = idx_lit[::-1].tolist()
idx_id = idx_id[::-1].tolist()


# get_image_names is not defined in the visible part of this notebook.
name_list = get_image_names(n,dataA)

# image names ordered by decreasing similarity
name_list_lit = [name_list[j] for j in idx_lit]
name_list_id = [name_list[j] for j in idx_id]



In [None]:
# NOTE(review): absolute, machine-specific path; it is not used in this cell -
# presumably display_image reads it. Consider a configurable, relative
# data directory instead.
fileDirectory = 'D:\\Wiebke Petersen\\Downloads\\AdMIRe Subtask A Train\\train'

# Show the images of row n, first ordered by similarity to the literal
# reference, then by similarity to the idiomatic reference.
# display_image is not defined in the visible part of this notebook.
compound = dataA["compound"][n]
print(compound)
print("literal")
print(dataA["gpt_literal_meaning"][n])
for fn in name_list_lit:
    display_image(compound,fn)

print("idiomatic")
print(dataA["gpt_idiomatic_meaning"][n])
for fn in name_list_id:
    display_image(compound,fn)



In [None]:
dataA["expected_order"]

In [72]:
# Compare all image captions of a row with each other and pick the two most
# similar disjoint pairs; the image left over is treated as unrelated.

#m="meanLast4"
m= "lastCLS"

pairs_per_row = []

for n in range(len(dataA)):
    caption_embeddings = [
        dataA["image" + str(k+1) + "_caption_embedding_" + m][n] for k in range(5)
    ]

    # similarity of every unordered pair (i < j), 0-based image positions
    pair_similarity = {}
    for i in range(5):
        for j in range(i+1,5):
            pair_similarity[(i,j)] = cosine_similarity(caption_embeddings[i],caption_embeddings[j])

    name_list = get_image_names(n,dataA)

    # pair with the highest similarity (first one wins on ties)
    u,v = max(pair_similarity, key=pair_similarity.get)

    # Best pair among the three images that are not part of the first pair.
    # Only real candidate pairs are considered, so the result can never
    # involve u or v and never pairs an image with itself.
    remaining_pairs = {
        pair: value for pair, value in pair_similarity.items()
        if u not in pair and v not in pair
    }
    u1,v1 = max(remaining_pairs, key=remaining_pairs.get)

    unrel = (set([0,1,2,3,4])-set([u,v,u1,v1])).pop()

    # first entry: 1-based image numbers, second entry: image names
    pairs_per_row.append([[(v+1,u+1),(v1+1,u1+1),unrel+1],[(name_list[v],name_list[u]),(name_list[v1],name_list[u1]),name_list[unrel]]])

# Assign the whole column at once; writing single cells through chained
# indexing (dataA[col][n] = ...) is not guaranteed to update dataA.
dataA["pairs"] = pd.Series(pairs_per_row, index=dataA.index, dtype=object)


     


In [None]:
dataA["pairs"][0][0]

In [74]:
# Decide which of the two image pairs is the literal one and which the
# idiomatic one, and order each pair by similarity to its reference text.
literal_images = []
idiomatic_images = []
unrelated_images = []

m = "meanLast4"
for n in range(len(dataA)):
    [(u1,v1),(u2,v2),un] = dataA["pairs"][n][0]
    # reference embeddings: the GPT meaning texts
    comp_lit = dataA["gpt_literal_meaning_embedding_" + m][n]
    comp_id = dataA["gpt_idiomatic_meaning_embedding_" + m][n]

    # similarity of each paired image caption to both references
    lit_sim = {}
    id_sim = {}
    for image_no in (u1, v1, u2, v2):
        caption_embedding = dataA["image"+str(image_no)+"_caption_embedding_"+ m][n]
        lit_sim[image_no] = cosine_similarity(caption_embedding, comp_lit)
        id_sim[image_no] = cosine_similarity(caption_embedding, comp_id)

    # the pair containing the caption closest to the literal meaning is
    # taken as the literal pair, the other one as the idiomatic pair
    if np.max([lit_sim[u1], lit_sim[v1]]) > np.max([lit_sim[u2], lit_sim[v2]]):
        literal_pair, idiomatic_pair = (u1,v1), (u2,v2)
    else:
        literal_pair, idiomatic_pair = (u2,v2), (u1,v1)

    # within each pair, put the image closer to its reference first
    first, second = literal_pair
    literal_images.append((first,second) if lit_sim[first] > lit_sim[second] else (second,first))
    first, second = idiomatic_pair
    idiomatic_images.append((first,second) if id_sim[first] > id_sim[second] else (second,first))
    unrelated_images.append(un)

# Assign whole columns at once; writing single cells through chained indexing
# (dataA[col][n] = ...) is not guaranteed to update dataA.
dataA["literal_image"] = pd.Series(literal_images, index=dataA.index, dtype=object)
dataA["idiomatic_image"] = pd.Series(idiomatic_images, index=dataA.index, dtype=object)
dataA["unrelated_image"] = pd.Series(unrelated_images, index=dataA.index, dtype=object)


            
    
    
    
     
                                 
                               

In [None]:
# Build the final ranking per row: the pair that matches the predicted
# sentence type comes first (best image first), then the other pair in
# reverse order, then the unrelated image.
combined_rankings = []

for n in range(len(dataA)):
    literal_pair = dataA["literal_image"][n]
    idiomatic_pair = dataA["idiomatic_image"][n]
    if dataA["pred_sentence_and_sentence_meanLast4"][n] == "literal":
        leading_pair, trailing_pair = literal_pair, idiomatic_pair
    else:
        leading_pair, trailing_pair = idiomatic_pair, literal_pair
    # 1-based image numbers in ranked order
    pred_idx = [leading_pair[0],
                leading_pair[1],
                trailing_pair[1],
                trailing_pair[0],
                dataA["unrelated_image"][n]]
    names = get_image_names(n,dataA)
    preds = [names[i-1] for i in pred_idx]
    combined_rankings.append(preds)

# Assign the whole column at once as an object column. Writing a list into a
# single cell of an integer column through chained indexing
# (dataA[col][n] = preds) is not guaranteed to work or to update dataA.
dataA["pred_rankings_combined"] = pd.Series(combined_rankings, index=dataA.index, dtype=object)
    




In [None]:
dataA["pred_rankings_combined"]

In [None]:
dataA_train = only_train(dataA)

print("combined approach idiomaticity + image caption analysis")
print("top1 accuracy")
print(top1accuracy(dataA_train['pred_rankings_combined'],dataA_train['expected_order']))

print("spearman rank correlation")
print(spearman_correlation(dataA_train['pred_rankings_combined'],dataA_train['expected_order']))


In [78]:



def find_permutation(L1, L2):
    """Return, for each element of ``L1``, its 1-based position in ``L2``.

    Repeated elements are matched to successive occurrences in ``L2``: a
    position that was already handed out is skipped and the search continues
    behind it. ``ValueError`` is raised when no (further) occurrence exists.
    """
    positions = []
    for item in L1:
        # 1-based position of the first occurrence
        pos = L2.index(item) + 1
        # Already used? Then look for the next occurrence after it.
        while pos in positions:
            pos = L2.index(item, pos) + 1
        positions.append(pos)
    return positions



In [None]:
# How well do the similarity-based image pairs agree with the expected order?
dataA_train = only_train(dataA)

lit_acc = 0
id_acc = 0
un_acc = 0
for n in range(len(dataA_train)):
    # 1-based image numbers: first pair, second pair, left-over image
    [lit,idiom,un] = dataA_train["pairs"][n][0]
    expected = dataA_train["expected_order"][n]
    names = get_image_names(n,dataA_train)
    # order[k] is the image number (1-based position in names) found at rank
    # k+1 of the expected order. The arguments must be in this order: with
    # them swapped the list holds the rank of each image instead, which
    # cannot be compared with the image numbers stored in "pairs".
    order = find_permutation(list(expected), list(names))
#    print(order)
    # the image ranked last is the unrelated one
    if order[4] == un:
        un_acc = un_acc + 1
    # regroup the expected order as (literal pair, idiomatic pair, unrelated)
    if dataA_train["sentence_type"][n] == "literal":
        order = [(order[0],order[1]),(order[3],order[2]),order[4]]
    else:
        order = [(order[3],order[2]),(order[1],order[0]),order[4]]
    # each pair scores 1, 0.5 or 0 depending on how many of its images match
    lit_acc = lit_acc + 1 - len(list(set(lit)-set(order[0])))/2 
    id_acc = id_acc + 1 - len(list(set(idiom)-set(order[1])))/2 
print("accuracy on unrelated: ", un_acc/len(dataA_train))
print("accuracy on literal: ", lit_acc/len(dataA_train))
print("accuracy on idiomatic: ", id_acc/len(dataA_train))




In [80]:
# Save dataA, including all embedding and prediction columns, as a pickle file.
# DataFrame.to_pickle does the writing; the pickle module is not called here.
import pickle
dataA.to_pickle("dataA.pkl")


In [89]:
# Read dataA back from the pickle file written above.
# Unpickling can execute arbitrary code, so only load files you created yourself.

dataA = pd.read_pickle('dataA.pkl')

In [None]:
dataA.shape

Mehr in Funktionen packen. 
Bei Captions nur 2 Sätze betrachten.
Weg über idiomatic/literal und dann erst Bilder auswählen. 

## chatgpt image descriptions

In [91]:
# Read the ChatGPT image descriptions.
data_gpt_image = pd.read_csv("gpt_image_descriptions_all.csv", sep=',')
# gpt data needs some cleanup:
# replace ’ with ' in all columns
# NOTE(review): the .str accessor assumes every column holds strings - confirm.
for column in data_gpt_image.columns:
    data_gpt_image[column] = data_gpt_image[column].str.replace("’","'")



# Attach the descriptions to dataA via the shared 'compound' column
# (pd.merge defaults to an inner join), then continue with the merged frame.
merged_df = pd.merge(dataA, data_gpt_image, on='compound')
dataA = merged_df 


In [94]:
# Sentence embeddings of the ChatGPT image descriptions

types = ["literal_image","idiomatic_image"]


for t in types:
    # tokenize the column "gpt_<t>"
    tokenized = tokenize(dataA["gpt_" + t].tolist())

    # hidden states for the current column
    input_ids = torch.tensor(tokenized["input_ids"]).to(device)
    attention_mask = torch.tensor(tokenized["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)

    hidden_states = output.hidden_states

    # add one column "gpt_<t>_embedding_<method>" to dataA for each method in
    # methods (whatever the notebook-level `methods` list holds at this point)
    for method in methods:
        X = get_sentence_embedding(hidden_states,method)
        X = np.array([x.cpu().numpy() for x in X]).tolist()
        dataA["gpt_"+ t + "_embedding_"+ method] = X
    # add gpt_compound_embeddings



In [None]:
dataA.columns

## Zweistufig: (a) +/- idiomatic, (b) compare picture captions