Trying out different alterations to the original sentence-embedding approach:

- different ranking for literal vs idiomatic (based on sentence type provided for training data)
- reducing how much of the image captions is considered

In [1]:
import csv
import sys
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from evaluation_a import evaluation_single

# Code by Wiebke: read the Subtask A training table
fileNameA = "subtask_a_train.tsv"
fileDirectoryA = "AdMIRe Subtask A Train/train"
dataA = pd.read_csv(f"{fileDirectoryA}/{fileNameA}", sep="\t")
# expected_order is read as text; evaluate it back into a Python literal
dataA["expected_order"] = dataA["expected_order"].apply(ast.literal_eval)

In [2]:
def sim_scores(current, sentences):
    """Score the five image captions against the query text.

    Parameters
    ----------
    current : row-like mapping
        Must provide the keys "image1_name" ... "image5_name".
    sentences : list of str
        The query text at index 0, followed by the five captions in
        image order (indices 1-5).

    Returns
    -------
    dict
        Maps each image name to the similarity between the query text
        and that image's caption, as computed by the module-level `model`.
    """
    vectors = model.encode(sentences)
    # Compare the query embedding (index 0) with all embeddings, itself
    # included; the result has an extra leading dimension, hence sims[0][...].
    sims = model.similarity(vectors[0], vectors)

    result = {}
    for pos in range(1, 6):
        # index `pos` in `sentences` belongs to image number `pos`
        result[current[f"image{pos}_name"]] = sims[0][pos].item()
    return result

In [3]:
# change for idiomatic vs literal

def rank_images(scores, type):
    """Rank images with the strategy that matches the sentence type.

    `type == "literal"` selects `rank_literal`; every other value falls
    through to `rank_idiomatic`. The chosen function's result is returned.
    """
    ranker = rank_literal if type == "literal" else rank_idiomatic
    return ranker(scores)

def rank_literal(scores):
    """Rank image names from highest to lowest score.

    Parameters
    ----------
    scores : dict
        Image name -> similarity score between the query and that
        image's caption.

    Returns
    -------
    list
        All image names, best score first. Names with equal scores keep
        their dictionary order.

    Notes
    -----
    The input dictionary is left untouched (the previous implementation
    deleted every entry from it), and any number of entries is accepted
    rather than exactly five.
    """
    # sorted() is stable, also with reverse=True, so ties resolve exactly
    # like repeatedly taking the first maximum.
    return sorted(scores, key=scores.get, reverse=True)

def rank_idiomatic(scores):
    """Rank image names for an idiomatic sentence.

    The lowest-scoring image (treated as the unrelated picture) goes
    last; the remaining images are ordered from lowest to highest score.
    Expressed in score ranks (1 = highest score, 5 = lowest) the
    resulting order for five images is 4-3-2-1-5.

    Parameters
    ----------
    scores : dict
        Image name -> similarity score between the query and that
        image's caption.

    Returns
    -------
    list
        All image names in the order described above. Names with equal
        scores keep their dictionary order.

    Notes
    -----
    The input dictionary is left untouched (the previous implementation
    deleted every entry from it), and any number of entries is accepted
    rather than exactly five.
    """
    # Stable ascending sort: element 0 is the first minimum, which is
    # exactly the entry the repeated-min approach would set aside.
    ascending = sorted(scores, key=scores.get)
    # move the overall lowest score to the end of the ranking
    return ascending[1:] + ascending[:1]

In [4]:
# evaluation taking idiomatic vs literal info from training data

data = dataA
#data = dataA[dataA["sentence_type"]=="literal"]
#data = dataA[dataA["sentence_type"]=="idiomatic"]

total_acc = total_spearman = 0

for i in range(len(data)):
    current = data.iloc[i]
    # query = full sentence, followed by the five captions in image order
    sentences = [current["sentence"]] + [
        current[f"image{k}_caption"] for k in range(1, 6)
    ]

    scores = sim_scores(current, sentences)
    # ranking strategy depends on the sentence type given in the data
    ranking = rank_images(scores, current["sentence_type"])

    exp_order = current["expected_order"]
    # entries 0 and 1 of the result are accumulated as accuracy / Spearman
    evaluation = evaluation_single(ranking, exp_order)
    total_acc += evaluation[0]
    total_spearman += evaluation[1]

# average both metrics over all evaluated rows
final_acc = total_acc / len(data)
final_spearman = total_spearman / len(data)
print(final_acc)
print(final_spearman)

0.4142857142857143
0.29428571428571426


In [5]:
# entire data:
# final_acc = 0.4142857142857143
# final_spearman = 0.29428571428571426

# only literal:
# final_acc = 0.6774193548387096
# final_spearman = 0.3064516129032258

# only idiomatic:
# final_acc = 0.20512820512820512
# final_spearman = 0.2846153846153845

In [6]:
# not comparing sentence but only compound
# with / without special ranking for idiomatic

data = dataA
#data = dataA[dataA["sentence_type"]=="literal"]
#data = dataA[dataA["sentence_type"]=="idiomatic"]

total_acc = total_spearman = 0

for i in range(len(data)):
    current = data.iloc[i]
    # query = the bare compound instead of the whole sentence
    sentences = [current["compound"]] + [
        current[f"image{k}_caption"] for k in range(1, 6)
    ]

    scores = sim_scores(current, sentences)
    # switch between the two lines below to toggle the special idiomatic ranking
    # ranking = rank_images(scores, current["sentence_type"])
    ranking = rank_literal(scores)

    exp_order = current["expected_order"]
    # entries 0 and 1 of the result are accumulated as accuracy / Spearman
    evaluation = evaluation_single(ranking, exp_order)
    total_acc += evaluation[0]
    total_spearman += evaluation[1]

# average both metrics over all evaluated rows
final_acc = total_acc / len(data)
final_spearman = total_spearman / len(data)
print(final_acc)
print(final_spearman)

0.4
0.16714285714285712


In [7]:
# with special idiomatic ranking

# entire data:
# final_acc = 0.5428571428571428
# final_spearman = 0.35285714285714287

# only literal:
# final_acc = 0.8064516129032258
# final_spearman = 0.39354838709677425

# only idiomatic:
# final_acc = 0.3333333333333333
# final_spearman = 0.3205128205128204

In [8]:
# without special idiomatic ranking

# entire data:
# final_acc = 0.4
# final_spearman = 0.16714285714285712

# only literal:
# final_acc = 0.8064516129032258
# final_spearman = 0.39354838709677425

# only idiomatic:
# final_acc = 0.07692307692307693
# final_spearman = -0.012820512820512832

In [9]:
# only use certain part of caption

# To delete "The image depicts" from all captions
def cut_beginning(caption):
    """Drop the first three space-separated tokens of `caption`.

    Intended to strip a leading "The image depicts "; note that the first
    three tokens are removed whatever they are. Returns an empty string
    when the caption contains fewer than three spaces.
    """
    # split at the first three spaces only; anything after them is the remainder
    pieces = caption.split(" ", 3)
    if len(pieces) < 4:
        return ""
    return pieces[3]

def cut_caption_chars(caption, num_chars):
    """Shorten a caption to roughly `num_chars` characters, ending on a period.

    The caption is first passed through `cut_beginning`. It is then cut
    right after the first "." found at or beyond position `num_chars`.

    Parameters
    ----------
    caption : str
        Full image caption.
    num_chars : int
        Minimum number of characters to keep before looking for the end
        of a sentence. Negative values are treated as 0.

    Returns
    -------
    str
        The shortened caption including the closing period. If no period
        occurs at or after `num_chars` (or the caption is shorter than
        `num_chars`), the whole remaining caption is returned instead of
        being chopped mid-sentence, matching how `cut_caption_words`
        handles a missing period.
    """
    caption = cut_beginning(caption)
    # search from num_chars onwards; str.find reports absolute positions
    period_at = caption.find(".", max(num_chars, 0))
    if period_at == -1:
        # no sentence end left: keep everything rather than truncating
        return caption
    return caption[:period_at + 1]

def cut_caption_words(caption, num_words):
    """Shorten a caption to about `num_words` words, ending on a full sentence.

    The caption is first passed through `cut_beginning` and split on
    single spaces. If it has at most `num_words` tokens it is returned
    unchanged. Otherwise the first `num_words` tokens are kept and, unless
    the last kept token already ends with ".", further tokens are added
    until one ending with "." has been included (empty tokens produced by
    consecutive spaces are skipped while extending).
    """
    trimmed = cut_beginning(caption)
    tokens = trimmed.split(" ")

    if len(tokens) <= num_words:
        return trimmed

    kept = tokens[:num_words]

    # an empty last token never counts as a sentence end
    if not kept[-1].endswith("."):
        for token in tokens[num_words:]:
            if token == "":
                continue
            kept.append(token)
            if token[-1] == ".":
                # reached the end of the current sentence
                break

    return " ".join(kept)

def cut_caption_sents(caption, num_sent):
    """Keep only the first `num_sent` period-separated pieces of a caption.

    The caption is first passed through `cut_beginning`, then split on
    ".", truncated to `num_sent` pieces and joined with "." again. The
    period that followed the last kept piece is therefore not part of the
    result.
    """
    pieces = cut_beginning(caption).split(".")
    return ".".join(pieces[:num_sent])

In [10]:
# cutting caption by sentences

data = dataA

# number of kept sentences -> averaged metric
accuracy = {}
spearman = {}

for x in range(1, 10):
    total_acc = total_spearman = 0

    for i in range(len(data)):
        current = data.iloc[i]
        # compound as query; every caption reduced to its first x sentences
        sentences = [current["compound"]] + [
            cut_caption_sents(current[f"image{k}_caption"], x)
            for k in range(1, 6)
        ]

        scores = sim_scores(current, sentences)
        ranking = rank_images(scores, current["sentence_type"])

        exp_order = current["expected_order"]
        # entries 0 and 1 of the result are accumulated as accuracy / Spearman
        evaluation = evaluation_single(ranking, exp_order)
        total_acc += evaluation[0]
        total_spearman += evaluation[1]

    # store the averages for this cut-off
    final_acc = total_acc / len(data)
    final_spearman = total_spearman / len(data)
    accuracy[x] = final_acc
    spearman[x] = final_spearman

In [11]:
# cut_caption_sents, compound, ranking separate by sentence type, entire data
# best results

# 4 sentences:
# final_acc = 0.5428571428571428
# final_spearman = 0.37428571428571417

# 9 sentences:
# final_acc = 0.5428571428571428
# final_spearman = 0.40428571428571425

# best result when using entire caption

In [13]:
# cutting caption by words

data = dataA

# number of kept words -> averaged metric
accuracy = {}
spearman = {}

# word cut-offs 1, 11, 21, ..., 211
for x in range(1, 212, 10):
    total_acc = total_spearman = 0

    for i in range(len(data)):
        current = data.iloc[i]
        # compound as query; every caption reduced to roughly x words
        sentences = [current["compound"]] + [
            cut_caption_words(current[f"image{k}_caption"], x)
            for k in range(1, 6)
        ]

        scores = sim_scores(current, sentences)
        ranking = rank_images(scores, current["sentence_type"])

        exp_order = current["expected_order"]
        # entries 0 and 1 of the result are accumulated as accuracy / Spearman
        evaluation = evaluation_single(ranking, exp_order)
        total_acc += evaluation[0]
        total_spearman += evaluation[1]

    # store the averages for this cut-off
    final_acc = total_acc / len(data)
    final_spearman = total_spearman / len(data)
    accuracy[x] = final_acc
    spearman[x] = final_spearman

In [None]:
# cut_caption_words, compound, ranking separate by sentence type, entire data
# best results

# 61 words:
# final_acc = 0.5428571428571428 -> same as no cut
# final_spearman = 0.38142857142857134

# 161 words:
# final_acc = 0.5714285714285714 -> better than no cut
# final_spearman = 0.37142857142857133