In [1]:
import pickle
import pandas as pd
from utils.eval_metrics import evaluate_performance_metric
from IPython.display import display
from utils.analysis_helpers import load_performances

### Task performance

In [2]:
# Baseline runs without any machine translation. Each pair is
# (task_language, prompt_language), matching the columns of the table that
# is displayed below: three monolingual settings first, then the four
# cross-lingual ones.
untranslated_pairs = (
    ("en", "en"), ("de", "de"), ("zh", "zh"),
    ("en", "de"), ("en", "zh"),
    ("de", "en"), ("zh", "en"),
)
no_translation = load_performances(untranslated_pairs)
display(no_translation)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,paws-x,en,en,0.7655
1,paws-x,de,de,0.7075
2,paws-x,zh,zh,0.5985
3,paws-x,en,de,0.749
4,paws-x,en,zh,0.6695
5,paws-x,de,en,0.7265
6,paws-x,zh,en,0.6785
7,xnli,en,en,0.705589
8,xnli,de,de,0.481038
9,xnli,zh,zh,0.560878


In [12]:
# Original-language task data paired with a machine-translated prompt:
# the second element (prompt_language) carries the "*-translation" tag.
translated_prompt_pairs = (
    ("en", "de_from_en-translation"),
    ("en", "zh_from_en-translation"),
    ("de", "en_from_de-translation"),
    ("zh", "en_from_zh-translation"),
)
instruction = load_performances(translated_prompt_pairs)
display(instruction)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,paws-x,en,de_from_en-translation,0.768
1,paws-x,en,zh_from_en-translation,0.753
2,paws-x,de,en_from_de-translation,0.723
3,paws-x,zh,en_from_zh-translation,0.6715
4,xnli,en,de_from_en-translation,0.625948
5,xnli,en,zh_from_en-translation,0.625948
6,xnli,de,en_from_de-translation,0.636327
7,xnli,zh,en_from_zh-translation,0.589022


In [13]:
# Machine-translated task data paired with an untranslated prompt:
# here the first element (task_language) carries the "*-translation" tag.
translated_task_pairs = (
    ("de_from_en-translation", "en"),
    ("zh_from_en-translation", "en"),
    ("en_from_de-translation", "de"),
    ("en_from_zh-translation", "zh"),
)
task = load_performances(translated_task_pairs)
display(task)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,paws-x,de_from_en-translation,en,0.769
1,paws-x,zh_from_en-translation,en,0.6965
2,paws-x,en_from_de-translation,de,0.701
3,paws-x,en_from_zh-translation,zh,0.63
4,xnli,de_from_en-translation,en,0.671058
5,xnli,zh_from_en-translation,en,0.615768
6,xnli,en_from_de-translation,de,0.494411
7,xnli,en_from_zh-translation,zh,0.558084


In [14]:
# Task data and prompt are both machine-translated, using the same
# translation tag for task_language and prompt_language.
translation_tags = (
    "de_from_en-translation",
    "zh_from_en-translation",
    "en_from_de-translation",
    "en_from_zh-translation",
)
both = load_performances(tuple((tag, tag) for tag in translation_tags))
display(both)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,paws-x,de_from_en-translation,de_from_en-translation,0.756
1,paws-x,zh_from_en-translation,zh_from_en-translation,0.6625
2,paws-x,en_from_de-translation,en_from_de-translation,0.727
3,paws-x,en_from_zh-translation,en_from_zh-translation,0.68
4,xnli,de_from_en-translation,de_from_en-translation,0.599002
5,xnli,zh_from_en-translation,zh_from_en-translation,0.596806
6,xnli,en_from_de-translation,en_from_de-translation,0.645309
7,xnli,en_from_zh-translation,en_from_zh-translation,0.610778


In [12]:
# Spanish and French: for each language, four (task_language, prompt_language)
# settings in this order -- untranslated, both translated to English,
# translated task only, translated prompt only.
es_fr_pairs = tuple(
    pair
    for lang in ("es", "fr")
    for pair in (
        (lang, lang),
        (f"en_from_{lang}-translation", f"en_from_{lang}-translation"),
        (f"en_from_{lang}-translation", lang),
        (lang, f"en_from_{lang}-translation"),
    )
)
es_fr = load_performances(es_fr_pairs)
display(es_fr)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,paws-x,es,es,0.721
1,paws-x,en_from_es-translation,en_from_es-translation,0.731
2,paws-x,en_from_es-translation,es,0.7075
3,paws-x,es,en_from_es-translation,0.7315
4,paws-x,fr,fr,0.718
5,paws-x,en_from_fr-translation,en_from_fr-translation,0.7245
6,paws-x,en_from_fr-translation,fr,0.713
7,paws-x,fr,en_from_fr-translation,0.7175
8,xnli,es,es,0.600998
9,xnli,en_from_es-translation,en_from_es-translation,0.650699


In [4]:
# BoolQ only: the English original plus the German and Chinese versions
# translated from English (task and prompt share the same language tag).
# NOTE(review): this rebinds `no_translation`, which an earlier cell already
# uses for the paws-x / xnli baselines -- consider a distinct name.
boolq_pairs = (
    ("en", "en"),
    ("de_from_en-translation", "de_from_en-translation"),
    ("zh_from_en-translation", "zh_from_en-translation"),
)
no_translation = load_performances(boolq_pairs, subtasks=["boolq"])
display(no_translation)

Unnamed: 0,subtask,task_language,prompt_language,performance
0,boolq,en,en,0.859939
1,boolq,de_from_en-translation,de_from_en-translation,0.82263
2,boolq,zh_from_en-translation,zh_from_en-translation,0.777064


### Translation performance 

In [6]:
# Generation settings; they are only used to rebuild the name of the
# directory in which the translation scores are stored.
temperature, top_p, max_tokens = 0.25, 1.0, 2048

# Column-oriented accumulator for the translation metrics: one independent,
# initially empty list per output column.
results_dict = {
    column: []
    for column in ("subtask", "language", "bleu", "rouge1", "rouge2", "rouge-l", "comet")
}

In [7]:
# Collect the stored translation-quality scores for every subtask and
# translation direction into `results_dict` (one entry per column for each
# score file that could be read).

# Directory that holds the translations of each subtask.
subtask_paths = {
    "paws-x": "translations_gpt-turbo-0301/task/individual_tasks/paws-x/",
    "xnli": "translations_gpt-turbo-0301/task/xglue/xnli/",
}
metric_names = ("bleu", "rouge1", "rouge2", "rouge-l", "comet")

# Start from empty columns so that re-running this cell does not append
# duplicate rows to the lists in `results_dict`.
for column in results_dict.values():
    column.clear()

for subtask, path in subtask_paths.items():

    for language in ["de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]:

        # The run directory name encodes the generation settings.
        specific_path = f"{path}temp-{temperature}_topp-{top_p}_maxt-{max_tokens}/{language}/"
        scores_path = specific_path + "translation_scores.pkl"

        try:
            # NOTE: unpickling can execute arbitrary code -- only load score
            # files that were produced by this project.
            with open(scores_path, "rb") as f:
                scores = pickle.load(f)
            # Look up every metric before touching results_dict, so a missing
            # metric can never leave the columns with unequal lengths.
            row = {metric: scores[metric] for metric in metric_names}
        except FileNotFoundError:
            # Best effort: combinations without a score file are skipped.
            print(f"Skipping {subtask}/{language}: {scores_path} not found")
            continue
        except KeyError as missing_metric:
            print(f"Skipping {subtask}/{language}: score file lacks metric {missing_metric}")
            continue

        results_dict["subtask"].append(subtask)
        results_dict["language"].append(language)
        for metric, value in row.items():
            results_dict[metric].append(value)

In [8]:
# Turn the collected score columns into a table and print the paws-x rows.
results_df = pd.DataFrame(results_dict)
paws_x_rows = results_df.loc[results_df["subtask"].eq("paws-x")]
print(paws_x_rows.sort_values(by=["subtask"], ascending=False))
# Optional LaTeX export: results_df.to_latex(float_format="%.3f")

  subtask    language       bleu    rouge1    rouge2   rouge-l     comet
0  paws-x  de_from_en  56.482763  0.800305  0.641972  0.766555  0.893909
1  paws-x  zh_from_en  49.226764  0.675349  0.419873  0.620782  0.863570
2  paws-x  en_from_de  60.027835  0.874564  0.718083  0.829756  0.878960
3  paws-x  en_from_zh  37.562532  0.733436  0.489468  0.662369  0.850904


In [9]:
# Rebuild the score table from the collected columns and print the xnli rows.
results_df = pd.DataFrame(results_dict)
xnli_rows = results_df.loc[results_df["subtask"].eq("xnli")]
print(xnli_rows.sort_values(by=["subtask"], ascending=False))
# Optional LaTeX export: results_df.to_latex(float_format="%.3f")

  subtask    language       bleu    rouge1    rouge2   rouge-l     comet
4    xnli  de_from_en  41.407608  0.706171  0.518325  0.679977  0.877839
5    xnli  zh_from_en  43.478552  0.658971  0.393526  0.623633  0.869904
6    xnli  en_from_de  45.807534  0.758807  0.569625  0.737338  0.890955
7    xnli  en_from_zh  27.967035  0.613693  0.368344  0.574501  0.856795
