In [1]:
import os
import pickle
import evaluate
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils.task_config import task_config
from utils.analysis_helpers import get_table, get_consistency_matrix, load_standardized_response

# Consistencies PAWS-X

In [2]:
# Every (task, prompt) combination of the original English data and its
# "de_from_en-translation" variant, in the order en/en, en/de, de/en, de/de.
# (Pair meaning presumably task language first, prompt language second --
# the resulting table index is labelled "task/prompt_1".)
variants = ("en", "de_from_en-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "individual_tasks", "paws-x")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "en / en" column, ascending.
table = get_table(consistencies_df).sort_values(by="en / en")

maximum % of invalid responses 0.75


In [3]:
# Show the "en / en" column of the current consistency table.
print(table["en / en"])

task/prompt_1
de_from_en-translation / de_from_en-translation    0.8355
de_from_en-translation / en                        0.8525
en / de_from_en-translation                        0.9305
en / en                                            1.0000
Name: en / en, dtype: float64


In [4]:
# Every (task, prompt) combination of the original English data and its
# "zh_from_en-translation" variant, in the order en/en, en/zh, zh/en, zh/zh.
variants = ("en", "zh_from_en-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "individual_tasks", "paws-x")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "en / en" column, ascending.
table = get_table(consistencies_df).sort_values(by="en / en")

maximum % of invalid responses 0.15


In [5]:
# Show the "en / en" column of the current consistency table.
print(table["en / en"])

task/prompt_1
zh_from_en-translation / zh_from_en-translation    0.7600
zh_from_en-translation / en                        0.7910
en / zh_from_en-translation                        0.9085
en / en                                            1.0000
Name: en / en, dtype: float64


In [6]:
# Every (task, prompt) combination of the original German data and its
# "en_from_de-translation" variant, in the order de/de, de/en, en/de, en/en.
variants = ("de", "en_from_de-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "individual_tasks", "paws-x")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "de / de" column, ascending.
table = get_table(consistencies_df).sort_values(by="de / de")

maximum % of invalid responses 0.8999999999999999


In [7]:
# Show the "de / de" column of the current consistency table.
print(table["de / de"])

task/prompt_1
en_from_de-translation / en_from_de-translation    0.8595
en_from_de-translation / de                        0.8645
de / en_from_de-translation                        0.9325
de / de                                            1.0000
Name: de / de, dtype: float64


In [8]:
# Every (task, prompt) combination of the original Chinese data and its
# "en_from_zh-translation" variant, in the order zh/zh, zh/en, en/zh, en/en.
variants = ("zh", "en_from_zh-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "individual_tasks", "paws-x")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "zh / zh" column, ascending.
table = get_table(consistencies_df).sort_values(by="zh / zh")

maximum % of invalid responses 0.25


In [9]:
# Show the "zh / zh" column of the current consistency table.
print(table["zh / zh"])

task/prompt_1
en_from_zh-translation / en_from_zh-translation    0.6995
en_from_zh-translation / zh                        0.7500
zh / en_from_zh-translation                        0.8695
zh / zh                                            1.0000
Name: zh / zh, dtype: float64


In [10]:
# Cross-lingual comparison on the original (non-translated) PAWS-X data:
# the four en/de combinations followed by the three that involve zh.
task_instruction = [
    ("en", "en"), ("en", "de"), ("de", "en"), ("de", "de"),
    ("en", "zh"), ("zh", "en"), ("zh", "zh"),
]

consistencies_pawsx = get_consistency_matrix(task_instruction, "individual_tasks", "paws-x")
consistencies_pawsx_df = pd.DataFrame(consistencies_pawsx)
table = get_table(consistencies_pawsx_df)

maximum % of invalid responses 0.5499999999999999


In [11]:
# Rebuild the PAWS-X table and show its "en / en" column with the rows in
# index order.
table = get_table(consistencies_pawsx_df)
table_by_index = table.sort_index()
print(table_by_index["en / en"])

task/prompt_1
de / de    0.7965
de / en    0.8080
en / de    0.9255
en / en    1.0000
en / zh    0.8320
zh / en    0.7405
zh / zh    0.7090
Name: en / en, dtype: float64


In [15]:
# Rebuild the PAWS-X table and show its "de / de" column with the rows in
# index order.
table = get_table(consistencies_pawsx_df)
table_by_index = table.sort_index()
print(table_by_index["de / de"])

task/prompt_1
de / de    1.0000
de / en    0.9220
en / de    0.8170
en / en    0.7965
en / zh    0.8110
zh / en    0.7430
zh / zh    0.7355
Name: de / de, dtype: float64


In [16]:
# Rebuild the PAWS-X table and show its "zh / zh" column with the rows in
# index order.
table = get_table(consistencies_pawsx_df)
table_by_index = table.sort_index()
print(table_by_index["zh / zh"])

task/prompt_1
de / de    0.7355
de / en    0.7185
en / de    0.7365
en / en    0.7090
en / zh    0.8000
zh / en    0.8335
zh / zh    1.0000
Name: zh / zh, dtype: float64


# Consistencies XNLI

In [12]:
# Every (task, prompt) combination of the original English data and its
# "de_from_en-translation" variant, in the order en/en, en/de, de/en, de/de.
variants = ("en", "de_from_en-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "xglue", "xnli")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "en / en" column, ascending.
table = get_table(consistencies_df).sort_values(by="en / en")

maximum % of invalid responses 0


In [13]:
# Show the "en / en" column of the current consistency table.
print(table["en / en"])

task/prompt_1
de_from_en-translation / de_from_en-translation    0.734930
de_from_en-translation / en                        0.761078
en / de_from_en-translation                        0.813373
en / en                                            1.000000
Name: en / en, dtype: float64


In [14]:
# Every (task, prompt) combination of the original English data and its
# "zh_from_en-translation" variant, in the order en/en, en/zh, zh/en, zh/zh.
variants = ("en", "zh_from_en-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "xglue", "xnli")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "en / en" column, ascending.
table = get_table(consistencies_df).sort_values(by="en / en")

maximum % of invalid responses 0


In [15]:
# Show the "en / en" column of the current consistency table.
print(table["en / en"])

task/prompt_1
zh_from_en-translation / zh_from_en-translation    0.669062
zh_from_en-translation / en                        0.714770
en / zh_from_en-translation                        0.767465
en / en                                            1.000000
Name: en / en, dtype: float64


In [16]:
# Every (task, prompt) combination of the original German data and its
# "en_from_de-translation" variant, in the order de/de, de/en, en/de, en/en.
variants = ("de", "en_from_de-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "xglue", "xnli")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "de / de" column, ascending.
table = get_table(consistencies_df).sort_values(by="de / de")

maximum % of invalid responses 0.059880239520958084


In [17]:
# Show the "de / de" column of the current consistency table.
print(table["de / de"])

task/prompt_1
en_from_de-translation / en_from_de-translation    0.633134
de / en_from_de-translation                        0.694810
en_from_de-translation / de                        0.805589
de / de                                            1.000000
Name: de / de, dtype: float64


In [18]:
# Every (task, prompt) combination of the original Chinese data and its
# "en_from_zh-translation" variant, in the order zh/zh, zh/en, en/zh, en/en.
variants = ("zh", "en_from_zh-translation")
task_instruction = [(task, prompt) for task in variants for prompt in variants]

consistencies = get_consistency_matrix(task_instruction, "xglue", "xnli")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "zh / zh" column, ascending.
table = get_table(consistencies_df).sort_values(by="zh / zh")

maximum % of invalid responses 0.01996007984031936


In [19]:
# Show the "zh / zh" column of the current consistency table.
print(table["zh / zh"])

task/prompt_1
en_from_zh-translation / en_from_zh-translation    0.666068
en_from_zh-translation / zh                        0.719760
zh / en_from_zh-translation                        0.786826
zh / zh                                            1.000000
Name: zh / zh, dtype: float64


In [20]:
# Cross-lingual comparison on the original (non-translated) XNLI data:
# the four en/de combinations followed by the three that involve zh.
task_instruction = [
    ("en", "en"), ("en", "de"), ("de", "en"), ("de", "de"),
    ("en", "zh"), ("zh", "en"), ("zh", "zh"),
]

consistencies_xnli = get_consistency_matrix(task_instruction, "xglue", "xnli")
consistencies_xnli_df = pd.DataFrame(consistencies_xnli)
table = get_table(consistencies_xnli_df)

maximum % of invalid responses 0.059880239520958084


In [21]:
# Rebuild the XNLI table and show its "en / en" column.
table = get_table(consistencies_xnli_df)
en_column = table["en / en"]
print(en_column)

task/prompt_1
de / de    0.610778
de / en    0.723952
en / de    0.690020
en / en    1.000000
en / zh    0.762475
zh / en    0.651697
zh / zh    0.651098
Name: en / en, dtype: float64


In [22]:
# Rebuild the XNLI table and show its "de / de" column.
table = get_table(consistencies_xnli_df)
de_column = table["de / de"]
print(de_column)

task/prompt_1
de / de    1.000000
de / en    0.650100
en / de    0.789222
en / en    0.610778
en / zh    0.646707
zh / en    0.585828
zh / zh    0.587226
Name: de / de, dtype: float64


In [23]:
# Rebuild the XNLI table and show its "zh / zh" column.
table = get_table(consistencies_xnli_df)
zh_column = table["zh / zh"]
print(zh_column)

task/prompt_1
de / de    0.587226
de / en    0.612974
en / de    0.617764
en / en    0.651098
en / zh    0.695210
zh / en    0.731138
zh / zh    1.000000
Name: zh / zh, dtype: float64


# Consistencies BOOLQ

In [2]:
# For BoolQ only the matched settings are compared: the same variant is used
# for both elements of each pair (original English and its two translations).
task_instruction = [
    (variant, variant)
    for variant in ("en", "de_from_en-translation", "zh_from_en-translation")
]

consistencies = get_consistency_matrix(task_instruction, "super_glue", "boolq")
consistencies_df = pd.DataFrame(consistencies)

# Rows ordered by their value in the "en / en" column, ascending.
table = get_table(consistencies_df).sort_values(by="en / en")

maximum % of invalid responses 1.9877675840978593


In [3]:
# Show the "en / en" column of the current consistency table.
print(table["en / en"])

task/prompt_1
zh_from_en-translation / zh_from_en-translation    0.806116
de_from_en-translation / de_from_en-translation    0.892966
en / en                                            1.000000
Name: en / en, dtype: float64


# Relation between consistency / translation quality / correctness

## Evaluations on translations of the input sentences with English instruction

In [2]:
import load_task
import json
import numpy as np

In [3]:
def init_task_translation_dict():
    """Return a fresh, empty result container keyed by subtask and translation direction.

    The result has the shape ``{subtask: {direction: []}}`` for the subtasks
    "paws-x" and "xnli" and the directions "de_from_en", "zh_from_en",
    "en_from_de" and "en_from_zh". Every call builds new dicts and new
    (independent) empty lists.
    """
    directions = ("de_from_en", "zh_from_en", "en_from_de", "en_from_zh")
    return {
        subtask: {direction: [] for direction in directions}
        for subtask in ("paws-x", "xnli")
    }

def init_task_language_dict():
    """Return a fresh, empty result container keyed by subtask and language.

    The result has the shape ``{subtask: {language: []}}`` for the subtasks
    "paws-x" and "xnli" and the languages "en", "de" and "zh". Every call
    builds new dicts and new (independent) empty lists.
    """
    languages = ("en", "de", "zh")
    return {
        subtask: {language: [] for language in languages}
        for subtask in ("paws-x", "xnli")
    }

In [4]:
# Two independent containers, filled by the loop over source languages:
# the mapped predictions and their 0/1 agreement with the gold labels.
predicted_labels_source, predictions_eval_source = (
    init_task_language_dict(),
    init_task_language_dict(),
)

In [None]:
# Collect predictions for the untranslated test sets, where the same language
# code is passed for both language arguments of load_standardized_response.
for collection, subtask in [("individual_tasks", "paws-x"), ("xglue", "xnli")]:
    for language in ["en", "de", "zh"]:
        test_data = load_task.load_test_split(collection, subtask, language)
        labels = test_data["label"]

        # The -1 -> -1 entry is written into the object stored in task_config
        # (no copy is made here), so it remains visible to later lookups of the
        # same label map. Presumably -1 marks an invalid response -- TODO confirm.
        label_map = task_config[collection][subtask]["label_map"][language]
        label_map[-1] = -1

        responses = load_standardized_response(
            collection, subtask, language, language, temperature=0.25
        )

        predicted_labels = []
        for response in responses:
            predicted_labels.append(label_map[response])
        predicted_labels_source[subtask][language] = predicted_labels

        # 1 where the prediction equals the gold label, 0 otherwise.
        predictions_eval_source[subtask][language] = [
            int(predicted_labels[i] == labels[i]) for i in range(len(labels))
        ]

In [None]:
# Evaluate the model on translated task inputs (the prompt language passed to
# load_standardized_response is the source language) and collect
# translation-quality scores for the same examples.
#   predicted_labels_trans / predictions_eval_trans: mapped predictions and
#       their 0/1 agreement with the source-language gold labels
#   comet: first element of the stored "comet_all" entry per direction
#   bleu:  sentence-level BLEU scores, loaded from a cache at the end
predicted_labels_trans = init_task_translation_dict()
predictions_eval_trans = init_task_translation_dict()
comet = init_task_translation_dict()
bleu = init_task_translation_dict()  # placeholder; replaced by the cached scores below


for collection, subtask in [("individual_tasks", "paws-x"), ("xglue", "xnli")]:
    
    trans_path = "translations_gpt-turbo-0301/task/" + collection + "/" + subtask + "/temp-0.25_topp-1.0_maxt-2048/"
    # only needed by the commented-out BLEU computation below
    sentence_keys = task_config[collection][subtask]["sentence_keys"]
    
    for language in ["de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]:
        
        # "<target>_from_<source>": first two characters are the target
        # language, last two the source language
        source_language = language[-2:]
        target_language = language[0:2]
        
        test_data_source = load_task.load_test_split(collection, subtask, source_language)
        # only needed by the commented-out BLEU computation below
        test_data_target = load_task.load_test_split(collection, subtask, target_language)
        label_map = task_config[collection][subtask]["label_map"][source_language]
        # Same -1 -> -1 entry the other evaluation cells add to their label
        # maps; setting it here as well means this cell no longer depends on
        # another cell having modified the shared task_config entry first.
        label_map[-1] = -1
        labels = test_data_source["label"]        
        
        # get comet scores
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load score files produced by this project.
        with open(trans_path + language + "/translation_scores.pkl", "rb") as f:
            translation_scores = pickle.load(f)
        comet[subtask][language] = translation_scores["comet_all"][0]
        
        # calculate bleu per sentence: done once, results stored and loaded below
        
        # data_path = (
        #     "translations_gpt-turbo-0301/task/" + str(collection) + "/" + str(subtask) + "/" +
        #     "temp-0.25_topp-1.0_maxt-2048/" + language + "/"
        # )
        # print("Use translated inputs: ", data_path)
        # with open(data_path + "dataset.pkl", "rb") as f:
        #     test_data_translation = pickle.load(f)
        # 
        # mt = []
        # references = []
        # for key in sentence_keys: 
        #     references += test_data_target[key]
        #     mt += test_data_translation[key]
        #     
        # bleu_metric = evaluate.load("sacrebleu")
        # if target_language == "zh":
        #     tokenize = "zh"
        # else:
        #     tokenize = "13a"
        # 
        # for i in tqdm(range(len(mt))):
        #     score = bleu_metric.compute(predictions=[mt[i]], references=[references[i]], tokenize=tokenize)
        #     bleu[subtask][language].append(score["score"])
        
        responses = load_standardized_response(collection, 
                                               subtask, 
                                               language + "-translation", 
                                               source_language, 
                                               temperature=0.25)
        
        predictions = [label_map[response] for response in responses]
        predicted_labels_trans[subtask][language] = predictions
        # 1 where the prediction equals the gold label, 0 otherwise
        predictions_eval = [int(predictions[i] == labels[i]) for i in range(len(labels))]
        predictions_eval_trans[subtask][language] = predictions_eval

# load calculated bleu scores instead of calculating every time (commented part above)
# The loaded object must be bound to `bleu`: previously the result of
# pickle.load was discarded, leaving `bleu` as a dict of empty lists.
# Assumes the cache has the same {subtask: {direction: [scores]}} layout that
# the commented-out computation builds -- TODO confirm against the file.
with open("translations_gpt-turbo-0301/all_bleu_scores.pkl", "rb") as f: 
    bleu = pickle.load(f)

In [32]:
# Relate per-example translation quality (sentence-level BLEU) to
#   * consistency: translated-input prediction equals source-language prediction
#   * performance: translated-input prediction equals the gold label
# and report the consistency restricted to examples above a BLEU threshold.
translation_directions = ["de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]

# Cut-off applied to the averaged BLEU score; constant for all subtasks and
# directions, so it is set once instead of inside the loop.
threshold = 50

correlations = {}
consistency_thresholded = {}

for subtask in ["paws-x", "xnli"]:
    
    correlations[subtask] = {direction: {} for direction in translation_directions}
    consistency_thresholded[subtask] = {direction: {} for direction in translation_directions}
    
    for language in translation_directions:
        
        # the last two characters of the direction name are the source language
        pred_source = predicted_labels_source[subtask][language[-2:]]
        n = len(pred_source)

        # average scores for input sentence 1 and input sentence 2
        # (the first n entries are combined with the following n entries)
        bleu_all = np.array(bleu[subtask][language])
        bleu_mean = (bleu_all[0:n] + bleu_all[n:n*2]) / 2
        trans_scores = bleu_mean
        
        pred_trans = predicted_labels_trans[subtask][language]
        eval_trans = predictions_eval_trans[subtask][language]
        consistency = [int(pred_source[i] == pred_trans[i]) for i in range(n)]
        
        # consistency among examples scoring above the threshold, and the
        # fraction of all examples that fall into that subset
        consistency_subset = np.array(consistency)[trans_scores > threshold]
        consistency_thresholded[subtask][language]["consistency"] = np.mean(consistency_subset).round(3)
        consistency_thresholded[subtask][language]["percent"] = round(len(consistency_subset) / len(consistency), 3) 

        # correlation coefficient is the off-diagonal entry of the 2x2 matrix;
        # the values stay wrapped in one-element lists, as before
        corr_consistency = np.corrcoef(consistency, trans_scores)
        corr_performance = np.corrcoef(eval_trans, trans_scores)
        correlations[subtask][language]["translations_consistency"] = [corr_consistency[0,1].round(3)]
        correlations[subtask][language]["translations_performance"] = [corr_performance[0,1].round(3)]

In [33]:
# One table per subtask: rows are the two correlation kinds, columns the
# translation directions.
for header, subtask in [("--- paws-x ---", "paws-x"), ("\n--- xnli ---", "xnli")]:
    print(header)
    print(pd.DataFrame(correlations[subtask]))

--- paws-x ---
                         de_from_en zh_from_en en_from_de en_from_zh
translations_consistency    [0.024]    [0.067]    [0.064]    [0.027]
translations_performance    [0.084]    [0.071]    [0.068]    [0.044]

--- xnli ---
                         de_from_en zh_from_en en_from_de en_from_zh
translations_consistency    [0.025]     [0.02]    [0.048]    [0.085]
translations_performance    [0.007]    [0.017]     [0.03]    [0.045]


In [34]:
# One table per subtask: rows are "consistency" and "percent", columns the
# translation directions.
for header, subtask in [("--- paws-x ---", "paws-x"), ("\n--- xnli ---", "xnli")]:
    print(header)
    print(pd.DataFrame(consistency_thresholded[subtask]))

--- paws-x ---
             de_from_en  zh_from_en  en_from_de  en_from_zh
consistency       0.859       0.818       0.871       0.779
percent           0.566       0.401       0.671       0.206

--- xnli ---
             de_from_en  zh_from_en  en_from_de  en_from_zh
consistency       0.773       0.717       0.823       0.800
percent           0.356       0.323       0.396       0.105


# Percentages on complete translation

In [35]:
# Two independent containers, filled by the following loop: the mapped
# predictions and their 0/1 agreement with the gold labels.
predictions_complete, predictions_eval_complete = (
    init_task_translation_dict(),
    init_task_translation_dict(),
)

In [None]:
# Collect predictions for the setting where both language arguments of
# load_standardized_response are the translated variant, plus "en-2nd-run",
# which is loaded with "en" as its second language argument and scored
# against the English test set.
for collection, subtask in [("individual_tasks", "paws-x"), ("xglue", "xnli")]:
    for language in ["en-2nd-run", "de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]:
        is_second_run = language == "en-2nd-run"

        if is_second_run:
            source_language = "en"
            label_map_key = "en"
        else:
            # last two characters of "<target>_from_<source>" are the source language
            source_language = language[-2:]
            label_map_key = language + "-translation"

        test_data_source = load_task.load_test_split(collection, subtask, source_language)
        label_map = task_config[collection][subtask]["label_map"][label_map_key]
        # written into the object stored in task_config (no copy is made)
        label_map[-1] = -1
        labels = test_data_source["label"]

        if is_second_run:
            responses = load_standardized_response(
                collection, subtask, language, "en", temperature=0.25
            )
        else:
            translated = language + "-translation"
            responses = load_standardized_response(
                collection, subtask, translated, translated, temperature=0.25
            )

        predictions = []
        for response in responses:
            predictions.append(label_map[response])
        predictions_complete[subtask][language] = predictions

        # 1 where the prediction equals the gold label, 0 otherwise
        predictions_eval = [int(predictions[i] == labels[i]) for i in range(len(labels))]
        predictions_eval_complete[subtask][language] = predictions_eval

In [43]:
# For every setting, compare its predictions with the source-language
# predictions and report
#   consistent:          share of examples with identical predictions
#   fraction_of_correct: share of correctly answered source examples whose
#                        prediction is unchanged
#   fraction_of_false:   share of wrongly answered source examples whose
#                        prediction is unchanged
percentages_complete = {}

for subtask in ["paws-x", "xnli"]:
    
    percentages_complete[subtask] = {}
    
    for language in ["en-2nd-run", "de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]:
        
        # "en-2nd-run" is compared with the English source run; otherwise the
        # source language is the last two characters of the direction name
        source_language = "en" if language == "en-2nd-run" else language[-2:]
        pred_source = predicted_labels_source[subtask][source_language]
        eval_source = predictions_eval_source[subtask][source_language]
        
        n = len(pred_source)
        
        pred_trans = predictions_complete[subtask][language]
        consistency = [int(pred_source[i] == pred_trans[i]) for i in range(n)]
        
        correct_and_consistent = [int(consistency[i] == 1 and eval_source[i] == 1) for i in range(n)]
        false_and_consistent = [int(consistency[i] == 1 and eval_source[i] == 0) for i in range(n)]
        
        # number of wrongly / correctly answered source examples; a count of
        # zero makes the corresponding fraction nan (numpy division)
        false = np.sum([eval_source[i] == 0 for i in range(n)])
        correct = np.sum([eval_source[i] == 1 for i in range(n)])
        
        # keys inserted in the order in which they are printed
        percentages_complete[subtask][language] = {
            "consistent": np.mean(consistency).round(3),
            "fraction_of_correct": (np.sum(correct_and_consistent) / correct).round(3),
            "fraction_of_false": (np.sum(false_and_consistent) / false).round(3),
        }

In [44]:
# Print one result dict per setting, grouped under a header per subtask.
for header, subtask in [("--- paws-x --- \n", "paws-x"), ("--- xnli --- \n", "xnli")]:
    print(header)
    for language in ["en-2nd-run", "de_from_en", "zh_from_en", "en_from_de", "en_from_zh"]:
        print(percentages_complete[subtask][language])


--- paws-x --- 

{'consistent': 0.986, 'fraction_of_correct': 0.99, 'fraction_of_false': 0.977}
{'consistent': 0.836, 'fraction_of_correct': 0.887, 'fraction_of_false': 0.667}
{'consistent': 0.76, 'fraction_of_correct': 0.776, 'fraction_of_false': 0.708}
{'consistent': 0.86, 'fraction_of_correct': 0.917, 'fraction_of_false': 0.721}
{'consistent': 0.7, 'fraction_of_correct': 0.817, 'fraction_of_false': 0.524}
--- xnli --- 

{'consistent': 0.979, 'fraction_of_correct': 0.988, 'fraction_of_false': 0.958}
{'consistent': 0.735, 'fraction_of_correct': 0.765, 'fraction_of_false': 0.663}
{'consistent': 0.669, 'fraction_of_correct': 0.711, 'fraction_of_false': 0.569}
{'consistent': 0.633, 'fraction_of_correct': 0.829, 'fraction_of_false': 0.452}
{'consistent': 0.666, 'fraction_of_correct': 0.795, 'fraction_of_false': 0.501}
