In [1]:
from comet import download_model, load_from_checkpoint
import sacrebleu
import pickle
import numpy as np
import pandas as pd



In [2]:
import os
# Order CUDA devices by PCI bus ID before selecting one by index.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict this process to the device with index 7.
# NOTE(review): the index is machine-specific — adjust for the host this notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [3]:
def read_file(fname):
    """Return the lines of ``fname`` as a list, each stripped of surrounding whitespace."""
    with open(fname) as handle:
        return [raw_line.strip() for raw_line in handle]

In [4]:
import pickle
# Load the pickled submissions dictionary; later cells index it by language pair
# (e.g. data_dict["en_de"]) and use the inner keys as submission column names.
# NOTE(review): pickle.load can execute arbitrary code — only open this file if it comes from a trusted source.
with open("all_submissions/data_dict.pkl", "rb") as f:
    data_dict = pickle.load(f)

In [5]:
from typing import List

# Referred from https://github.com/amazon-science/doc-mt-metrics/blob/main/Prism/add_context.py
def add_context(orig_txt: List[str], context_same: List[str],
                context_other: List[str], sender_ids: List[str],
                sep_token: str = "</s>", ws: int = 2) -> List[str]:
    """Prefix every segment with up to ``ws`` preceding segments as context.

    For segment ``i`` the context window covers segments ``max(0, i - ws) .. i - 1``.
    A context segment ``j`` is taken from ``context_same`` when it has the same
    sender as segment ``i`` and from ``context_other`` otherwise. The context
    segments and the segment itself are joined with ``" <sep_token> "``.

    Args:
        orig_txt: Segments to augment, in document order.
        context_same: Context text to use for a preceding segment from the same sender.
        context_other: Context text to use for a preceding segment from a different sender.
        sender_ids: Sender identifier of every segment (one per segment).
        sep_token: Separator placed between context segments and the segment.
        ws: Window size, i.e. maximum number of preceding segments to prepend.

    Returns:
        A list with one augmented string per entry of ``orig_txt``.

    Raises:
        Exception: If the four input lists do not all have the same length.
    """
    if not (len(orig_txt) == len(context_same) == len(context_other)):
        raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context_same)={len(context_same)}, len(context_other)={len(context_other)}')
    # sender_ids is indexed with the same positions as the texts; validate it up
    # front instead of failing with an IndexError halfway through the document.
    if len(sender_ids) != len(orig_txt):
        raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(sender_ids)={len(sender_ids)}')

    joiner = " {} ".format(sep_token)
    augm_txt = []
    for i in range(len(orig_txt)):
        context_window = [
            context_same[j] if sender_ids[j] == sender_ids[i] else context_other[j]
            for j in range(max(0, i - ws), i)
        ]
        augm_txt.append(joiner.join(context_window + [orig_txt[i]]))
    return augm_txt

class DocCometMetric():
  """Wrapper around a COMET checkpoint with context mode enabled.

  The checkpoint named by ``model_name`` is downloaded and loaded, then
  ``enable_context()`` is called on the loaded model.

  Args:
    model_name: Checkpoint identifier passed to ``download_model``.
    batch_size: Batch size forwarded to ``model.predict``.
    ref_based: When True, ``get_score`` builds samples with a ``ref`` field and
      requires ``references``; when False only ``src`` and ``mt`` are sent.
      NOTE(review): the default checkpoint name looks like a QE model while
      ``ref_based`` defaults to True — confirm the intended default.
    device_id: Optional device index. When given it is forwarded to
      ``model.predict`` as ``devices=[device_id]``; when None no ``devices``
      argument is passed.
  """

  def __init__(self, model_name="Unbabel/wmt20-comet-qe-da", batch_size=64, ref_based=True, device_id=None):
    checkpoint_path = download_model(model_name)
    self.model = load_from_checkpoint(checkpoint_path)
    self.batch_size = batch_size
    self.model.enable_context()
    self.ref_based = ref_based
    # Previously get_score read self.device_id without it ever being assigned,
    # so the reference-based branch always raised AttributeError.
    self.device_id = device_id

  def get_score(self, source, outputs, references=None):
    """Score ``outputs`` against ``source`` (and ``references`` if ref-based).

    Args:
      source: Iterable of source strings.
      outputs: Iterable of MT output strings, parallel to ``source``.
      references: Iterable of reference strings; required when the metric was
        created with ``ref_based=True`` and ignored otherwise.

    Returns:
      The ``'scores'`` entry of the model's prediction.

    Raises:
      ValueError: If ``ref_based`` is True and ``references`` is None.
    """
    # Only pin a device when the caller asked for one.
    device_kwargs = {}
    if self.device_id is not None:
      device_kwargs["devices"] = [self.device_id]

    if not self.ref_based:
      samples = [{"mt": y, "src": x} for x, y in zip(source, outputs)]
      return self.model.predict(samples, batch_size=self.batch_size, gpus=1,
                                progress_bar=True, **device_kwargs)['scores']

    if references is None:
      raise ValueError("references must be provided when ref_based=True")
    samples = [{"mt": y, "ref": z, "src": x} for x, y, z in zip(source, outputs, references)]
    return self.model.predict(samples, batch_size=self.batch_size, gpus=1,
                              progress_bar=False, **device_kwargs)['scores']

In [None]:
# Checkpoint used for the "-comet" scores; later cells call ref_metric.predict with src/mt/ref samples.
ref_metric = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
# Context-enabled wrapper used without references (ref_based=False) for the "-context-comet-qe" scores.
context_metric = DocCometMetric(model_name="Unbabel/wmt20-comet-qe-da", batch_size=256, ref_based=False)

In [5]:
def get_scores(df, columns):
    """Compute corpus-level COMET, chrF and BLEU for each hypothesis column.

    Args:
        df: Table with "source" and "reference" columns plus one column per
            entry of ``columns`` holding the hypotheses.
        columns: Names of the hypothesis columns to score.

    Returns:
        ``{column: {"comet": ..., "chrf": ..., "bleu": ...}}``. Scoring is
        best-effort: a column whose scoring fails is reported on stdout and
        keeps whatever metrics were computed before the failure (possibly an
        empty dict), and the remaining columns are still processed.
    """
    score_dict = {}
    for col in columns:
        try:
            score_dict[col] = {}
            hyps = df[col].to_list()
            refs = df["reference"].to_list()
            srcs = df["source"].to_list()
            samples = [{"mt": y, "ref": z, "src": x} for x, y, z in zip(srcs, hyps, refs)]
            score_dict[col]["comet"] = np.mean(
                ref_metric.predict(samples, batch_size=256, gpus=1)['scores']
            )
            score_dict[col]["chrf"] = sacrebleu.corpus_chrf(hyps, [refs]).score
            score_dict[col]["bleu"] = sacrebleu.corpus_bleu(hyps, [refs]).score
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop the run, and say which
            # column was skipped instead of dropping it silently.
            print(f"Skipping column {col!r}: {type(err).__name__}: {err}")
            continue
    return score_dict

In [None]:
import json
import sys
from typing import Any, Callable

import numpy as np

import sys
sys.path.append("metrics/MuDA")
from muda.langs import create_tagger
from muda.metrics import compute_metrics

def read_file(fname):
    """Read ``fname`` and return its lines with leading/trailing whitespace removed."""
    with open(fname) as f:
        return list(map(str.strip, f))


def recursive_map(func: Callable[[Any], Any], obj: Any) -> Any:
    """Apply ``func`` to every leaf of a nested dict/list structure.

    Dicts and lists are rebuilt with the same keys/order; any other object is
    treated as a leaf and replaced by ``func(obj)``.
    """
    if isinstance(obj, dict):
        mapped = {}
        for key, value in obj.items():
            mapped[key] = recursive_map(func, value)
        return mapped
    if isinstance(obj, list):
        return [recursive_map(func, item) for item in obj]
    return func(obj)

from pathlib import Path
def _tag_documents(tagger, srcs, tgts, docids, phenomena):
    """Preprocess (srcs, tgts, docids) with ``tagger`` and tag every resulting document."""
    preproc = tagger.preprocess(srcs, tgts, docids)
    return [tagger.tag(*doc, phenomena=phenomena) for doc in zip(*preproc)]


def get_muda_accuracy_score(
    srcs,
    refs,
    docids,
    tgt_lang="de",
    awesome_align_model="bert-base-multilingual-cased",
    awesome_align_cachedir=None,
    load_refs_tags_file=None,
    cohesion_threshold=3,
    dump_hyps_tags_file=None,
    dump_refs_tags_file=None,
    dump_stats_file=None,
    phenomena=["lexical_cohesion", "formality", "verb_form", "pronouns"],
    hyps=None,
) -> None:
    """Tag references and hypotheses with MuDA and report per-tag precision/recall/F1.

    If ``dump_stats_file`` already exists nothing is recomputed. Otherwise a
    tagger is created for ``tgt_lang``, references are tagged (or loaded from
    ``load_refs_tags_file``), hypotheses are tagged, and ``compute_metrics``
    is run on the two. Per-tag results are printed and, when
    ``dump_stats_file`` is given, written as one JSON object per line.

    Args:
        srcs: Source segments.
        refs: Reference translations, parallel to ``srcs``.
        docids: Document id of every segment.
        tgt_lang: Target language passed to ``create_tagger``.
        awesome_align_model: Alignment model name passed to ``create_tagger``.
        awesome_align_cachedir: Alignment cache directory passed to ``create_tagger``.
        load_refs_tags_file: Optional JSON file with reference tags to load
            instead of tagging ``refs``.
        cohesion_threshold: Passed to ``create_tagger``.
        dump_hyps_tags_file: Optional path for the tagged hypotheses (JSON).
        dump_refs_tags_file: Optional path for the tagged references (JSON);
            only written when the references were tagged in this call.
        dump_stats_file: Optional path for the per-tag statistics (JSON lines);
            also acts as the "already computed" marker.
        phenomena: Phenomena passed to ``tagger.tag``.
        hyps: Hypothesis translations, parallel to ``srcs``.

    Raises:
        ValueError: If the scores must be computed and ``hyps`` is None.
    """
    # Cached result: the stats file doubles as the "already done" marker.
    # Guard against the None default, which Path() cannot handle.
    if dump_stats_file is not None and Path(dump_stats_file).is_file():
        print(f"{dump_stats_file} exists")
        return

    if hyps is None:
        raise ValueError("hyps must be provided to compute MuDA accuracy scores")

    tagger = create_tagger(
        tgt_lang,
        align_model=awesome_align_model,
        align_cachedir=awesome_align_cachedir,
        cohesion_threshold=cohesion_threshold,
    )

    if not load_refs_tags_file:
        tagged_refs = _tag_documents(tagger, srcs, refs, docids, phenomena)
    else:
        # Use a context manager so the file handle is closed deterministically.
        with open(load_refs_tags_file, encoding="utf-8") as f:
            tagged_refs = json.load(f)

    tagged_hyps = _tag_documents(tagger, srcs, hyps, docids, phenomena)

    tag_prec, tag_rec, tag_f1 = compute_metrics(tagged_refs, tagged_hyps)
    stat_dicts = []
    for tag in tag_f1:
        print(
            f"{tag} -- Prec: {tag_prec[tag]:.2f} Rec: {tag_rec[tag]:.2f} F1: {tag_f1[tag]:.2f}"
        )
        stat_dicts.append(
            {
                "tag": tag,
                "precision": tag_prec[tag],
                "recall": tag_rec[tag],
                "f1": tag_f1[tag],
            }
        )
    if dump_stats_file is not None:
        with open(dump_stats_file, "w") as f:
            for d in stat_dicts:
                f.write(json.dumps(d, ensure_ascii=False) + "\n")

    if dump_hyps_tags_file:
        with open(dump_hyps_tags_file, "w", encoding="utf-8") as f:
            # Dump the hypothesis tags here; the reference tags were written
            # to this file by mistake before.
            json.dump(recursive_map(lambda t: t._asdict(), tagged_hyps), f, indent=2)

    if not load_refs_tags_file and dump_refs_tags_file:
        with open(dump_refs_tags_file, "w", encoding="utf-8") as f:
            json.dump(recursive_map(lambda t: t._asdict(), tagged_refs), f, indent=2)

In [None]:
# Scores every submission of every language pair with COMET, MuDA and
# context-aware COMET-QE, then writes the segment-level scores to
# all_submissions/<lp>-scores.csv.
score_dict_all_lps = {}
for lp in ['en_nl', 'en_pt', 'en_de', 'en_ko', 'en_fr']:
    # Submission column names are the keys of the pickled dict for this language pair.
    submission_cols = list(data_dict[lp.replace("-", '_')].keys())
    test_df = pd.read_csv(f"all_submissions/{lp.replace('_', '-')}.csv")
    # Replace missing cells with empty strings.
    test_df.fillna('', inplace=True)
    # score_dict_all_lps[lp] = get_scores(test_df, submission_cols)

    # MuDA
    # MuDA is run only on the rows whose source language is English.
    df = test_df[test_df.source_language == "en"]
    src_lang, tgt_lang = lp.split("_")

    for col in submission_cols:
        test_df[f"{col}-comet"] = ref_metric.predict([{"mt": y, "ref":z, "src": x} for x, y, z in zip(test_df["source"].to_list(),
                                                                                test_df[col].to_list(),
                                                                                test_df["reference"].to_list())],
                                batch_size=256, gpus=1)['scores']
        get_muda_accuracy_score(
            df["source"].to_list(),
            df["reference"].to_list(),
            df["doc_id"].to_list(),
            hyps=df[col].to_list(),
            tgt_lang=tgt_lang,
            awesome_align_model="bert-base-multilingual-cased",
            awesome_align_cachedir=None,
            dump_hyps_tags_file=f"muda_accuracy_results/{tgt_lang}.{col}.tags.json",
            dump_refs_tags_file=f"muda_accuracy_results/{tgt_lang}.ref.tags.json",
            dump_stats_file=f"muda_accuracy_results/{tgt_lang}.{col}.stats.json",
            phenomena=["lexical_cohesion", "formality", "verb_form", "pronouns"],
            cohesion_threshold=3,
        )

    # Context Comet QE
    # The separator does not depend on the column or document; look it up once.
    sep_token = context_metric.model.encoder.tokenizer.sep_token
    for col in submission_cols:
        doc_dfs = []
        for _, df_group in test_df.groupby(["doc_id"]):
            # Work on a copy so the new columns are not written into a slice of test_df.
            df_group = df_group.copy()
            df_group['seg_id'] = list(range(len(df_group)))
            # Source side: same-sender context comes from the source text,
            # other-sender context from this submission's output.
            df_group["source_with_context"] = add_context(
                orig_txt=df_group["source"].to_list(),
                context_same=df_group["source"].to_list(),
                context_other=df_group[col].to_list(),
                sender_ids=df_group["sender"].to_list(),
                sep_token=sep_token,
            )
            # MT side: the mirror image of the source side.
            df_group["mt_with_context"] = add_context(
                orig_txt=df_group[col].to_list(),
                context_same=df_group[col].to_list(),
                context_other=df_group["source"].to_list(),
                sender_ids=df_group["sender"].to_list(),
                sep_token=sep_token,
            )
            doc_dfs.append(df_group)

        dfs_all = pd.concat(doc_dfs)
        scores = context_metric.get_score(
            dfs_all["source_with_context"].to_list(),
            dfs_all["mt_with_context"].to_list(),
        )
        # dfs_all is ordered by document group, which need not be the row order
        # of test_df. Attach the original row labels so each score lands on the
        # segment it was computed for instead of being assigned by position.
        test_df[f"{col}-context-comet-qe"] = pd.Series(scores, index=dfs_all.index)
        # The per-language-pair / per-column entries are not created anywhere
        # else (the get_scores call above is commented out), so create them on
        # demand instead of raising KeyError.
        score_dict_all_lps.setdefault(lp, {}).setdefault(col, {})["context-comet-qe"] = np.mean(scores)
    test_df.to_csv(f"all_submissions/{lp.replace('_', '-')}-scores.csv")

# Paper Results

In [4]:
# Submission columns reported in the tables below, in row order.
# The aggregation cells skip any submission whose column is absent from a given scores file.
submission_cols = ['submission_DeepText_Lab',
                   'submission_HW-TSC',
                   'submission_MULTITAN-GML',
                   'submission_ADAPT',
                   'submission_SheffieldGATE',
                   'submission_clteam',
                   'submission_DCUGenNLP',
                   'submission_unbabel+it',
                   'submission_baseline']

In [None]:
# Mean segment-level COMET, nested as {language pair: {source_language: {submission: mean}}}.
score_dict_all_lps_bydir = {}
for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
    test_df = pd.read_csv(f"all_submissions/{lp.replace('_', '-')}-scores.csv")
    score_dict_all_lps_bydir[lp] = {}
    # One sub-dict per distinct value of the source_language column.
    # NOTE(review): the loop variable `dir` shadows the builtin of the same name.
    for dir, dir_df in test_df.groupby("source_language"):
        score_dict_all_lps_bydir[lp][dir] = {}
        for col in submission_cols:
            # Submissions without a "-comet" column in this file are skipped.
            if f"{col}-comet" in dir_df.columns:
                score_dict_all_lps_bydir[lp][dir][col] = dir_df[f"{col}-comet"].mean()

In [None]:
# Print one table row per submission: for every language pair, the mean COMET
# (x100) for rows whose source_language is the pair's first language, then for
# rows whose source_language is the second one ('pt' is looked up as 'pt-br').
# A submission with no score for a pair gets two empty cells.
for col in submission_cols:
    scores_str = f"{col}\t"
    for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
        src_lang, tgt_lang = lp.split("_")
        if col in score_dict_all_lps_bydir[lp][src_lang]:
            scores_str+=f" & {score_dict_all_lps_bydir[lp][src_lang][col]*100:.2f} & {score_dict_all_lps_bydir[lp][tgt_lang.replace('pt', 'pt-br')][col]*100:.2f} "
        else:
            scores_str+=f" & & "
    # scores_str already starts with the submission name; printing `col` as
    # well duplicated it at the start of every row.
    print(scores_str)

In [16]:
# Mean context-aware COMET-QE, nested as {language pair: {source_language: {submission: mean}}}.
# NOTE(review): this rebinds score_dict_all_lps_bydir, replacing the values from the previous metric.
score_dict_all_lps_bydir = {}
for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
    test_df = pd.read_csv(f"all_submissions/{lp.replace('_', '-')}-scores.csv")
    score_dict_all_lps_bydir[lp] = {}
    # One sub-dict per distinct value of the source_language column.
    for dir, dir_df in test_df.groupby("source_language"):
        score_dict_all_lps_bydir[lp][dir] = {}
        for col in submission_cols:
            # Submissions without a "-context-comet-qe" column in this file are skipped.
            if f"{col}-context-comet-qe" in dir_df.columns:
                score_dict_all_lps_bydir[lp][dir][col] = dir_df[f"{col}-context-comet-qe"].mean()

In [None]:

# Print one table row per submission from the current score_dict_all_lps_bydir:
# two cells per language pair (scores x100), keyed by the pair's two languages.
for col in submission_cols:
    scores_str = f"{col}\t"
    for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
        src_lang, tgt_lang = lp.split("_")
        if col in score_dict_all_lps_bydir[lp][src_lang]:
            # 'pt' is looked up under the source_language value 'pt-br'.
            scores_str+=f" & {score_dict_all_lps_bydir[lp][src_lang][col]*100:.2f} & {score_dict_all_lps_bydir[lp][tgt_lang.replace('pt', 'pt-br')][col]*100:.2f} "
        else:
            # No score for this submission on this pair: leave both cells empty.
            scores_str+=f" & & "
    print(scores_str)

In [8]:
import pandas as pd
import sacrebleu

# Corpus-level chrF, nested as {language pair: {source_language: {submission: score}}}.
score_dict_all_lps_bydir = {}
for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
    test_df = pd.read_csv(f"all_submissions/{lp.replace('_', '-')}-scores.csv")
    # Missing cells become empty strings before being passed to sacrebleu.
    test_df.fillna('', inplace=True)
    score_dict_all_lps_bydir[lp] = {}
    # One sub-dict per distinct value of the source_language column.
    for dir, dir_df in test_df.groupby("source_language"):
        score_dict_all_lps_bydir[lp][dir] = {}
        for col in submission_cols:
            # Submissions whose hypothesis column is absent from this file are skipped.
            if f"{col}" in dir_df.columns:
                score_dict_all_lps_bydir[lp][dir][col] =  sacrebleu.corpus_chrf(dir_df[col].to_list(), [dir_df["reference"].to_list()]).score

In [None]:

# Print one table row per submission from the current score_dict_all_lps_bydir:
# two cells per language pair, keyed by the pair's two languages (no x100 scaling here).
for col in submission_cols:
    scores_str = f"{col}\t"
    for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
        src_lang, tgt_lang = lp.split("_")
        if col in score_dict_all_lps_bydir[lp][src_lang]:
            # 'pt' is looked up under the source_language value 'pt-br'.
            scores_str+=f" & {score_dict_all_lps_bydir[lp][src_lang][col]:.2f} & {score_dict_all_lps_bydir[lp][tgt_lang.replace('pt', 'pt-br')][col]:.2f} "
        else:
            # No score for this submission on this pair: leave both cells empty.
            scores_str+=f" & & "
    print(scores_str)

In [11]:
import pandas as pd
import sacrebleu

# Corpus-level BLEU, nested as {language pair: {source_language: {submission: score}}}.
score_dict_all_lps_bydir = {}
for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
    test_df = pd.read_csv(f"all_submissions/{lp.replace('_', '-')}-scores.csv")
    # Missing cells become empty strings before being passed to sacrebleu.
    test_df.fillna('', inplace=True)
    score_dict_all_lps_bydir[lp] = {}
    # One sub-dict per distinct value of the source_language column.
    for dir, dir_df in test_df.groupby("source_language"):
        score_dict_all_lps_bydir[lp][dir] = {}
        for col in submission_cols:
            # Submissions whose hypothesis column is absent from this file are skipped.
            if f"{col}" in dir_df.columns:
                score_dict_all_lps_bydir[lp][dir][col] =  sacrebleu.corpus_bleu(dir_df[col].to_list(), [dir_df["reference"].to_list()]).score

In [None]:

# Print one table row per submission from the current score_dict_all_lps_bydir:
# two cells per language pair, keyed by the pair's two languages (no x100 scaling here).
for col in submission_cols:
    scores_str = f"{col}\t"
    for lp in ['en_de', 'en_fr', 'en_nl', 'en_pt', 'en_ko']:
        src_lang, tgt_lang = lp.split("_")
        if col in score_dict_all_lps_bydir[lp][src_lang]:
            # 'pt' is looked up under the source_language value 'pt-br'.
            scores_str+=f" & {score_dict_all_lps_bydir[lp][src_lang][col]:.2f} & {score_dict_all_lps_bydir[lp][tgt_lang.replace('pt', 'pt-br')][col]:.2f} "
        else:
            # No score for this submission on this pair: leave both cells empty.
            scores_str+=f" & & "
    print(scores_str)

# HW-TSC vs Unbabel

In [5]:
import pandas as pd
score_dict_all_lps_bydir = {}
# Per-target-language thresholds on the number of rows per doc_id; the cells
# below use the selected threshold to split documents into short and long.
# Kept under its own name so the table is not shadowed by the selected value.
LANG_THRESHOLDS = {"de": 31, "fr": 33, "nl": 35, "ko": 48, "pt": 28}

test_df = pd.read_csv("all_submissions/en-de-scores.csv")
# The analysis below is run on en-de, so pick the German threshold from the
# table instead of repeating the literal.
lang_threshold = LANG_THRESHOLDS["de"]

In [None]:
# (number of distinct doc_id values, number of rows) in the loaded scores file.
len(test_df.doc_id.unique()), len(test_df)

In [None]:
import numpy as np
# Compare Unbabel-IT and HW-TSC COMET on SHORT documents only:
# documents with lang_threshold rows or more are skipped.
scores_ours = []
scores_hw_tsc = []
for gr, gr_df in test_df.groupby("doc_id"):
    if len(gr_df) >= lang_threshold: continue
    scores_ours.extend(gr_df['submission_unbabel+it-comet'])
    scores_hw_tsc.extend(gr_df['submission_HW-TSC-comet'])

# Mean score of each system and the number of segments included.
print(np.mean(scores_ours), np.mean(scores_hw_tsc), len(scores_ours))

In [None]:
import numpy as np
# Compare Unbabel-IT and HW-TSC COMET on LONG documents only:
# documents with fewer than lang_threshold rows are skipped.
scores_ours = []
scores_hw_tsc = []
for gr, gr_df in test_df.groupby("doc_id"):
    if len(gr_df) < lang_threshold: continue
    scores_ours.extend(gr_df['submission_unbabel+it-comet'])
    scores_hw_tsc.extend(gr_df['submission_HW-TSC-comet'])

# Mean score of each system and the number of segments included.
print(np.mean(scores_ours), np.mean(scores_hw_tsc), len(scores_ours))

# MuDA results

In [1]:

import pickle

# Reload the pickled submissions dictionary (keyed by language pair) for the MuDA section.
# NOTE(review): pickle.load can execute arbitrary code — only open this file if it comes from a trusted source.
with open("all_submissions/data_dict.pkl", "rb") as f:
    data_dict = pickle.load(f)

In [2]:
# Show which language pairs are available in the pickled dictionary.
data_dict.keys()

dict_keys(['en_nl', 'en_pt', 'en_de', 'en_ko', 'en_fr'])

In [3]:
import json
import pandas as pd
langs = ["de", "fr", "pt", "nl", "ko"]

def load_jsonl_file(file_path):
    """Parse a JSON-lines file and return the decoded objects, one per line, in file order."""
    with open(file_path, "r") as f:
        raw_lines = f.readlines()

    records = []
    for raw in raw_lines:
        records.append(json.loads(raw))
    return records

# Column-oriented container with one empty list per field.
# NOTE(review): no cell visible in this notebook reads or fills df_dict; the rows are
# collected in `data` instead — confirm whether this is still needed.
df_dict = {
    "model": [],
    "lp": [],
    "tag": [],
    "precision": [],
    "recall": [],
    "f1": [],
}

# Collect the MuDA statistics of every submission into a long-format table with
# one row per (model, target language, tag, metric).
data = []
for tgt_lang in langs:
    for col in data_dict[f"en_{tgt_lang}"]:
        stats_path = f"muda_accuracy_results/{tgt_lang}.{col}.stats.json"
        try:
            stats = load_jsonl_file(stats_path)
            for s in stats:
                for metric in ["precision", "recall", "f1"]:
                    data.append(
                        {
                            # Strip the "submission_" prefix and map team ids to display names.
                            "model": col.split("submission_")[1].replace('DeepText_Lab','DeepText Lab').replace('clteam','CLTeam').replace('unbabel+it','Unbabel-IT').replace('baseline','NLLB-3.3B'),
                            "lp": tgt_lang,
                            # Map MuDA tag ids to display labels.
                            "tag": s["tag"].replace('verb_form', 'Verb Form').replace('lexical_cohesion', 'Lexical Cohesion').replace('pronouns', 'Pronouns').replace('formality', 'Formality'),
                            "metric": metric,
                            "value": s[metric],
                        }
                    )
        except Exception as err:
            # Collection stays best-effort (e.g. a submission with no stats file
            # is skipped), but catch Exception instead of using a bare except so
            # KeyboardInterrupt still works, and report what was skipped.
            print(f"Skipping {stats_path}: {type(err).__name__}: {err}")
            continue
df = pd.DataFrame(data)

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("ticks")
# NOTE(review): this context is replaced by the set_context("paper", ...) call below.
sns.set_context("poster")

# Keep matplotlib's LaTeX text rendering switched off (usetex=False).
plt.rc("text", usetex=False)
sns.set_context("paper", font_scale=2.5)

In [5]:
from matplotlib.backends.backend_pdf import PdfPages  # Import PdfPages


In [10]:
# One bar chart per target language showing each model's MuDA F1 per tag; all
# panels are saved together in plots/all_barplots.pdf.

# `df` holds precision, recall AND f1 rows for every (model, lp, tag). The axis
# is labelled "F1 Score", so plot the f1 rows only; otherwise each bar would be
# the mean of the three metrics.
f1_df = df[df['metric'] == 'f1']
lps = f1_df['lp'].unique()

# The baseline gets a fixed grey colour and is drawn first; a filter (rather
# than list.remove) keeps this working when the baseline has no rows.
other_models = [model for model in f1_df['model'].unique().tolist() if model != 'NLLB-3.3B']

palette = sns.color_palette("colorblind", len(other_models))

model_palette = {model: color for model, color in zip(other_models, palette)}
model_palette['NLLB-3.3B'] = 'grey'  # Set 'nllb-3.3b' to grey
model_order = ['NLLB-3.3B'] + other_models

with sns.axes_style("darkgrid"):
    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # when there is a single language pair.
    fig, axs = plt.subplots(len(lps), 1, figsize=(12, 8 * len(lps)), sharex=True, squeeze=False)
    axs = axs[:, 0]
    for i, lp in enumerate(lps):
        # Filter the dataframe for the current lp
        df_lp = f1_df[f1_df['lp'] == lp]

        # Create a barplot for the current lp
        sns.barplot(data=df_lp,
                    x="tag",
                    y="value",
                    hue="model",
                    palette=model_palette,
                    hue_order=model_order,
                    ax=axs[i],
                    legend=(i==0))

        # Add vertical lines between x-axis categories; count the categories of
        # the panel being drawn rather than those of a hard-coded language.
        for j in range(df_lp['tag'].nunique() - 1):
            axs[i].axvline(x=j + 0.5, color='black', linestyle='--', linewidth=0.8)

        # Set the plot title and labels
        axs[i].set_title(f'EN-{lp.upper()}')
        axs[i].set_ylim(0, 1)  # Set y-axis limits between 0 and 1
        axs[i].set_ylabel('F1 Score')
        axs[i].set_xlabel('')
        if i == 0:
            handles, labels = axs[0].get_legend_handles_labels()  # Get handles and labels
            axs[i].legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.4), ncol=3)  # Place legend above the plot


# plt.show()
with PdfPages('plots/all_barplots.pdf') as pdf:
    pdf.savefig(fig, bbox_inches='tight')


# Context MQM

In [2]:
import pandas as pd
# Segment-level en-de scores file, as written by the scoring cell (<lp>-scores.csv).
ende_score = pd.read_csv("all_submissions/en-de-scores.csv")

In [3]:
# Number the rows of every document 0..n-1 in a new `segment_id` column so the
# table can be joined on (doc_id, segment_id) below.
doc_dfs = []
for _, df_group in ende_score.groupby(["doc_id"]):
    # NOTE(review): df_group is a groupby slice; assigning a column to it may emit
    # pandas' SettingWithCopyWarning — consider df_group.copy() first.
    df_group['segment_id'] = list(range(len(df_group)))
    doc_dfs.append(df_group)
# Rows are now ordered group by group, not necessarily in the file's original order.
dfs_all = pd.concat(doc_dfs)

In [4]:
# Load the human annotation file.
# NOTE(review): scores_df is not used by any cell visible in this notebook — confirm it is still needed.
scores_df = pd.read_csv(f"human_eval/docsqm2deu_ann.csv")
# Keep only the last row for each distinct `model_app` value.
scores_df.drop_duplicates(subset=['model_app'], keep='last', inplace=True) 
# Align the segment key name with the `segment_id` column used for merging.
scores_df.rename(columns={"sent_id": "segment_id"}, inplace=True)

In [19]:
# Print one table row per system from its gpt-4o-mini output file:
# system & % of rows with score 0 & summed minor / major / critical counts & mean score.
for system in ['submission_HW-TSC',  'submission_ADAPT', 'submission_SheffieldGATE', 'submission_clteam',  'submission_DCUGenNLP', 'submission_unbabel+it', 'submission_baseline']:
    gpt_out = pd.read_csv(f"gpt-4o-mini/gpt-4o-mini-en-de-{system}.csv")
    # NOTE(review): merged_df is computed but not used in the visible code; the
    # statistics below are taken from gpt_out alone.
    merged_df = gpt_out.merge(dfs_all, on=['doc_id', 'segment_id'])
    error_counts = (' & ').join(list(map(str, gpt_out[['gpt-4o-mini-minor', 'gpt-4o-mini-major',  'gpt-4o-mini-critical-count']].sum().values)))
    # NOTE(review): an empty gpt_out would make the percentage below divide by zero.
    print(f"{system} & {len(gpt_out[gpt_out['gpt-4o-mini-score']==0])/len(gpt_out)*100:.2f} & {error_counts} & {gpt_out['gpt-4o-mini-score'].mean():.3f}")

submission_HW-TSC & 89.12 & 100 & 88 & 59 & -0.554
submission_ADAPT & 82.61 & 158 & 139 & 99 & -0.903
submission_SheffieldGATE & 77.95 & 220 & 178 & 95 & -1.009
submission_clteam & 86.28 & 139 & 82 & 79 & -0.656
submission_DCUGenNLP & 83.10 & 143 & 158 & 80 & -0.849
submission_unbabel+it & 94.41 & 51 & 47 & 18 & -0.228
submission_baseline & 80.50 & 161 & 143 & 117 & -1.002
