Colab to reproduce results from the WMT23 metrics shared task

## Dependencies

In [9]:
# @title Imports

from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import tasks

# Evaluate a new metric

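This section shows how to evaluate new metrics online by adding their scores
to the loaded EvalSets; in this notebook the scores are read from
per-language-pair `.sys.score` and `.seg.score` files. Another possibility is
to generate scores offline, write score files to disk, and use
`EvalSet.AddMetricsFromDir()` to read them in.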

In [10]:
# @title Load EvalSets

# Language pairs evaluated in this notebook.
wmt23_lps = ['zh-en', 'en-de', 'he-en']

# One EvalSet per language pair, keyed by (test-set name, language pair).
# NOTE(review): the positional True presumably asks EvalSet to also load the
# stored metric scores -- confirm against data.EvalSet.
evs_dict = {}
for lp in wmt23_lps:
  evs_dict[('wmt23', lp)] = data.EvalSet('wmt23', lp, True)

In [11]:
import numpy as np
import os
# The score files loaded below are opened by relative name, so work from the
# directory that holds them. Set the M_MAD_DIR environment variable to point at
# your own copy; the fallback is the original author's local path.
os.chdir(os.environ.get("M_MAD_DIR", "/home/jiayuansu/M-MAD"))
def extract_sys_scores(file_path):
    """Read a system-level score file into a ``{system: [score]}`` dict.

    Each line is expected to hold ``<system name>\\t<score>``. Lines that do not
    split into exactly two tab-separated fields (after stripping surrounding
    whitespace) are skipped silently.

    Args:
        file_path: Path of the score file to read.

    Returns:
        A dict mapping each system name to a one-element list containing its
        score as a float. If a system appears on several lines, the last line
        wins.

    Raises:
        ValueError: If the second field of a two-field line is not a valid
            float.
    """
    per_system = {}

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                # Blank or malformed line: ignore it.
                continue
            name, value = fields
            # Wrapped in a list: one score per system at this level.
            per_system[name] = [float(value)]

    return per_system

def extract_seg_scores(file_path):
    """Read a segment-level score file into a ``{system: scores}`` dict.

    Each line is expected to hold ``<system name>\\t<score>``; a system's
    scores are collected in the order its lines appear in the file. Lines that
    do not split into exactly two tab-separated fields (after stripping
    surrounding whitespace) are skipped silently.

    Args:
        file_path: Path of the score file to read.

    Returns:
        A dict mapping each system name to a 1-D float NumPy array of its
        scores. Keys appear in order of first occurrence in the file.

    Raises:
        ValueError: If the second field of a two-field line is not a valid
            float.
    """
    collected = {}

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                # Blank or malformed line: ignore it.
                continue
            system_name, raw_score = parts
            # Accumulate in plain lists; np.append would copy the whole array
            # on every segment, making the read quadratic in file length.
            collected.setdefault(system_name, []).append(float(raw_score))

    # Convert once per system at the end.
    return {name: np.array(values) for name, values in collected.items()}


# (name the metric is registered under, prefix of its score files on disk).
# Score files are named "<prefix>_<lp>.sys.score" and "<prefix>_<lp>.seg.score".
METRIC_SCORE_FILES = [
    ('EAPrompt', 'EAPrompt'),
    ('GEMBA-DA', 'GEMBA-DA'),
    ('GEMBA-MQM', 'GEMBA-MQM'),
    ('Ours', 'M-MAD'),
]

for metric_name, file_prefix in METRIC_SCORE_FILES:
  # Attach the metric's system- and segment-level scores to every EvalSet.
  for lp in wmt23_lps:
    evs = evs_dict[('wmt23', lp)]
    sys_scores = extract_sys_scores(f"{file_prefix}_{lp}.sys.score")
    seg_scores = extract_seg_scores(f"{file_prefix}_{lp}.seg.score")

    # set() is passed as the metric's reference set.
    # NOTE(review): presumably this marks the metric as reference-free --
    # confirm against EvalSet.AddMetric.
    evs.AddMetric(metric_name, set(), 'sys', sys_scores, replace=True)
    evs.AddMetric(metric_name, set(), 'seg', seg_scores, replace=True)

  # Add the new metric to each EvalSet's primary metrics, keeping the
  # existing ones.
  for evs in evs_dict.values():
    evs.SetPrimaryMetrics(evs.primary_metrics | {metric_name})

print(evs_dict.keys())

dict_keys([('wmt23', 'zh-en'), ('wmt23', 'en-de'), ('wmt23', 'he-en')])


In [12]:
# @title Generate results with new metric

# Build the WMT23 task set for the selected language pairs. `wts` is passed to
# AverageCorrs() in the results cell (presumably per-task weights -- confirm).
# NOTE(review): k=0 presumably turns off resampling-based significance
# testing -- confirm against tasks.WMT23.
wmt23_tasks, wts = tasks.WMT23(wmt23_lps, k=0)

# Takes about 3 minutes.
# Runs the tasks on the EvalSets in evs_dict, which by now also hold the
# metrics added above.
new_results = wmt23_tasks.Run(eval_set_dict=evs_dict)


In [13]:
# @title Print results

# Combine each metric's per-task results into one value, using `wts` from
# tasks.WMT23() (presumably a weighted average correlation -- confirm).
avg_corrs = new_results.AverageCorrs(wts)

# Plain-text table: one row per entry of avg_corrs, with avg_corrs shown as the
# first column under the header 'avg-corr', followed by one column per task.
table = new_results.Table(
    metrics=list(avg_corrs),
    initial_column=avg_corrs,
    initial_column_header='avg-corr',
    # Task attributes shown as header rows above the task columns.
    attr_list=['lang', 'level', 'corr_fcn'],
    # Shorter label used in place of the full correlation-function name.
    nicknames={'KendallWithTiesOpt': 'acc-t'},
    fmt='text',
    baselines_metainfo=meta_info.WMT23)

print(table)
print(avg_corrs)


lang:                                  en-de,he-en,zh-en     en-de     en-de     en-de     he-en     he-en     he-en     zh-en     zh-en     zh-en
level:                                               sys       sys       seg       seg       sys       seg       seg       sys       seg       seg
corr_fcn:                                       accuracy   pearson   pearson     acc-t   pearson   pearson     acc-t   pearson   pearson     acc-t
metric                       avg-corr              task1     task2     task3     task4     task5     task6     task7     task8     task9    task10
---------------------------  --------  -----------------  --------  --------  --------  --------  --------  --------  --------  --------  --------
XCOMET-Ensemble               1 0.825            3 0.928   4 0.980   1 0.695   1 0.604   6 0.950   1 0.556   1 0.586  10 0.927   1 0.650   1 0.543
XCOMET-QE-Ensemble[noref]     2 0.808            6 0.908   9 0.974   2 0.679   4 0.588  15 0.909   4 0.498   4 0.554  