This is a demo colab for MTME. It assumes you have mt_metrics_eval installed on your runtime, and have downloaded the data onto that machine. Run the cells below in order.

# Preliminaries

In [1]:
# @title Imports

import numpy as np
import scipy.stats

from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import stats
from mt_metrics_eval import tasks

In [2]:
# @title Print all available evalsets

# One output line per test set: its name, then the language pairs listed under
# it in meta_info.DATA, separated by spaces.
for testset in meta_info.DATA:
  lang_pairs = meta_info.DATA[testset]
  print(f'{testset}:', ' '.join(lang_pairs))


In [3]:
# @title Load data for WMT21 language pairs scored with MQM

# Maps '<testset>/<lp>' to the loaded EvalSet.
all_evs = {}
for testset, lp_infos in meta_info.DATA.items():
  # Only WMT21 test sets are of interest here.
  if not testset.startswith('wmt21'):
    continue
  for lp, lp_info in lp_infos.items():
    # Skip language pairs for which none of the standard gold scores is MQM.
    if 'mqm' not in lp_info.std_gold.values():
      continue
    all_evs[f'{testset}/{lp}'] = data.EvalSet(testset, lp, True)

print('\n'.join(all_evs))

In [4]:
# @title Print summaries for all loaded evalsets

# Header row; the column widths line up with the per-evalset rows below.
print(f'{"name":<20}  segs sys metrics gold  refs std')
for name, evs in all_evs.items():
  # Size statistics for this evalset.
  nsegs, nsys, nmetrics = (
      len(evs.src), len(evs.sys_names), len(evs.metric_basenames))
  # Standard human-score name reported by the evalset for the 'sys' level.
  gold = evs.StdHumanScoreName('sys')
  nrefs, std_ref = len(evs.ref_names), evs.std_ref

  row = (f'{name:<20} {nsegs:5d} {nsys:3d} {nmetrics:7d} '
         f'{gold:5} {nrefs:4d} {std_ref}')
  print(row)

In [24]:
testset_name, lp = "wmt22", "en-ru"
meta_info.DATA[testset_name][lp].std_gold.values(), meta_info.DATA[testset_name][lp].std_gold

In [25]:
meta_info.DATA[testset_name][lp]

In [26]:
# Load the evalset selected above. The name and language pair are passed
# straight to EvalSet rather than first being copied into `testset`/`lp`, so
# `testset` only ever refers to the loaded EvalSet (never to its name string)
# and the no-op `lp = lp` assignment is gone.
testset = data.EvalSet(testset_name, lp, True)
testset, type(testset)

In [27]:
testset.ref_names

In [28]:
# @title Print a summary of the selected evalset

# These names stay at notebook level because later cells read them
# (e.g. `nsegs` is used in length checks).
nsegs = len(testset.src)
nsys = len(testset.sys_names)
nmetrics = len(testset.metric_basenames)
gold_seg = testset.StdHumanScoreName('seg')
nrefs = len(testset.ref_names)
std_ref = testset.std_ref

# One "<label> = <value>" line per statistic.
summary_rows = (
    ("# segs", nsegs),
    ("# systems", nsys),
    ("# metrics", nmetrics),
    ("Annotation type", gold_seg),
    ("# refs", nrefs),
    ("std ref", std_ref),
)
for label, value in summary_rows:
    print(f"{label} = {value}")

# Comparing metrics

In [29]:
# @title Set up for comparing metrics

# There are many different ways to evaluate the performance of MT metrics. The
# most obvious question is what correlation statistic we should use to capture
# the similarity between a vector of metric scores and a vector of gold scores
# (human ratings). A less obvious question is where those vectors come from.
# We'll defer the choice of correlation statistic to later cells, and begin
# by setting some parameters that precisely define the vectors we're interested
# in comparing.

# Compare metrics on the single evalset loaded above.
evs_list = [testset]

# Choose the version of each metric that uses the standard reference for each
# evalset.
main_refs = [{evs.std_ref} for evs in evs_list]

# Some alternative references are known to be close to the standard reference
# and should not be scored as 'human' systems (the only currently known
# instance is refB in wmt21.news/en-de, which is similar to the standard refC).
# No close references are declared for this single-evalset configuration, but
# the list is still defined, with one entry per evalset, so that cells taking
# `close_refs` alongside `evs_list` never see a missing or stale value.
close_refs = [set() for _ in evs_list]

# Whether to include 'human' systems (ie, reference translations) among the
# systems to be scored. Including them can make the task more challenging,
# since some metrics are biased against less literal references. Disabled here.
include_human = False

# Don't include systems considered to be outliers. These are systems that are
# much better or worse than all other systems, so they are easy for all metrics
# to rank correctly.
include_outliers = False

# Use MQM ratings as gold scores rather than the scores provided by the main
# WMT task. Metrics tasks have used MQM for main results since 2021.
gold_name = 'mqm'  # alternatives tried: 'wmt-raw', 'wmt'

# Only compare metrics that have been designated as primary submissions. This
# removes metric variants that are similar to each other, and reduces the size
# of the comparison matrix.
primary_metrics = True

# Don't limit the results to a particular domain. In WMT21, domains are treated
# as separate test-sets, so this is a no-op (WMT22 is a different story).
domain = None

# Number of resampling runs for determining whether one metric is better than
# another according to the permutation test. 0 disables significance testing,
# which keeps the comparisons fast; at least 1000 is required for stable
# results.
k = 0  # 1000

# Set the size of blocks for 'early stopping' checks during resampling. If
# you're using k = 1000, this can speed up the computation, usually with
# only minimal changes to the results.
psd = stats.PermutationSigDiffParams(block_size=100)

# Set the p-value for deciding whether metrics are considered to be
# significantly different. Lower values make the test more stringent.
pval = 0.05

In [30]:
testset.domain_names

In [10]:
# @title Set up for comparing metrics

# Metric performance can be measured in many ways. Besides the choice of
# correlation statistic (left to later cells), we must pin down exactly which
# vectors of metric scores and gold scores (human ratings) get compared. The
# parameters below do that.

# Compare on every evalset loaded so far.
evs_list = all_evs.values()

# For each evalset, use the metric versions computed against its standard
# reference.
main_refs = [{evs.std_ref} for evs in evs_list]

# References known to be close to the standard one must not be scored as
# 'human' systems. The only known case is refB in wmt21.news/en-de, which is
# similar to the standard refC.
close_refs = [
    {'refB'} if name == 'wmt21.news/en-de' else set() for name in all_evs
]

# Score 'human' systems (ie, reference translations) too. This can make the
# task harder, since some metrics are biased against less literal references.
include_human = True

# Leave out outlier systems: they are much better or worse than all the others,
# so every metric ranks them correctly with ease.
include_outliers = False

# Gold scores are MQM ratings rather than the scores from the main WMT task.
# Metrics tasks have used MQM for main results since 2021.
gold_name = 'mqm'

# Restrict the comparison to primary submissions, which drops near-duplicate
# metric variants and shrinks the comparison matrix.
primary_metrics = True

# No domain restriction. In WMT21 domains are separate test-sets, so this is a
# no-op (WMT22 is a different story).
domain = None

# Resampling runs for the permutation test that decides whether one metric
# beats another. 5 keeps the demo quick; stable results need at least 1000.
k = 5

# Block size for 'early stopping' checks during resampling. With k = 1000 this
# can speed things up, usually with only minimal changes to the results.
psd = stats.PermutationSigDiffParams(block_size=100)

# p-value threshold for calling two metrics significantly different. Lower
# values make the test more stringent.
pval = 0.05

In [8]:
# @title Evaluate metrics using global accuracy

# Global accuracy, introduced by Kocmi et al (https://arxiv.org/abs/2107.10821)
# is a robust way to evaluate the performance of a metric across many different
# settings. The idea is to count the number of pairwise system rankings where
# the metric agrees with the gold ranking, and micro average this across all
# settings.

# The output shows the rank of each metric's significance cluster, followed
# by its accuracy, and whether it is statistically tied with (=) or better than
# (>) each lower-ranking metric.


# All arguments come from the set-up cell. Only the first two returned values
# (the ranks and the significance matrix) are used; the [:2] slice drops
# anything returned after them.
ranks, matrix = data.CompareMetricsWithGlobalAccuracy(
    evs_list, main_refs, close_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain, k, psd, pval)[:2]

data.PrintMetricComparison(ranks, matrix, pval)

In [10]:
# @title Evaluate metrics using system-level Pearson correlation

# Pearson correlation measures the degree of linear correspondence between
# metric and gold scores. Computing a single correlation across different
# evalsets isn't a great idea, so the interface forces you to choose a single
# set. We'll pick 'wmt21.news/en-de'. The part of the computation that extracts
# relevant score vectors is factored into a separate step to allow you to
# compute other correlations with these vectors.

# Notice that the ranking is quite different from the accuracy ranking, partly
# because we're using only a subset of the data, and partly because Pearson and
# accuracy measure different things. The ranking also includes two metrics that
# were automatically filtered out of the accuracy ranking because they weren't
# available for all evalsets.

evs = all_evs['wmt21.news/en-de']
corrs = data.GetCorrelations(
    evs, 'sys', {evs.std_ref}, {'refB'}, include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Other cells in this notebook unpack four values from CompareMetrics. Slicing
# to the first two keeps this cell working whether two or four are returned.
# NOTE(review): those other cells also pass an extra positional boolean before
# `k`; confirm the argument order against the installed mt_metrics_eval.
ranks, matrix = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, 'none', k, psd, pval)[:2]

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [12]:
include_human, include_outliers, gold_name, primary_metrics, domain, testset.human_sys_names, testset.std_ref

In [13]:
testset.sys_names, testset.sys_names - {testset.std_ref} - testset.outlier_sys_names

In [14]:
primary_metrics = True
gold_name, {testset.std_ref}, include_outliers, primary_metrics

In [13]:
# Segment-level correlations on `testset`, using metric versions tied to its
# standard reference. The empty set fills the argument slot where other cells
# pass {'refB'}.
corrs = data.GetCorrelations(
    testset, "seg", {testset.std_ref}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Rank metrics with stats.KendallWithTiesOpt, averaging by "item". k, psd and
# pval come from the set-up cell. The last two returned values (draws_index,
# draws_list) are only inspected in later cells.
# NOTE(review): the meaning of the positional False is not visible in this
# notebook; confirm against the installed CompareMetrics signature.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, stats.KendallWithTiesOpt, "item", False, k, psd, pval, sample_rate=1.0)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [59]:
# WMT21.news, MQM, zh-en
# NOTE(review): the label above records the notebook state when this cell was
# run; nothing here enforces it. The evalset is whatever `testset` currently
# holds and the gold scores are whatever `gold_name` is set to.

# Segment-level correlations against the evalset's standard reference.
corrs = data.GetCorrelations(
    testset, "seg", {testset.std_ref}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Kendall tau with average_by "none"; 0 is passed in the slot where other
# cells pass `k`, instead of using the set-up value.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [31]:
# WMT22, MQM, zh-en, seg
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# Segment-level correlations; the reference is hard-coded to "refA" rather than
# read from testset.std_ref.
corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Kendall tau with average_by "none"; 0 is passed in the slot where other
# cells pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [149]:
# WMT22, MQM, zh-en, sys
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# System-level correlations; the reference is hard-coded to "refA".
corrs = data.GetCorrelations(
    testset, "sys", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Pearson with average_by "none"; 0 is passed in the slot where other cells
# pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, "none", True, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [10]:
testset.sys_names, testset

In [33]:
# WMT22, DA, zh-en
# NOTE(review): the code below is identical to the cell labelled
# "WMT22, MQM, zh-en, seg". Whether DA or MQM gold scores are used depends
# solely on the current value of `gold_name`, so set it before running this.

# Segment-level correlations; the reference is hard-coded to "refA".
corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Kendall tau with average_by "none"; 0 is passed in the slot where other
# cells pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [18]:
draws_index, len(draws_index)

In [15]:
draws_list

In [11]:
testset.human_score_names, testset.human_sys_names, testset.std_ref

In [15]:
sys2seg_scores = testset.Scores("seg", "mqm")
type(sys2seg_scores), len(sys2seg_scores)

In [12]:
nsegs

In [63]:
# Flatten the per-system segment scores into one list and record which systems
# contributed.
# NOTE(review): despite the `wmt_z_` prefix, the scores are whatever the cell
# that last assigned `sys2seg_scores` fetched.
wmt_z_human_scores = []
scored_systems = []
for sys, scores in sys2seg_scores.items():
    wmt_z_human_scores += scores
    # Every system must provide exactly one score slot per segment.
    assert len(scores) == nsegs
    scored_systems += [sys]
scored_systems = set(scored_systems)
(wmt_z_human_scores, len(wmt_z_human_scores),
 scored_systems, len(scored_systems))

In [23]:
# @title Load data for a specific WMT year language pairs scored with z-normalized DA

# NOTE(review): the loop variables `testset` and `lp` are notebook globals.
# After this cell, `testset` holds a test-set *name* (a string) rather than an
# EvalSet, until a later cell rebinds it.

wmt_year = "wmt22"
all_evs = {}  # name/lp -> evs
for testset in meta_info.DATA:
    # Only test sets belonging to the chosen year.
    if not testset.startswith(wmt_year):
        continue
    for lp in meta_info.DATA[testset]:
        # Every language pair is loaded before its score names are checked.
        new_eval_set = data.EvalSet(testset, lp, True)
        if "wmt-z" not in new_eval_set.human_score_names:
            # Report and skip language pairs without "wmt-z" human scores.
            print(testset, lp)
            continue
        all_evs[f'{testset}/{lp}'] = new_eval_set

print('\n'.join(all_evs.keys()))

In [5]:
from pprint import pp

# Count, over all loaded evalsets, how many system outputs carry a "wmt-z"
# segment score, and record which systems were scored in each evalset.
# NOTE(review): `testset_name`, `testset` and `sys2seg_scores` are notebook
# globals and are left bound to the values from the last iteration.
n_annotated_sys_outputs = 0
testset_name2scored_systems = dict()
for testset_name, testset in all_evs.items():
    sys2seg_scores = testset.Scores("seg", "wmt-z")
    # `wmt_z_human_scores` is reset here but no longer filled in.
    wmt_z_human_scores, scored_systems = [], []
    sum_sum = 0  # annotated outputs in this evalset
    for sys, scores in sys2seg_scores.items():
        # The standard reference is not counted as a system. (A broader filter
        # that also dropped other "ref*" systems for zh-en / en-ru was tried
        # and disabled.)
        if sys == testset.std_ref:
            continue
        scored_systems.append(sys)
        # Only segments that actually have a score (not None) are counted.
        new_sum = sum(1 for score in scores if score is not None)
        n_annotated_sys_outputs += new_sum
        sum_sum += new_sum
    print(testset_name, sum_sum)
    pp(scored_systems)
    print(testset.std_ref)
    print("\n")
    # Each evalset name must be recorded once.
    assert testset_name not in testset_name2scored_systems
    testset_name2scored_systems[testset_name] = scored_systems
n_annotated_sys_outputs

In [94]:
testset_name2scored_systems["wmt21.news/en-ru"]

In [None]:
# (Re)load the evalset named by `testset_name` / `lp`. The values are passed
# straight to EvalSet rather than first being copied into `testset`/`lp`, so
# `testset` only ever refers to the loaded EvalSet (never to its name string)
# and the no-op `lp = lp` assignment is gone.
testset = data.EvalSet(testset_name, lp, True)
testset, type(testset)

In [None]:
# @title Evaluate metrics using segment-level Kendall correlation

# Kendall correlation is similar to pairwise accuracy, except that it is
# normalized differently. The function calls are identical to the previous one,
# except that we set the 'level' parameter to 'seg', and specify Kendall rather
# than Pearson. The value of the 'average_by' parameter also matters here, as it
# specifies how system x segment score matrices get converted into vectors for
# comparison. We will use 'none', which just flattens the matrices.

# The resulting ranking is similar to the ranking from accuracy. One noticeable
# difference is that the significance clusters are smaller because they are
# based on more data (much larger vectors). Notice that BLEU is absent because
# it isn't available at the segment level.

evs = all_evs['wmt21.news/en-de']
corrs = data.GetCorrelations(
    evs, 'seg', {evs.std_ref}, {'refB'}, include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Other cells in this notebook unpack four values from CompareMetrics. Slicing
# to the first two keeps this cell working whether two or four are returned.
# NOTE(review): those other cells also pass an extra positional boolean before
# `k`; confirm the argument order against the installed mt_metrics_eval.
ranks, matrix = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, 'none', k, psd, pval)[:2]

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate metrics using seg-level accuracy with optimized tie threshold.

# This is an implementation of the acc*_eq pairwise ranking accuracy proposed in
# https://arxiv.org/abs/2305.14324. This is similar to global accuracy, but it
# additionally gives metrics credit for predicting ties in gold scores, which
# arise frequently in MQM segment-level data. To avoid bias due to differences
# in scoring precision for different metrics, an optimal threshold for assigning
# ties is automatically computed for each metric and test set.

# For demo purposes we disable significance testing by setting k to 0.
# (Significance testing works but is currently very slow.) Note that the
# optimization procedure uses sampling, so results can change across different
# runs.

evs = all_evs['wmt21.news/en-de']
corrs = data.GetCorrelations(
    evs, 'seg', {evs.std_ref}, {'refB'}, include_human, include_outliers,
    gold_name, primary_metrics, domain)
# Other cells in this notebook unpack four values from CompareMetrics. Slicing
# to the first two keeps this cell working whether two or four are returned.
# NOTE(review): those other cells also pass an extra positional boolean before
# `k`; confirm the argument order against the installed mt_metrics_eval.
ranks, matrix = data.CompareMetrics(
    corrs, stats.KendallWithTiesOpt, 'item', 0, psd, pval, variant='acc23',
    sample_rate=0.1)[:2]

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [35]:
from comet import download_model, load_from_checkpoint

# Fetch the "Unbabel/wmt22-comet-da" checkpoint and load it. `model` is used
# by the COMET scoring cell further down.
# NOTE(review): a later cell imports another `load_from_checkpoint` from
# fake_metric; re-running this cell after that one would pick up the wrong
# loader.
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)
model

In [10]:
from matese.metric import MaTESe

# Empty placeholders; nothing in this cell reads them.
candidates = []
references = []

# Load the 'matese' variant on the GPU with batch size 1. Pass 'matese-en' or
# 'matese-qe' for the other versions.
metric = MaTESe.load_metric('matese', device="cuda", batch_size=1)
metric

In [None]:
from matese.metric import MaTESe

# A one-sentence smoke test: a Chinese source, a deliberately wrong candidate
# translation and its reference.
sources = ["这是一个中文的句子"]
candidates = ["This is a wrong translation in Chinese"]
references = ["This is a sentence in Chinese"]

# Load with default device/batch settings (this rebinds `metric`). Pass
# 'matese-qe' for the reference-less metric.
metric = MaTESe.load_metric('matese')
# evaluate() is given candidates, sources and references, in that order. Only
# the first (and only) assessment is kept.
assessment = metric.evaluate(candidates, sources, references)[0]

print(assessment)

In [36]:
def create_input_for_comet(src_sents, cand_sents, ref_sents):
    """Build the per-segment records passed to the COMET model's ``predict``.

    Args:
        src_sents: Source sentences, one per segment.
        cand_sents: Candidate translations, aligned with ``src_sents``.
        ref_sents: Reference translations, aligned with ``src_sents``.

    Returns:
        A list with one ``{"src": ..., "mt": ..., "ref": ...}`` dict per
        segment, in input order.

    Raises:
        AssertionError: If the three inputs differ in length, or if any source
            or reference sentence is ``None`` or empty. Candidates are not
            checked, so an empty candidate is accepted.
    """
    assert len(src_sents) == len(cand_sents) == len(ref_sents)
    for input_sents in (src_sents, ref_sents):
        assert all(sent is not None and len(sent) > 0 for sent in input_sents)
    # Built with a comprehension; no local named `data`, which used to shadow
    # the imported mt_metrics_eval `data` module inside this function.
    return [
        {"src": src, "mt": cand, "ref": ref}
        for src, cand, ref in zip(src_sents, cand_sents, ref_sents)
    ]

In [11]:
testset.std_ref

In [12]:
ref_a_sents = testset.all_refs[testset.std_ref] # refA
len(ref_a_sents), len(testset.src), nsegs

In [13]:
systems_to_discard = testset.human_sys_names.union(testset.outlier_sys_names)
systems_to_discard

In [14]:
systems_to_discard = {testset.std_ref}.union(testset.outlier_sys_names)
systems_to_discard

In [15]:
# Keep the outputs of every system that is not in `systems_to_discard`,
# checking on the way that each one is complete.
sys2outputs = {}
for sys, candidates in testset.sys_outputs.items():
    if sys not in systems_to_discard:
        # One candidate per segment, no duplicate system, no missing output.
        assert len(candidates) == nsegs
        assert sys not in sys2outputs
        assert all(candidate is not None for candidate in candidates)
        sys2outputs[sys] = candidates
len(sys2outputs)

In [41]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [16]:
import torch

torch.cuda.get_device_name(0)

In [46]:
# @title Evaluate a new metric

# Extra metrics are handed to GetCorrelations() through its 'extern_metrics'
# argument. This cell produces the scores for one: each retained system's
# outputs are scored with the loaded COMET model.

sys2seg_scores = {}
for sys_name, cand_sents in sys2outputs.items():
    assert sys_name not in sys2seg_scores
    comet_input_data = create_input_for_comet(testset.src, cand_sents, ref_a_sents)
    prediction = model.predict(comet_input_data, batch_size=8, gpus=1)
    # Only the first element of the prediction is kept (presumably the
    # per-segment scores; confirm against the COMET API).
    sys2seg_scores[sys_name] = prediction[0]

In [47]:
# Import under an alias: the COMET cell earlier in this notebook binds its own
# `load_from_checkpoint`, and a plain import here would silently replace it, so
# re-running the COMET cell afterwards would call the wrong loader.
from fake_metric import load_from_checkpoint as load_fake_from_checkpoint

# NOTE(review): absolute, machine-specific checkpoint path; adjust it for the
# machine this notebook runs on.
model_path = "/home/prosho/mt-metrics-analysis/checkpoints/fake_regression_metric_bs=16_nepochs=10/epoch=3-step=216949-val_mse_loss=0.41041.ckpt"
fake_model = load_fake_from_checkpoint(model_path)
fake_model

In [48]:
# @title Evaluate a new metric

# New metrics can be included in the comparison of existing metrics using the
# 'extern_metrics' argument to GetCorrelations().

def create_input_for_fake_metric(src_sents, cand_sents, ref_sents):
    """Build the per-segment records passed to the fake metric's ``predict``.

    The records contain only the candidate translation. Sources and references
    are accepted so the signature matches ``create_input_for_comet``, and they
    are validated, but they are not included in the output.

    Args:
        src_sents: Source sentences, one per segment.
        cand_sents: Candidate translations, aligned with ``src_sents``.
        ref_sents: Reference translations, aligned with ``src_sents``.

    Returns:
        A list with one ``{"mt": candidate}`` dict per segment, in input order.

    Raises:
        AssertionError: If the three inputs differ in length, or if any source
            or reference sentence is ``None`` or empty. Candidates are not
            checked.
    """
    assert len(src_sents) == len(cand_sents) == len(ref_sents)
    for input_sents in (src_sents, ref_sents):
        assert all(sent is not None and len(sent) > 0 for sent in input_sents)
    # The lengths were checked above, so iterating the candidates alone yields
    # the same records as zipping all three inputs and ignoring two of them.
    return [{"mt": cand} for cand in cand_sents]

# Score every retained system's outputs with the fake metric.
fake_sys2seg_scores = {}
for sys_name, cand_sents in sys2outputs.items():
    assert sys_name not in fake_sys2seg_scores
    input_data = create_input_for_fake_metric(testset.src, cand_sents, ref_a_sents)
    fake_prediction = fake_model.predict(input_data, batch_size=8, gpus=1)
    # Only the first element of the prediction is kept (presumably the
    # per-segment scores; confirm against the model's API).
    fake_sys2seg_scores[sys_name] = fake_prediction[0]

In [16]:
# @title Evaluate a new metric

# Extra metrics are handed to GetCorrelations() through its 'extern_metrics'
# argument. This cell produces the scores for one: each retained system's
# outputs are scored with the loaded MaTESe metric against the standard
# reference. Note that it overwrites any COMET scores in `sys2seg_scores`.

sys2seg_scores = {}
for sys_name, cand_sents in sys2outputs.items():
    assert sys_name not in sys2seg_scores
    matese_input_candidates = cand_sents
    matese_input_refs = testset.all_refs[testset.std_ref]
    assert len(matese_input_candidates) == len(matese_input_refs) == nsegs
    # No sources are passed here, only candidates and references.
    assessments = metric.evaluate(
        matese_input_candidates, references=matese_input_refs)
    sys2seg_scores[sys_name] = [item["score"] for item in assessments]
len(sys2seg_scores)

In [49]:
primary_metrics, gold_name

In [111]:
old_sys2seg_scores = testset.Scores("seg", "COMET-DA_2021-refA")
type(old_sys2seg_scores), len(old_sys2seg_scores), len(sys2seg_scores), list(old_sys2seg_scores)

In [112]:
old_sys2seg_scores.pop("refB"), len(old_sys2seg_scores)

In [115]:
set(old_sys2seg_scores) == set(sys2seg_scores)

In [None]:
# Run the comparison with the new metric included via the 'extern_metrics'
# argument.

# WMT23, MQM, he-en
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# NOTE(review): the key ends in "-refB" while the comparison below selects
# metrics for testset.std_ref. If the suffix is how metrics are matched to
# references, this entry is only picked up when std_ref is refB. Confirm.
extras = {"MATESETEST-refB": sys2seg_scores}

corrs = data.GetCorrelations(
    testset, "seg", {testset.std_ref}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Rank with stats.KendallWithTiesOpt, averaging by "item", using the set-up
# values of k, psd and pval.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, stats.KendallWithTiesOpt, "item", False, k, psd, pval, sample_rate=1.0)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [22]:
# Run the comparison with the new metric included via the 'extern_metrics'
# argument.

# WMT21.news, DA, zh-en
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# `sys2seg_scores` is whatever the most recently run scoring cell produced.
extras = {"COMET-DA_2022-refA": sys2seg_scores}
corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Rank with stats.KendallLike, averaging by "item"; 0 is passed in the slot
# where other cells pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, stats.KendallLike, "item", False, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [52]:
k

In [53]:
# WMT22, DA, zh-en, seg
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# Two external metrics at segment level: the COMET scores and the fake metric.
extras = {"COMET-DA_2022-refA": sys2seg_scores, "FAKE-METRIC-refA": fake_sys2seg_scores}
corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Kendall tau with average_by "none", using the set-up value of k.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, k, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [157]:
# WMT22, MQM, zh-en, seg
# NOTE(review): the label is informational only; gold scores follow
# `gold_name`. `extras` is not built here and must come from an earlier cell.

corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Kendall tau with average_by "none"; 1000 is passed in the slot where other
# cells pass `k`, instead of using the set-up value.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, 1000, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [60]:
sys2score = {sys: [sum(scores) / len(scores)] for (sys, scores) in sys2seg_scores.items()}
sys2score

In [59]:
fake_sys2score = {sys: [sum(scores) / len(scores)] for (sys, scores) in fake_sys2seg_scores.items()}
fake_sys2score

In [62]:
# WMT22, DA, zh-en, sys
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# System-level external metrics: per-system averages of the segment scores,
# each wrapped in a one-element list by the cells that build them.
sys_extras = {"COMET-DA_2022-refA": sys2score, "FAKE-METRIC-refA": fake_sys2score}
corrs = data.GetCorrelations(
    testset, 'sys', {"refA"}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=sys_extras)
# Pearson with average_by "none"; 1000 is passed in the slot where other cells
# pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, "none", True, 1000, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [162]:
# WMT22, MQM, zh-en, sys
# NOTE(review): the label is informational only; gold scores follow
# `gold_name`. `sys_extras` is not built here and must come from an earlier
# cell.

# A literal True is passed where the other cells pass `primary_metrics`.
corrs = data.GetCorrelations(
    testset, "sys", {"refA"}, set(), include_human, include_outliers,
    gold_name, True, domain, extern_metrics=sys_extras)
# Pearson with average_by "none"; 1000 is passed in the slot where other cells
# pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, "none", True, 1000, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [148]:
# WMT22, MQM, zh-en
# NOTE(review): the evalset is whatever `testset` currently holds. `extras` is
# not built here and must come from an earlier cell.

# Gold scores are fixed to "mqm" here, regardless of `gold_name`.
corrs = data.GetCorrelations(
    testset, "seg", {"refA"}, set(), include_human, include_outliers,
    "mqm", primary_metrics, domain, extern_metrics=extras)
# Kendall tau with average_by 'none'; 1000 is passed in the slot where other
# cells pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, 'none', True, 1000, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [70]:
# WMT21.news, MQM, zh-en
# NOTE(review): the label is informational only; the evalset and gold scores
# come from the current values of `testset` and `gold_name`.

# The external metric is named with a "-refB" suffix, while the comparison
# selects metrics for testset.std_ref.
extras = {"COMET-DA_2022-refB": sys2seg_scores}
corrs = data.GetCorrelations(
    testset, "seg", {testset.std_ref}, set(), include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Kendall tau with average_by "none"; 0 is passed in the slot where other
# cells pass `k`.
corrs_and_ranks, sig_matrix, draws_index, draws_list = data.CompareMetrics(
    corrs, scipy.stats.kendalltau, "none", True, 0, psd, pval)

data.PrintMetricComparison(corrs_and_ranks, sig_matrix, pval, testset)

In [63]:
testset.metric_names

In [192]:
comet_20_sys2seg_scores = testset.Scores("seg", f"COMET-QE-src")
type(comet_20_sys2seg_scores), len(comet_20_sys2seg_scores), len(sys2seg_scores), len(fake_sys2seg_scores)

In [193]:
comet_20_sys2score = testset.Scores("sys", f"COMET-QE-src")
type(comet_20_sys2score), len(comet_20_sys2score), len(sys2score), len(fake_sys2score)

In [13]:
other_seg = testset.Scores("seg", f"COMET-20-refA")
type(other_seg), len(other_seg)

In [14]:
new_seg = testset.Scores("seg", f"COMETKiwi-src")
type(new_seg), len(new_seg)

In [223]:
new_sys = testset.Scores("sys", f"COMETKiwi-src")
type(new_sys), len(new_sys), len(sys2seg_scores), len(fake_sys2seg_scores)

In [15]:
other_sys = testset.Scores("sys", f"COMET-20-refA")
type(other_sys), len(other_sys)

In [97]:
comet_da_21_sys2seg_scores = testset.Scores("sys", f"COMET-DA_2021-{testset.std_ref}")
type(comet_da_21_sys2seg_scores), len(comet_da_21_sys2seg_scores), len(sys2seg_scores)

In [170]:
list(comet_20_sys2seg_scores), list(sys2seg_scores), set(comet_20_sys2seg_scores).difference(set(sys2seg_scores)), set(sys2seg_scores).difference(set(comet_20_sys2seg_scores))

In [18]:
include_human = False

In [19]:
# Systems to be scored: everything in the evalset except the standard
# reference and, depending on the flags, the human systems and the outliers.
excluded = {testset.std_ref}
if not include_human:
    excluded = excluded | testset.human_sys_names
if not include_outliers:
    excluded = excluded | testset.outlier_sys_names
sys_names = testset.sys_names - excluded
sys_names

In [126]:
gold_sys2score = testset.Scores("sys", "wmt")
type(gold_sys2score), len(gold_sys2score), set(gold_sys2score)
gold_sys2score

In [225]:
sys_names = sys_names.intersection(comet_20_sys2score)
sys_names

In [127]:
sys_names = sys_names.intersection(gold_sys2score)
sys_names

In [173]:
set(sys2seg_scores), set(fake_sys2seg_scores), set(sys2seg_scores) == set(fake_sys2seg_scores)

In [68]:
sys_names = set(sys2seg_scores)

In [247]:
correlation = testset.Correlation(fake_sys2seg_scores, new_seg, sys_names)
correlation

In [219]:
correlation = testset.Correlation(comet_20_sys2score, sys2score, sys_names)
correlation

In [235]:
correlation = testset.Correlation(fake_sys2score, new_sys, sys_names)
correlation

In [205]:
correlation = testset.Correlation(gold_sys2score, comet_20_sys2score, sys_names)
correlation

In [91]:
# `gold_sys2seg_scores` is not assigned anywhere else in this notebook, so the
# cell used to fail with a NameError on a fresh run. Fetch the segment-level
# gold scores here, using the gold score name chosen in the set-up cell.
# NOTE(review): assumes `gold_name` is the intended gold standard for this
# comparison; confirm.
gold_sys2seg_scores = testset.Scores("seg", gold_name)
correlation = testset.Correlation(gold_sys2seg_scores, sys2seg_scores, sys_names)
correlation

In [20]:
correlation = testset.Correlation(other_seg, new_seg, sys_names)
correlation

In [21]:
# Wrap a correlation function so it can be applied to the score vectors held
# by `correlation` (built in one of the cells above).
corr_wrapper = stats.AverageCorrelation(
    scipy.stats.kendalltau,  # alternative: scipy.stats.pearsonr
    correlation.num_sys,
    average_by="none",
    # The count of None entries is passed as the filter argument.
    filter_nones=correlation.none_count,
    replace_nans_with_zeros=False,
)
corr_wrapper

In [28]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [25]:
len(correlation.gold_scores), len(correlation.metric_scores)

In [32]:
s = scipy.stats.kendalltau(correlation.gold_scores, correlation.metric_scores)
s

In [191]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [183]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [179]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [176]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [138]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [117]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [92]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [106]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [109]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [75]:
correlation_value = corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]
correlation_value

In [85]:
corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]

In [93]:
corr_wrapper(correlation.gold_scores, correlation.metric_scores)[0]

In [None]:
# @title Evaluate a new metric

# New metrics can be included in the comparison of existing metrics using the
# 'extern_metrics' argument to GetCorrelations(). To demonstrate this, we'll
# create and evaluate a new metric consisting of the average of the top 3
# metrics in the system-level Pearson ranking.

# The result is a slight, non-significant, improvement over C-SPECpn, the metric
# with highest Pearson correlation. (The '*' before the new metric indicates
# that it isn't recognized as a primary submission.)

evs = all_evs['wmt21.news/en-de']

# Create the new metric: for every system except the refC reference, average
# the system-level scores of the three chosen metrics.
top3_metrics = ['C-SPECpn-refC', 'COMET-QE-MQM_2021-src', 'bleurt-20-refC']
sys_scores = {}
for sys_name in evs.sys_names:
  if sys_name == 'refC': continue
  scores = np.array([evs.Scores('sys', m)[sys_name] for m in top3_metrics])
  sys_scores[sys_name] = scores.mean(axis=0)

# Run the comparison with the new metric included via the 'extern_metrics'
# argument.
extras = {'top3_avg-refC': sys_scores}
corrs = data.GetCorrelations(
    evs, 'sys', {evs.std_ref}, {'refB'}, include_human, include_outliers,
    gold_name, primary_metrics, domain, extern_metrics=extras)
# Other cells in this notebook unpack four values from CompareMetrics. Slicing
# to the first two keeps this cell working whether two or four are returned.
# NOTE(review): those other cells also pass an extra positional boolean before
# `k`; confirm the argument order against the installed mt_metrics_eval.
ranks, matrix = data.CompareMetrics(
    corrs, scipy.stats.pearsonr, 'none', k, psd, pval)[:2]

data.PrintMetricComparison(ranks, matrix, pval, evs)

In [None]:
# @title Evaluate a new metric using global accuracy

# This requires a bit more work, since we have to produce results for multiple
# evalsets. As before, the result is a slight gain over the best single metric
# (note that the averaged metrics aren't quite the top 3 for the global accuracy
# task).

# Create the new metric, one instance per input evalset. '<REF>' is replaced
# by each evalset's standard reference name.
top3_metrics = ['C-SPECpn-<REF>', 'COMET-QE-MQM_2021-src', 'bleurt-20-<REF>']
extras_list = []
for evs in evs_list:
  top3 = [m.replace('<REF>', evs.std_ref) for m in top3_metrics]
  sys_scores = {}
  for sys_name in evs.sys_names:
    if sys_name == evs.std_ref: continue
    scores = np.array([evs.Scores('sys', m)[sys_name] for m in top3])
    sys_scores[sys_name] = scores.mean(axis=0)
  extras_list.append({f'top3_avg-{evs.std_ref}': sys_scores})

# Run the comparison with the new metric included via the 'extern_metrics_list'
# argument. As in the earlier global-accuracy cell, only the first two returned
# values are kept; the slice works whether two or more values are returned.
ranks, matrix = data.CompareMetricsWithGlobalAccuracy(
    evs_list, main_refs, close_refs, include_human, include_outliers,
    gold_name, primary_metrics, domain, k, psd, pval,
    extern_metrics_list=extras_list)[:2]

# Pass the same p-value used for the comparison, as the earlier
# global-accuracy cell does.
data.PrintMetricComparison(ranks, matrix, pval)

# Ranking metrics using the task interface

This is a higher-level interface designed to make it more convenient to compare
a set of metrics using various different criteria called 'tasks'. The following
code uses this interface to roughly duplicate the comparisons in the previous
section.

In [None]:
# @title Define a set of tasks

# Create TaskSets from dicts that specify attribute/value-list combinations,
# along with fixed assignments to other attributes. Concatenate these into a
# single TaskSet.

# NOTE: this rebinds the notebook-level `k` set in the set-up cells.
k = 1  # Use only a single random draw for demo.
# `lang0` has a single 'lang' value naming all three language pairs at once;
# `langs` lists the three language pairs as separate values.
lang0 = {'test_set': ['wmt21.news'], 'lang': ['en-de,en-ru,zh-en']}
langs = {'test_set': ['wmt21.news'], 'lang': ['en-de', 'en-ru', 'zh-en']}

# The accuracy task takes one close_refs entry per language pair in `lang0`.
taskset = tasks.TaskSet(
    lang0, corr_fcn='accuracy', close_refs=[{'refB'}, set(), set()], k=k)
taskset += tasks.TaskSet(langs, level='sys', corr_fcn='pearson', k=k)
taskset += tasks.TaskSet(langs, level='seg', corr_fcn='pearson', k=k)
taskset += tasks.TaskSet(
    langs, level='seg', avg_by='item', corr_fcn='KendallWithTiesOpt',
    perm_test='pairs', corr_fcn_args={'sample_rate': 0.1}, k=k)

# A TaskSet is just a list of Tasks, so we can make arbitrary changes to
# attribute values. In this case, set the correct close_refs for en-de tasks.

for task in taskset:
  if task.lang == 'en-de': task.close_refs = {'refB'}

# Print task 'names' (attribute/value strings in canonical order).

for t in taskset:
  print(t.name)

In [None]:
# @title Run the tasks

# This first loads the necessary data, then runs each task in sequence to
# produce a TaskSetResults object. Subsequent runs re-use the loaded data.

results = taskset.Run()  # Takes about 5 minutes.

In [None]:
# @title Print raw task results

for result in results:
  print(result.name)
  print(result.Str())

In [None]:
# @title Average ranks for metrics

# To combine the performance of metrics across tasks, we average their task
# ranks. The tasks are weighted to ensure that the total mass for important
# attributes is evenly distributed among the different values those attributes
# take on.
weights = results.AssignWeights(tasks.Attributes())
global_ranks = results.AverageRanks(weights)

# It is also interesting to compare the metric performance on different subsets
# of tasks, for instance split by language.
ranks_by_lp = {}
for val, subset in results.SplitByAttr('lang').items():
  # Weights are recomputed within each per-language subset.
  weights = subset.AssignWeights(tasks.Attributes())
  ranks_by_lp[val] = subset.AverageRanks(weights)

# Print out the comparison, with global ranks first, followed by a breakdown
# by language pair. We only show metrics that are in the intersection of all
# tasks.
# NOTE: `langs` is rebound here from the task-definition dict to a list of
# column headers; the combined three-pair entry is shown as ' all '.
langs = [' all ' if lp == 'en-de,en-ru,zh-en' else lp for lp in ranks_by_lp]
print(''.rjust(24), 'global', ' '.join(langs))
for metric, rank in global_ranks.items():
  # One row per metric: the global rank, then its rank in each language subset.
  ranks_for_metric = [rank] + [d[metric] for d in ranks_by_lp.values()]
  print(f'{metric:<25}', ' '.join(f'{r:5.2f}' for r in ranks_for_metric))
