In [1]:
import pickle
import os
import numpy as np
# import warnings
# warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)



scores_dir = './system-level/scores_ALL/'
scores_files = f'{scores_dir}/*scores.csv' 
outliers_dict  = pickle.load(open('../manual-evaluation/outlier-systems-dict.pk', 'rb'))
def is_outlier(df, syscolname = 'SYSTEM'): 
    lp = df.LP[0][:5]
    outliers = [True if sys in outliers_dict[lp] else False for sys in df[syscolname]]
    return np.array(outliers)
exclude_outliers = lambda scores: scores[~is_outlier(scores)]

 
qe_metrics = ['COMET-QE', 'OpenKiwi-Bert', 'OpenKiwi-XLMR', 'YiSi-2']
exclude_human = ['Human-A.0','Human-B.0', 'Human-P.0', ]

langs=['cs', 'de', 'ja','pl','ru','ta','zh']    

ENTO_LPS = [f'en-{l}' for l in langs]
ENTO_LPS.extend( ['en-iu_full','en-iu_news'])

TOEN_LPS = [f'{l}-en' for l in langs]
TOEN_LPS.extend( ['iu-en','km-en','ps-en'])

ALL_LPS = TOEN_LPS + ENTO_LPS

MULTIREF_TOEN_LPS = ['de-en','de-en_B','de-en_M',  'ru-en', 'ru-en_B', 'ru-en_M', 'zh-en','zh-en_B','zh-en_M', ]
MULTIREF_ENTO_LPS = [ 'en-de','en-de_B','en-de_P','en-de_M', 'en-zh', 'en-zh_B', 'en-zh_M']


EVALHUMAN_LPS = ['de-en', 'ru-en', 'zh-en', 'en-de',   'en-zh']
 
outdir= './system-level/tables'
os.makedirs(outdir, exist_ok = True)


METRIC_ORDER = [    'HUMAN_RAW',
 'sentBLEU', 'BLEU', 'bleu', 'TER', 'ter',
 'chrF++',  'chrF', 'chrf',
 'parbleu',
 'parchrf++', 
 'CharacTER','EED', 
 'YiSi-0', 'SWSS+METEOR',
 'MEE',  
 'prism',
 'YiSi-1',
 'Yisi-combi',
 'bleurt-Yisi-combi',
 'bleurt-combi',
 'BERT-base-L2',
 'BERT-large-L2',
 'mBERT-L2',
 'BLEURT',
 'BLEURT-extended',
 'esim', 
 'paresim-1',
 'COMET',
 'COMET-2R',
 'COMET-HTER',
 'COMET-MQM',
 'COMET-Rank',  
 'BAQ_dyn',
 'BAQ_static',
 'EQ_dyn',
 'EQ_static', 
 'COMET-QE', 
 'OpenKiwi-Bert',
 'OpenKiwi-XLMR', 
 'YiSi-2']




In [None]:


if __name__ == '__main__':
     
    import argparse
    import os
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--scores-dir', type=str, default='data/wmt19_sys_scores', help="Directory with scores files")  
    parser.add_argument('--outliers', action='store_true', default = False, help='also include results without outlier systems')
    parser.add_argument('--winners', action='store_true', default = False, help="get winners using William's test")

    parser.add_argument('--corr-dir', type=str, default=None, help="Directory to store correlations for each language pair, and if 'winners' is true, also saves p-values")  
    parser.add_argument('--outputformat', type=str, default='latex', choices = ['latex','csv'], help="format to save final results tables")
    parser.add_argument('--tables-dir', type=str, default=None, help="Directory to save final results tables")

    args = parser.parse_args()


In [2]:
#for table to print list of outliers

for lp, ol in outliers_dict.items():
    if ol:
        ol = ['\system{'+ o + '}' for o in ol]
        ss = ", ".join(ol)
        print(f'{lp} & {ss} \\\\')

cs-en & \system{CUNI-DocTransformer.1457}, \system{zlabs-nlp.1149} \\
de-en & \system{zlabs-nlp.1153}, \system{yolo.1052}, \system{WMTBiomedBaseline.387} \\
iu-en & \system{NiuTrans.1206}, \system{Facebook_AI.729} \\
ja-en & \system{Online-Z.1640}, \system{zlabs-nlp.66}, \system{Online-G.1564} \\
pl-en & \system{zlabs-nlp.1162} \\
ru-en & \system{zlabs-nlp.1164} \\
ta-en & \system{Online-G.1568}, \system{TALP_UPC.192} \\
zh-en & \system{WMTBiomedBaseline.183} \\
en-cs & \system{zlabs-nlp.1151}, \system{Online-G.1555} \\
en-de & \system{zlabs-nlp.179}, \system{Online-G.1556}, \system{WMTBiomedBaseline.388} \\
en-iu_news & \system{UQAM_TanLe.521}, \system{OPPO.722}, \system{UEDIN.1281} \\
en-iu_full & \system{UQAM_TanLe.521}, \system{OPPO.722}, \system{UEDIN.1281} \\
en-iu & \system{UQAM_TanLe.521}, \system{OPPO.722}, \system{UEDIN.1281} \\
en-pl & \system{Online-A.1576}, \system{zlabs-nlp.180}, \system{Online-Z.1634} \\
en-ta & \system{TALP_UPC.1049}, \system{SJTU-NICT.386}, \system{Onl

In [3]:
import pandas as pd
import glob 
            
from metric_williams import metric_williams
from utils import output_tables, output_combined_tables

class DACorrelation:
    """ stores and returns information related to Pearson/Kendall Tau correlation
    and (optionally) significance values for each language pair """
    
    def __init__(self, scores_dir = None, correlation = 'Pearson', metric_order = METRIC_ORDER, williams = False, include_lps = None, exclude_outliers = None, 
                 exclude_systems = None, include_metrics = None, exclude_metrics = None): 
        self.correlation = correlation
        self.ss = williams  
        self.lps = []  
        self.scores = {}
        self.exclude_outliers = exclude_outliers
        self.metric_order = metric_order
        self.correlations = {}  
        self.pvals = {}   
        if scores_dir:
            scores_files = f'{scores_dir}/*scores.csv'
            print(f'importing scores from dir: {scores_dir}')
            for file in sorted(glob.glob(scores_files)): 
                self.add_scores_file(file, include_lps = include_lps, exclude_systems = exclude_systems, include_metrics = include_metrics, exclude_metrics = exclude_metrics)
        
    def add_scores_file(self, file, include_lps = None, exclude_systems = None, include_metrics = None, exclude_metrics = None):  
#         scores_files = f'{scores_dir}/*scores.csv'
#             print('adding', file)
            scores = pd.read_csv(file, delimiter = '\s', engine='python') 
    
            lp = scores['LP'].values[0] 
            if include_lps:
                if lp not in include_lps:
                    return
                
            if include_metrics:  
                temp=scores[['LP','SYSTEM','HUMAN']].copy()
                for metric in include_metrics:
                    if metric in scores:
                        temp[metric] = scores[metric]
                if len(temp.columns) == 3:
                    print(f'skipping {lp}: None of {include_metrics} are available') 
                    display(scores)
                    return
                scores = temp
            
            if exclude_metrics:
                scores = scores.drop(exclude_metrics, axis=1, errors='ignore')    
                
            if exclude_systems:                
                scores = scores[~scores.SYSTEM.isin(exclude_systems)]  
                
            scores_nonans = scores.dropna(axis = 1) 
            metricsna = set(scores.columns) - set(scores_nonans.columns)

            if len(metricsna) != 0:
                print(f' NaNs for lp {lp} for metrics {metricsna}') 
                if len(scores_nonans.columns) == 3:
                    print(f'skipping {lp}: No metrics without NaNs for lp') 
                    return
                    
            scores = scores.dropna(axis = 1) 
            
            if self.exclude_outliers: 
                temp = self.exclude_outliers(scores)  
#                 print(f'{lp}:  {len(scores) - len(temp)} outliers')
                scores = temp          
#             print(f'{lp}: {len(scores)} systems')
            self.lps.append(lp) 
                
            self.scores[lp] = scores    
            self.compute_corrs(scores)
        

    def compute_corrs(self, scores):  
        lp = scores['LP'].values[0] 
        corrs = pd.DataFrame(scores.corr(self.correlation.lower()).HUMAN[1:].rename(self.correlation))    
        corrs = corrs.sort_values(self.correlation, ascending=False)
        corrs['N'] = len(scores) 

        if self.ss:
            self.pvals[lp], winners = metric_williams(scores)  
            corrs['Winner'] =  winners   
        self.correlations[lp] = corrs
        
    def add_corrs(self, corrs_table):
        """ add """
        for lp in corrs_table.columns:
            corrs = corrs_table[[lp]]
#             corrs['N'] = n[lp]
#             corrs['Winner'] =  False  

#             if winner_set:
#                 for metric in winner_set[lp]:
#                     corrs.loc[metric, 'Winner'] = True
        self.correlations[lp] = corrs

    def get_tables(self, lps, formatter):   
        corrs = [] 
        for lp in lps: 
            corr = self.correlations[lp]
            if 'Winner' not in corr.columns:
                corr['Winner'] = [False for _ in corr.index]
            formattedscores = [formatter(c, w) for c,w in zip(corr[self.correlation], corr.Winner)]    
            corrs.append(pd.DataFrame(index = corr.index, data = {(lp, corr.N[0]): formattedscores }))
            
        res = pd.DataFrame().join(corrs, how='outer', sort=False).fillna('-')
        
        if self.metric_order:
            ordered_metrics = [m for m in  self.metric_order if m in res.index ]
            return res.reindex(ordered_metrics)
        else:
            return res.reindex(sorted(res.index.values,key = lambda x: x.upper()))
    

    def write_corr_files(self, output_dir):
        if self.outliers:
            suffix = '-nooutl'
        else:
            suffix = ''
        """writes correlations and significance results to file for each language pair"""
        for lp in self.lps: 
            lp_ = "".join(lp.split('-'))  
            self.correlations[lp].to_csv( f"{output_dir}/DA-{lp_}-cor{suffix}.csv", sep= '\t')
            if self.ss:
                self.pvals[lp].to_csv(f"{output_dir}/DA-{lp_}-sig{suffix}.csv", sep= '\t')
            
                  


In [17]:

da_allsys = DACorrelation(scores_dir=scores_dir, williams = True, correlation='Pearson',  
                            include_lps = ENTO_LPS + TOEN_LPS, 
                            exclude_outliers = False, 
                            exclude_systems = exclude_human, include_metrics =  None, exclude_metrics = ['HUMAN_RAW'])   
output_tables(da_allsys, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS});
output_tables(da_allsys, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTall-nohuman-ento


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,12,14,11,14,9,15,12,11,11
sentBLEU,0.840,0.934,0.946,0.950,0.981*,0.881,0.927,0.129,0.075
BLEU,0.825,0.928,0.945,0.943,0.980*,0.880,0.928,0.163,0.074
TER,0.814,0.941,0.297,0.893,0.064,0.870,-0.213,0.384,0.357
chrF++,0.833,0.958,0.952,0.956*,0.983*,0.929,0.878,0.328,0.315
chrF,0.826,0.962*,0.951,0.957*,0.982*,0.937,0.923,0.350,0.336
parbleu,0.870,0.910,0.869,0.948,0.959*,0.871,0.962,0.194,0.126
parchrf++,0.860,0.957,0.955,0.953*,0.975*,-,0.948,-,-
CharacTER,0.807,0.961*,0.951,0.935,0.961,0.957,0.905,0.503,0.515
EED,0.817,0.965*,0.955,0.962*,0.980*,0.959*,0.928,0.519,0.483
YiSi-0,0.797,0.953,0.967,0.953*,0.971*,0.929,0.362,0.525,0.505


MTall-nohuman-toen


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,12,12,10,14,11,14,16,11,7,6
sentBLEU,0.844,0.978,0.974*,0.502*,0.916*,0.925,0.948,0.649,0.969,0.888*
BLEU,0.851*,0.985,0.969*,0.549*,0.884,0.916,0.956,0.569,0.969,0.888*
TER,0.845*,0.993,0.974*,0.586*,0.904*,0.805,0.956,0.733*,0.973*,0.935*
chrF++,0.867*,0.997,0.974*,0.538*,0.894,0.953*,0.975*,0.726,0.983,0.900
chrF,0.872*,0.997*,0.968,0.528*,0.890,0.951*,0.976*,0.729,0.978,0.898
parbleu,0.834*,0.986,0.970*,0.562*,0.877,0.908,0.958,0.624,0.971*,0.939*
parchrf++,0.865*,0.998*,0.974*,0.551*,0.885,0.942*,0.976*,0.720,0.985*,0.899
CharacTER,0.844,0.998*,0.970*,0.522*,0.927*,0.965*,0.964,0.763*,0.977*,0.841
EED,0.884*,0.997*,0.974*,0.538*,0.926*,0.958*,0.956,0.821*,0.990*,0.930*
YiSi-0,0.876*,0.998*,0.972*,0.453,0.938*,0.968*,0.956,0.831*,0.986*,0.932*


In [18]:
 
da_noout = DACorrelation(scores_dir=scores_dir, williams = True, correlation='Pearson',  metric_order=METRIC_ORDER,
                            include_lps = None, 
                            exclude_outliers = exclude_outliers, 
                            exclude_systems = exclude_human, include_metrics =  None, exclude_metrics = ['HUMAN_RAW'])    

output_tables(da_noout, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTnooutl-nohuman-ento': ENTO_LPS, 'MTnooutl-nohuman-toen': TOEN_LPS});

output_tables(da_noout, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTnooutl-nohuman-ento': ENTO_LPS, 'MTnooutl-nohuman-toen': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTnooutl-nohuman-ento


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,10,11,11,11,9,12,12,8,8
sentBLEU,0.436,0.823,0.946,0.772*,0.981*,0.852,0.927,0.047,0.172*
BLEU,0.390,0.825,0.945,0.743*,0.980*,0.829,0.928,0.131,0.111*
TER,0.339,0.848*,0.297,0.553,0.064,0.883*,-0.213,0.133*,0.083*
chrF++,0.349,0.850*,0.952,0.783*,0.983*,0.880*,0.878,0.128,0.098*
chrF,0.313,0.862*,0.951,0.793*,0.982*,0.890*,0.923,0.122*,0.091*
parbleu,0.543,0.774,0.869,0.760*,0.959*,0.849*,0.962,0.464*,0.306*
parchrf++,0.438,0.845*,0.955,0.818*,0.975*,-,0.948,-,-
CharacTER,0.269,0.868*,0.951,0.726*,0.961,0.851,0.905,0.008,0.121*
EED,0.271,0.869*,0.955,0.789*,0.980*,0.913*,0.928,0.043,0.122*
YiSi-0,0.270,0.889*,0.967,0.783*,0.971*,0.897*,0.362,0.015,0.095*


MTnooutl-nohuman-toen


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,10,9,7,13,10,12,15,9,7,6
sentBLEU,0.800*,0.786*,0.851*,0.284*,0.833*,0.829*,0.950*,0.469*,0.969,0.888*
BLEU,0.800*,0.778,0.826,0.355*,0.761,0.807*,0.957*,0.348,0.969,0.888*
TER,0.783*,0.766*,0.752,0.346*,0.829*,0.795*,0.911,0.616*,0.973*,0.935*
chrF++,0.804*,0.699,0.871*,0.328*,0.833*,0.830*,0.955*,0.392,0.983,0.900
chrF,0.806*,0.687,0.861*,0.312*,0.831,0.828*,0.954*,0.337,0.978,0.898
parbleu,0.774*,0.838*,0.833*,0.342*,0.744,0.801*,0.953*,0.398,0.971*,0.939*
parchrf++,0.810*,0.708,0.877*,0.347*,0.823,0.825*,0.956*,0.296,0.985*,0.899
CharacTER,0.812*,0.687,0.895*,0.325*,0.869*,0.880*,0.950*,0.410*,0.977*,0.841
EED,0.838*,0.752*,0.904*,0.299*,0.872*,0.862*,0.932,0.587*,0.990*,0.930*
YiSi-0,0.825*,0.786*,0.867*,0.207*,0.874*,0.861*,0.918,0.563*,0.986*,0.932*


In [20]:
 
output_combined_tables(da_allsys, da_noout, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTcombined-nohuman-ento': ENTO_LPS, 'MTcombined-nohuman-toen': TOEN_LPS}) ;


output_combined_tables(da_allsys, da_noout, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTcombined-nohuman-ento': ENTO_LPS, 'MTcombined-nohuman-toen': TOEN_LPS}) ;



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTcombined-nohuman-ento


Unnamed: 0_level_0,en-cs,en-cs,en-de,en-de,en-ja,en-pl,en-pl,en-ru,en-ta,en-ta,en-zh,en-iu_full,en-iu_full,en-iu_news,en-iu_news
Unnamed: 0_level_1,All,-out,All,-out,All,All,-out,All,All,-out,All,All,-out,All,-out
Unnamed: 0_level_2,12,10,14,11,11,14,11,9,15,12,12,11,8,11,8
sentBLEU,0.840,0.436,0.934,0.823,0.946,0.950,0.772*,0.981*,0.881,0.852,0.927,0.129,0.047,0.075,0.172*
BLEU,0.825,0.390,0.928,0.825,0.945,0.943,0.743*,0.980*,0.880,0.829,0.928,0.163,0.131,0.074,0.111*
TER,0.814,0.339,0.941,0.848*,0.297,0.893,0.553,0.064,0.870,0.883*,-0.213,0.384,0.133*,0.357,0.083*
chrF++,0.833,0.349,0.958,0.850*,0.952,0.956*,0.783*,0.983*,0.929,0.880*,0.878,0.328,0.128,0.315,0.098*
chrF,0.826,0.313,0.962*,0.862*,0.951,0.957*,0.793*,0.982*,0.937,0.890*,0.923,0.350,0.122*,0.336,0.091*
parbleu,0.870,0.543,0.910,0.774,0.869,0.948,0.760*,0.959*,0.871,0.849*,0.962,0.194,0.464*,0.126,0.306*
parchrf++,0.860,0.438,0.957,0.845*,0.955,0.953*,0.818*,0.975*,-,-,0.948,-,-,-,-
CharacTER,0.807,0.269,0.961*,0.868*,0.951,0.935,0.726*,0.961,0.957,0.851,0.905,0.503,0.008,0.515,0.121*
EED,0.817,0.271,0.965*,0.869*,0.955,0.962*,0.789*,0.980*,0.959*,0.913*,0.928,0.519,0.043,0.483,0.122*
YiSi-0,0.797,0.270,0.953,0.889*,0.967,0.953*,0.783*,0.971*,0.929,0.897*,0.362,0.525,0.015,0.505,0.095*


MTcombined-nohuman-toen


Unnamed: 0_level_0,cs-en,cs-en,de-en,de-en,ja-en,ja-en,pl-en,pl-en,ru-en,ru-en,ta-en,ta-en,zh-en,zh-en,iu-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,All,-out,All,-out,All,-out,All,-out,All,-out,All,-out,All,-out,All,-out,All,All
Unnamed: 0_level_2,12,10,12,9,10,7,14,13,11,10,14,12,16,15,11,9,7,6
sentBLEU,0.844,0.800*,0.978,0.786*,0.974*,0.851*,0.502*,0.284*,0.916*,0.833*,0.925,0.829*,0.948,0.950*,0.649,0.469*,0.969,0.888*
BLEU,0.851*,0.800*,0.985,0.778,0.969*,0.826,0.549*,0.355*,0.884,0.761,0.916,0.807*,0.956,0.957*,0.569,0.348,0.969,0.888*
TER,0.845*,0.783*,0.993,0.766*,0.974*,0.752,0.586*,0.346*,0.904*,0.829*,0.805,0.795*,0.956,0.911,0.733*,0.616*,0.973*,0.935*
chrF++,0.867*,0.804*,0.997,0.699,0.974*,0.871*,0.538*,0.328*,0.894,0.833*,0.953*,0.830*,0.975*,0.955*,0.726,0.392,0.983,0.900
chrF,0.872*,0.806*,0.997*,0.687,0.968,0.861*,0.528*,0.312*,0.890,0.831,0.951*,0.828*,0.976*,0.954*,0.729,0.337,0.978,0.898
parbleu,0.834*,0.774*,0.986,0.838*,0.970*,0.833*,0.562*,0.342*,0.877,0.744,0.908,0.801*,0.958,0.953*,0.624,0.398,0.971*,0.939*
parchrf++,0.865*,0.810*,0.998*,0.708,0.974*,0.877*,0.551*,0.347*,0.885,0.823,0.942*,0.825*,0.976*,0.956*,0.720,0.296,0.985*,0.899
CharacTER,0.844,0.812*,0.998*,0.687,0.970*,0.895*,0.522*,0.325*,0.927*,0.869*,0.965*,0.880*,0.964,0.950*,0.763*,0.410*,0.977*,0.841
EED,0.884*,0.838*,0.997*,0.752*,0.974*,0.904*,0.538*,0.299*,0.926*,0.872*,0.958*,0.862*,0.956,0.932,0.821*,0.587*,0.990*,0.930*
YiSi-0,0.876*,0.825*,0.998*,0.786*,0.972*,0.867*,0.453,0.207*,0.938*,0.874*,0.968*,0.861*,0.956,0.918,0.831*,0.563*,0.986*,0.932*


In [7]:

kt_allsys = DACorrelation(scores_dir=scores_dir, williams = False, correlation='Kendall',  metric_order=METRIC_ORDER,
#                             include_lps = ENTO_LPS + TOEN_LPS, 
                            exclude_outliers = False, 
                            exclude_systems = exclude_human, include_metrics = None,)   
output_tables(kt_allsys, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS})

output_tables(kt_allsys, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTall-nohuman-ento


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,12,14,11,14,9,15,12,11,11
HUMAN_RAW,1.000,0.868,0.964,0.846,0.778,0.810,0.818,0.600,0.600
sentBLEU,0.515,0.802,0.855,0.604,0.944,0.867,0.727,0.236,0.273
BLEU,0.515,0.802,0.818,0.582,0.889,0.829,0.727,0.236,0.236
TER,0.515,0.824,0.018,0.641,0.556,0.752,0.242,0.309,0.309
chrF++,0.485,0.868,0.782,0.604,0.889,0.829,0.727,0.309,0.309
chrF,0.485,0.868,0.818,0.604,0.889,0.810,0.727,0.345,0.309
parbleu,0.504,0.736,0.611,0.633,0.761,0.842,0.718,0.404,0.345
parchrf++,0.515,0.846,0.818,0.670,0.889,-,0.727,-,-
CharacTER,0.515,0.890,0.782,0.560,0.944,0.771,0.697,0.236,0.345
EED,0.545,0.868,0.782,0.604,0.833,0.867,0.727,0.273,0.273


MTall-nohuman-toen


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,12,12,10,14,11,14,16,11,7,6
HUMAN_RAW,0.727,0.758,0.778,0.429,0.673,0.604,0.65,0.891,0.905,1.000
sentBLEU,0.788,0.758,0.733,0.297,0.564,0.692,0.85,0.455,0.619,0.600
BLEU,0.848,0.697,0.778,0.407,0.455,0.692,0.833,0.309,0.714,0.600
TER,0.758,0.788,0.689,0.287,0.600,0.780,0.8,0.514,0.878,0.867
chrF++,0.818,0.697,0.778,0.407,0.673,0.714,0.85,0.418,0.619,0.733
chrF,0.818,0.727,0.822,0.363,0.709,0.714,0.833,0.418,0.619,0.733
parbleu,0.809,0.779,0.778,0.420,0.491,0.685,0.807,0.404,0.714,0.867
parchrf++,0.818,0.727,0.822,0.407,0.709,0.714,0.817,0.491,0.619,0.733
CharacTER,0.758,0.758,0.822,0.341,0.745,0.692,0.8,0.527,0.810,0.733
EED,0.788,0.727,0.733,0.297,0.782,0.758,0.833,0.636,0.714,0.733


In [4]:

da_noout_mref = DACorrelation(scores_dir=scores_dir, williams = True, correlation='Pearson',  
                            include_lps = None, metric_order = METRIC_ORDER,
                            exclude_outliers = exclude_outliers, 
                            exclude_systems = exclude_human, include_metrics = None, exclude_metrics = qe_metrics + ['HUMAN_RAW', 'BAQ_static', 'BAQ_dyn', 'EQ_static', 'EQ_dyn'])    

output_tables(da_noout_mref, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTnooutl-nohuman-alllp-mref': MULTIREF_ENTO_LPS + MULTIREF_TOEN_LPS});

output_tables(da_noout_mref, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTnooutl-nohuman-alllp-mref': MULTIREF_ENTO_LPS + MULTIREF_TOEN_LPS});


importing scores from dir: ./system-level/scores_ALL/


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTnooutl-nohuman-alllp-mref


Unnamed: 0_level_0,en-de,en-de_B,en-de_P,en-de_M,en-zh,en-zh_B,en-zh_M,de-en,de-en_B,de-en_M,ru-en,ru-en_B,ru-en_M,zh-en,zh-en_B,zh-en_M
Unnamed: 0_level_1,11,11,11,11,12,12,12,9,9,9,10,10,10,15,15,15
sentBLEU,0.823,0.837,0.815,0.827*,0.927,0.911,0.919,0.786*,0.763,0.788*,0.833*,0.850*,0.837*,0.950*,0.928,0.944*
BLEU,0.825,0.844,0.830,0.822,0.928,0.899,0.913,0.778,0.797,0.805*,0.761,0.780,0.775,0.957*,0.934*,0.949*
TER,0.848*,0.860*,0.859*,0.852*,-0.213,-0.200,-0.203,0.766*,0.744*,0.758*,0.829*,0.832*,0.853*,0.911,0.875,0.911
chrF++,0.850*,0.866*,0.876*,0.858*,0.878,0.915,0.885,0.699,0.681,0.704,0.833*,0.839*,0.843*,0.955*,0.948*,0.952*
chrF,0.862*,0.874*,0.883*,-,0.923,0.912,-,0.687,0.683,-,0.831,0.839*,-,0.954*,0.947*,-
parbleu,0.774,0.796,0.724,0.794,0.962,0.955,0.959,0.838*,0.831*,0.829*,0.744,0.767,0.756,0.953*,0.934*,0.945*
parchrf++,0.845*,0.863*,0.865*,0.856*,0.948,0.966*,0.896,0.708,0.704,0.669,0.823,0.834*,0.832*,0.956*,0.950*,0.956*
CharacTER,0.868*,0.889*,0.835*,0.878*,0.905,0.908,0.901,0.687,0.696,0.713*,0.869*,0.853*,0.873*,0.950*,0.942*,0.949*
EED,0.869*,0.871*,0.867*,0.867*,0.928,0.923,0.930,0.752*,0.747*,0.752*,0.872*,0.868*,0.879*,0.932,0.922,0.932
YiSi-0,0.889*,0.882*,0.873*,0.886*,0.362,0.273,0.332,0.786*,0.790*,0.794*,0.874*,0.867*,0.880*,0.918,0.911,0.918


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


In [5]:
###making a copy of the ende scores files for convenience as we want to get metric correlations of en-de with HUMAN-A +MT systems, and then 
# HUMAN-B +MT systems 
!cp  '{scores_dir}/DA-newstestP2020-ende-sys-scores.csv'  '{scores_dir}/DA-newstestPa2020-ende-sys-scores.csv'
!cp  '{scores_dir}/DA-newstestP2020-ende-sys-scores.csv'  '{scores_dir}/DA-newstestPb2020-ende-sys-scores.csv'

!sed -i 's/en-de_P/en-de_Pa/'  '{scores_dir}/DA-newstestPa2020-ende-sys-scores.csv'
!sed -i 's/en-de_P/en-de_Pb/'  '{scores_dir}/DA-newstestPb2020-ende-sys-scores.csv'

EVALHUMAN_LPS = ['de-en', 'ru-en', 'zh-en', 'en-de', 'en-de_B', 'en-de_P', 'en-zh', 'en-zh_B']
# EVALHUMAN_LPS = ['de-en', 'ru-en', 'zh-en', 'en-de',   'en-zh' ]

da_noout_human = DACorrelation(scores_dir=None, williams = True, correlation='Pearson',  
                            include_lps = EVALHUMAN_LPS, 
                            exclude_outliers = exclude_outliers, 
                            exclude_systems = ['Human-A.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])   



file = f'{scores_dir}/DA-newstest2020-deen-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-A.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])


file = f'{scores_dir}/DA-newstest2020-ruen-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-A.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])


file = f'{scores_dir}/DA-newstest2020-zhen-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-A.0','WMTBiomedBaseline.183'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])




file = f'{scores_dir}/DA-newstest2020-ende-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-A.0','Human-P.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])

file = f'{scores_dir}/DA-newstestPb2020-ende-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-P.0','Human-A.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])

file = f'{scores_dir}/DA-newstestB2020-ende-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-B.0','Human-P.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])


file = f'{scores_dir}/DA-newstestPa2020-ende-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-P.0','Human-B.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])



file = f'{scores_dir}/DA-newstest2020-enzh-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-A.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])
# da_noout_human.correlations.keys()


file = f'{scores_dir}/DA-newstestB2020-enzh-sys-scores.csv'
da_noout_human.add_scores_file(file,  exclude_systems = ['Human-B.0'], include_metrics = None, exclude_metrics = ['HUMAN_RAW'])
da_noout_human.correlations.keys()


output_tables(da_noout_human, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'MTnooutl-inclhuman': [ 'en-de', 'en-de_Pb','en-de_B', 'en-de_Pa','en-zh', 'en-zh_B','de-en', 'ru-en', 'zh-en']}) ;


output_tables(da_noout_human, outputformat='latex', output_dir=outdir, 
              lp_groups = {'MTnooutl-inclhuman': [ 'en-de', 'en-de_Pb','en-de_B', 'en-de_Pa','en-zh', 'en-zh_B','de-en', 'ru-en', 'zh-en']}) ;
 

dict_keys(['de-en', 'ru-en', 'zh-en', 'en-de', 'en-de_Pb', 'en-de_B', 'en-de_Pa', 'en-zh', 'en-zh_B'])

In [12]:
# BLEU vs PARBLEU
da_noout = DACorrelation(scores_dir=scores_dir, williams = True, correlation='Pearson',  metric_order=METRIC_ORDER,
                            include_lps = None, 
                            exclude_outliers = exclude_outliers, 
                            exclude_systems = exclude_human, include_metrics =  ['BLEU',  'parbleu'], exclude_metrics = ['HUMAN_RAW'])    

output_tables(da_noout, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'Out-of-English MT systems': ENTO_LPS, 'To-English MT systems': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/
Out-of-English MT systems


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,10,11,11,11,9,12,12,8,8
BLEU,0.390,0.825*,0.945*,0.743*,0.980*,0.829*,0.928,0.131,0.111*
parbleu,0.543*,0.774*,0.869*,0.760*,0.959*,0.849*,0.962*,0.464*,0.306*


To-English MT systems


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,10,9,7,13,10,12,15,9,7,6
BLEU,0.800*,0.778,0.826*,0.355*,0.761*,0.807*,0.957*,0.348*,0.969*,0.888*
parbleu,0.774*,0.838*,0.833*,0.342*,0.744*,0.801*,0.953*,0.398*,0.971*,0.939*


In [13]:
# esim vs paresim
da_noout = DACorrelation(scores_dir=scores_dir, williams = True, correlation='Pearson',  metric_order=METRIC_ORDER,
                            include_lps = None, 
                            exclude_outliers = exclude_outliers, 
                            exclude_systems = exclude_human, include_metrics =  ['esim',  'paresim'], exclude_metrics = ['HUMAN_RAW'])    

output_tables(da_noout, outputformat='display_nb', output_dir=outdir, 
              lp_groups = {'Out-of-English MT systems': ENTO_LPS, 'To-English MT systems': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/
Out-of-English MT systems


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,10,11,11,11,9,12,12,8,8
esim,0.575*,0.894*,0.993*,0.698*,0.967*,0.833*,0.972*,0.365*,0.418*


To-English MT systems


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,10,9,7,13,10,12,15,9,7,6
esim,0.716*,0.808*,0.822*,0.358*,0.834*,0.801*,0.910*,0.514*,0.929*,0.929*


In [14]:
# AVERAGE KENDALL TAU over all LPS
def ident(x,_):
    return x
kt_allsys.get_tables(ENTO_LPS + TOEN_LPS, formatter=ident).replace('-', np.nan).dropna().mean(axis=1)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


HUMAN_RAW          0.773547
sentBLEU           0.640989
BLEU               0.629937
TER                0.585661
chrF++             0.658379
chrF               0.663862
parbleu            0.647740
CharacTER          0.669900
EED                0.672373
YiSi-0             0.660308
prism              0.678928
YiSi-1             0.684312
BLEURT-extended    0.687898
esim               0.710759
paresim-1          0.713191
COMET              0.694521
COMET-2R           0.693464
COMET-HTER         0.669445
COMET-MQM          0.671558
COMET-Rank         0.604501
COMET-QE           0.654176
OpenKiwi-Bert      0.453988
OpenKiwi-XLMR      0.588314
YiSi-2             0.531612
dtype: float64

In [15]:
# AVERAGE Pearson over all LPS

da_noout.get_tables(ENTO_LPS + TOEN_LPS, formatter=ident).replace('-', np.nan).dropna().mean(axis=1)


esim    0.754524
dtype: float64

In [63]:
# Ktau correlation with RAW scores: made with modifying the code above to use HUMAN_RAW as ground truth
# kt_allsys = DACorrelation(scores_dir=scores_dir, williams = False, correlation='Kendall',  metric_order=METRIC_ORDER,
#                             include_lps = ENTO_LPS + TOEN_LPS, 
#                             exclude_outliers = False, 
#                             exclude_systems = exclude_human, include_metrics = None,)   
# output_tables(kt_allsys, outputformat='latex', output_dir=outdir, 
#               lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS})

# output_tables(kt_allsys, outputformat='display_nb', output_dir=outdir, 
#               lp_groups = {'MTall-nohuman-ento': ENTO_LPS, 'MTall-nohuman-toen': TOEN_LPS});

importing scores from dir: ./system-level/scores_ALL/
cs-en: 12 systems
de-en: 12 systems
en-cs: 12 systems
en-de: 14 systems
en-iu_full: 11 systems
en-iu_news: 11 systems
en-ja: 11 systems
en-pl: 14 systems
en-ru: 9 systems
en-ta: 15 systems
en-zh: 12 systems
iu-en: 11 systems
ja-en: 10 systems
km-en: 7 systems
pl-en: 14 systems
ps-en: 6 systems
ru-en: 11 systems
ta-en: 14 systems
zh-en: 16 systems


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  verify_integrity=True)


MTall-nohuman-ento


Unnamed: 0_level_0,en-cs,en-de,en-ja,en-pl,en-ru,en-ta,en-zh,en-iu_full,en-iu_news
Unnamed: 0_level_1,12,14,11,14,9,15,12,11,11
HUMAN_RAW,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000
sentBLEU,0.515,0.714,0.818,0.626,0.833,0.867,0.848,-0.091,-0.055
BLEU,0.515,0.758,0.782,0.604,0.778,0.829,0.848,-0.091,-0.091
TER,0.515,0.736,-0.018,0.575,0.556,0.790,0.364,-0.091,-0.091
chrF++,0.485,0.824,0.745,0.626,0.889,0.829,0.788,-0.091,-0.091
chrF,0.485,0.824,0.782,0.626,0.889,0.810,0.848,-0.055,-0.091
parbleu,0.504,0.648,0.574,0.656,0.873,0.861,0.840,0.037,0.018
parchrf++,0.515,0.846,0.782,0.692,0.889,-,0.788,-,-
CharacTER,0.515,0.846,0.745,0.538,0.833,0.771,0.758,-0.091,-0.055
EED,0.545,0.824,0.745,0.582,0.833,0.867,0.848,-0.055,-0.127


MTall-nohuman-toen


Unnamed: 0_level_0,cs-en,de-en,ja-en,pl-en,ru-en,ta-en,zh-en,iu-en,km-en,ps-en
Unnamed: 0_level_1,12,12,10,14,11,14,16,11,7,6
HUMAN_RAW,1.000,1.000,1.000,1.000,1.000,1.000,1.0,1.000,1.000,1.000
sentBLEU,0.636,0.758,0.778,0.209,0.382,0.604,0.567,0.345,0.714,0.600
BLEU,0.697,0.697,0.822,0.143,0.418,0.604,0.483,0.200,0.810,0.600
TER,0.667,0.727,0.733,0.243,0.273,0.604,0.483,0.404,0.976,0.867
chrF++,0.667,0.697,0.733,0.143,0.564,0.670,0.6,0.309,0.714,0.733
chrF,0.667,0.667,0.689,0.099,0.527,0.670,0.583,0.309,0.714,0.733
parbleu,0.718,0.779,0.822,0.155,0.382,0.597,0.521,0.294,0.810,0.867
parchrf++,0.667,0.667,0.689,0.143,0.527,0.670,0.567,0.382,0.714,0.733
CharacTER,0.606,0.697,0.778,0.165,0.491,0.692,0.517,0.418,0.714,0.733
EED,0.636,0.667,0.689,0.209,0.455,0.626,0.55,0.527,0.810,0.733
