In [None]:
!pip3 install numpy==1.20.1
!pip3 install pandas==1.2.3
!pip3 install scipy==1.6.2
!pip3 install statsmodels==0.12.2

In [1]:
import copy
import json
import numpy as np
import os
import pandas as pd
from scipy.stats import zscore
from statsmodels.stats.inter_rater import fleiss_kappa
from scipy.stats import ranksums

SYSTEMS_PATH = '../../results/en'
REFERENCES_PATH = '../../../references/references-en.json'
DOMAIN_PATH = '../../../references/ref2types-en.json'

# Parsing the data

In [2]:
rdfs = json.load(open(REFERENCES_PATH))
sys_files = [w for w in os.listdir(SYSTEMS_PATH) if not w.startswith('.') and not w.endswith('.json')] # exclude result file
domains = json.load(open(DOMAIN_PATH))

doc_id = 1
data = []
for sys_file in sys_files:
    results = json.load(open(os.path.join(SYSTEMS_PATH, sys_file, 'primary.json')))
    submission_id = sys_file

    for sample_id in results:
        entry = [w for w in rdfs['entries'] if list(w.keys())[0] == sample_id][0]
        for worker_id in results[sample_id]:
            assign = results[sample_id][worker_id]
            inp = {
                'id': doc_id,
                'sample_id': sample_id,
                'domain': domains['Id' + str(sample_id)],
                'submission_id': submission_id,
                'worker_id': worker_id,
                'category': entry[sample_id]['category'],
                'size': entry[sample_id]['size'],
            }
            inp.update(assign)
            data.append(inp)
            doc_id += 1

# Inter-rater Agreement
## Fleiss' Kappa

Discretize the ratings in 5 categories

In [3]:
n_cat, max_range = 5, 100 # number of categories
data_discretized = []

ids = [w['id'] for w in data]
correctness = [int((n_cat* w['Correctness']) / (max_range+1)) for w in data]
coverage = [int((n_cat* w['DataCoverage']) / (max_range+1)) for w in data]
fluency = [int((n_cat* w['Fluency']) / (max_range+1)) for w in data]
relevance = [int((n_cat* w['Relevance']) / (max_range+1)) for w in data]
structure = [int((n_cat* w['TextStructure']) / (max_range+1)) for w in data]
    
for i, id_ in enumerate(ids):
    for j, row in enumerate(data):
        if row['id'] == id_:
            row_ = copy.copy(row)
            row_['Correctness'] = correctness[i]
            row_['DataCoverage'] = coverage[i]
            row_['Fluency'] = fluency[i]
            row_['Relevance'] = relevance[i]
            row_['TextStructure'] = structure[i]
            data_discretized.append(row_)
            break

data_discretized = sorted(data_discretized, key=lambda x: x['id'])         

Computing the Fleiss' Kappa agreements

In [4]:
assignments = set([(w['submission_id'], w['sample_id']) for w in data_discretized])

correctness = np.zeros((len(assignments), n_cat))
coverage = np.zeros((len(assignments), n_cat))
fluency = np.zeros((len(assignments), n_cat))
relevance = np.zeros((len(assignments), n_cat))
structure = np.zeros((len(assignments), n_cat))

for i, (submission_id, sample_id) in enumerate(assignments):
    fdata = [w for w in data_discretized if w['submission_id'] == submission_id and w['sample_id'] == sample_id]
    
    for rating in fdata:
        correctness[i, rating['Correctness']-1] += 1
        coverage[i, rating['DataCoverage']-1] += 1
        fluency[i, rating['Fluency']-1] += 1
        relevance[i, rating['Relevance']-1] += 1
        structure[i, rating['TextStructure']-1] += 1      

# delete trials with less than 3 ratings
new_correctness = []
new_coverage = []
new_fluency = []
new_relevance = []
new_structure = []
for i in range(correctness.shape[0]):
    if correctness[i].sum() == 3:
        new_correctness.append(correctness[i])
    if coverage[i].sum() == 3:
        new_coverage.append(coverage[i])
    if fluency[i].sum() == 3:
        new_fluency.append(fluency[i])
    if relevance[i].sum() == 3:
        new_relevance.append(relevance[i])
    if structure[i].sum() == 3:
        new_structure.append(structure[i])
    
pd.DataFrame({"Fleiss' Kappa": {
    'Correctness': fleiss_kappa(new_correctness),
    'Data Coverage': fleiss_kappa(new_coverage),
    'Fluency': fleiss_kappa(new_fluency),
    'Relevance': fleiss_kappa(new_relevance),
    'Text Structure': fleiss_kappa(new_structure),
}}).round(3)

Unnamed: 0,Fleiss' Kappa
Correctness,0.166
Data Coverage,0.185
Fluency,0.113
Relevance,0.129
Text Structure,0.107


# Human Evaluation

Results of the human evaluation for the participating systems according to original ratings of correctness, data coverage, fluency, relevance and text structure.

In [5]:
import json
submission_ids = sorted(list(set([w['submission_id'] for w in data])))
sample_ids = sorted(list(set([w['sample_id'] for w in data])), key=lambda x: int(x))

finaldata = []
for submission_id in submission_ids:
  for sample_id in sample_ids:
    fdata = [w for w in data if w['submission_id'] == submission_id and w['sample_id'] == sample_id]

    if len(fdata) > 0:
      finaldata.append({
        'submission_id': submission_id,
        'size': fdata[0]['size'],
        'sample_id': sample_id,
        'domain': fdata[0]['domain'],
        'category': fdata[0]['category'],
        'Correctness': np.nan_to_num(np.mean(np.nan_to_num([w['Correctness'] for w in fdata]))),
        'DataCoverage': np.nan_to_num(np.mean(np.nan_to_num([w['DataCoverage'] for w in fdata]))),
        'Fluency': np.nan_to_num(np.mean(np.nan_to_num([w['Fluency'] for w in fdata]))),
        'Relevance': np.nan_to_num(np.mean(np.nan_to_num([w['Relevance'] for w in fdata]))),
        'TextStructure': np.nan_to_num(np.mean(np.nan_to_num([w['TextStructure'] for w in fdata])))
      })

json.dump(finaldata, open(SYSTEMS_PATH + '/english_humeval_data_all_teams.json', 'w'), separators=(',', ':'), indent=4)


In [6]:
df = pd.DataFrame(finaldata)

submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]
submissions.agg([np.mean, np.std]).sort_values(by=('Correctness', 'mean'), ascending=False).round(3)

  submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]


Unnamed: 0_level_0,Correctness,Correctness,DataCoverage,DataCoverage,Fluency,Fluency,Relevance,Relevance,TextStructure,TextStructure
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
submission_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
WebNLG-2020-reference,94.149,7.362,95.442,5.875,89.846,10.473,94.392,7.282,92.105,8.231
bt5,93.583,8.819,93.836,9.309,88.688,10.721,95.22,6.088,91.914,8.141
Amazon_AI_(Shanghai),93.531,9.079,94.393,8.347,90.286,9.461,95.196,7.365,92.951,7.004
OSU_Neural_NLG,93.409,8.285,95.123,6.569,90.066,10.876,94.615,7.272,92.438,7.46
FBConvAI,92.7,10.618,93.169,11.532,90.837,10.217,93.898,10.613,93.089,7.86
DANGNT-SGU,92.489,9.164,95.315,6.63,78.594,16.795,94.856,6.363,83.501,14.403
Baseline-FORGE2020,92.313,9.429,93.417,9.917,82.895,15.118,94.314,8.598,87.894,11.777
RALI,92.128,8.372,95.204,6.324,77.759,18.33,94.81,6.572,81.835,15.632
NUIG-DSI,92.053,11.451,92.063,9.908,88.898,10.49,94.061,7.52,91.588,9.495
cuni-ufal,91.587,10.69,93.291,9.332,87.642,11.666,94.555,8.17,90.752,8.941


# Human Evaluation (Z-Scores)

Results of the human evaluation for the participating systems according to normalized z-scores for correctness, data coverage, fluency, relevance and text structure.

In [7]:
normdata = []
worker_ids = set([w['worker_id'] for w in data])
for worker_id in worker_ids:
    fdata = [w for w in data if w['worker_id'] == worker_id]
    
    ids = [w['id'] for w in fdata]
    correctness = zscore([w['Correctness'] for w in fdata])
    coverage = zscore([w['DataCoverage'] for w in fdata])
    fluency = zscore([w['Fluency'] for w in fdata])
    relevance = zscore([w['Relevance'] for w in fdata])
    structure = zscore([w['TextStructure'] for w in fdata])
    
    for i, id_ in enumerate(ids):
        for j, row in enumerate(data):
            if row['id'] == id_:
                row_ = copy.copy(row)
                row_['Correctness'] = correctness[i]
                row_['DataCoverage'] = coverage[i]
                row_['Fluency'] = fluency[i]
                row_['Relevance'] = relevance[i]
                row_['TextStructure'] = structure[i]
                normdata.append(row_)
                break

In [8]:
submission_ids = sorted(list(set([w['submission_id'] for w in data])))
sample_ids = sorted(list(set([w['sample_id'] for w in data])), key=lambda x: int(x))

final_normdata = []
for submission_id in submission_ids:
  for sample_id in sample_ids:
    fdata = [w for w in normdata if w['submission_id'] == submission_id and w['sample_id'] == sample_id]

    if len(fdata) > 0:
      final_normdata.append({
        'submission_id': submission_id,
        'size': fdata[0]['size'],
        'sample_id': sample_id,
        'domain': fdata[0]['domain'],
        'category': fdata[0]['category'],
        'Correctness': np.nan_to_num(np.mean(np.nan_to_num([w['Correctness'] for w in fdata]))),
        'DataCoverage': np.nan_to_num(np.mean(np.nan_to_num([w['DataCoverage'] for w in fdata]))),
        'Fluency': np.nan_to_num(np.mean(np.nan_to_num([w['Fluency'] for w in fdata]))),
        'Relevance': np.nan_to_num(np.mean(np.nan_to_num([w['Relevance'] for w in fdata]))),
        'TextStructure': np.nan_to_num(np.mean(np.nan_to_num([w['TextStructure'] for w in fdata])))
      })

df = pd.DataFrame(final_normdata)
submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]
submissions.agg([np.mean, np.std]).sort_values(by=('Correctness', 'mean'), ascending=False).round(3)

  submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]


Unnamed: 0_level_0,Correctness,Correctness,DataCoverage,DataCoverage,Fluency,Fluency,Relevance,Relevance,TextStructure,TextStructure
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
submission_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
WebNLG-2020-reference,0.256,0.391,0.251,0.333,0.279,0.474,0.139,0.496,0.254,0.454
Amazon_AI_(Shanghai),0.248,0.411,0.222,0.389,0.326,0.412,0.214,0.406,0.305,0.375
bt5,0.224,0.465,0.161,0.489,0.218,0.498,0.184,0.375,0.236,0.428
OSU_Neural_NLG,0.224,0.422,0.235,0.403,0.323,0.493,0.163,0.464,0.289,0.382
FBConvAI,0.206,0.493,0.151,0.587,0.327,0.43,0.117,0.512,0.319,0.357
Baseline-FORGE2020,0.191,0.42,0.171,0.508,0.011,0.626,0.162,0.466,0.039,0.666
NUIG-DSI,0.189,0.542,0.116,0.591,0.233,0.517,0.161,0.394,0.258,0.455
DANGNT-SGU,0.179,0.476,0.259,0.37,-0.161,0.794,0.185,0.394,-0.203,0.863
RALI,0.163,0.553,0.272,0.426,-0.241,0.761,0.171,0.513,-0.285,0.851
cuni-ufal,0.161,0.514,0.155,0.494,0.185,0.578,0.164,0.464,0.208,0.49


# Statistical Testing

## Wilcoxon rank-sum significant test

In [9]:
def parse(data, normdata):
    correctness, coverage, fluency, relevance, structure = {}, {}, {}, {}, {}
    normcorrectness, normcoverage, normfluency, normrelevance, normstructure = {}, {}, {}, {}, {}

    submission_ids = sorted(list(set([w['submission_id'] for w in data])))
    sample_ids = sorted(list(set([w['sample_id'] for w in data])), key=lambda x: int(x))
    for i, submission_id in enumerate(submission_ids):
        if submission_id not in correctness:
            correctness[submission_id] = []
            coverage[submission_id] = []
            fluency[submission_id] = []
            relevance[submission_id] = []
            structure[submission_id] = []

            normcorrectness[submission_id] = []
            normcoverage[submission_id] = []
            normfluency[submission_id] = []
            normrelevance[submission_id] = []
            normstructure[submission_id] = []
        
        for sample_id in sample_ids:
          fdata = [w for w in data if w['submission_id'] == submission_id and w['sample_id'] == sample_id]
          fnormdata = [w for w in normdata if w['submission_id'] == submission_id and w['sample_id'] == sample_id]
          
          correctness[submission_id].append(np.nan_to_num(np.mean([w['Correctness'] for w in fdata])))
          coverage[submission_id].append(np.nan_to_num(np.mean([w['DataCoverage'] for w in fdata])))
          fluency[submission_id].append(np.nan_to_num(np.mean([w['Fluency'] for w in fdata])))
          relevance[submission_id].append(np.nan_to_num(np.mean([w['Relevance'] for w in fdata])))
          structure[submission_id].append(np.nan_to_num(np.mean([w['TextStructure'] for w in fdata])))

          # Average the z-scores (setting nans to zeros) of the three turkers for each trial of each system
          normcorrectness[submission_id].append(np.nan_to_num(np.mean(np.nan_to_num([w['Correctness'] for w in fnormdata]))))
          normcoverage[submission_id].append(np.nan_to_num(np.mean(np.nan_to_num([w['DataCoverage'] for w in fnormdata]))))
          normfluency[submission_id].append(np.nan_to_num(np.mean(np.nan_to_num([w['Fluency'] for w in fnormdata]))))
          normrelevance[submission_id].append(np.nan_to_num(np.mean(np.nan_to_num([w['Relevance'] for w in fnormdata]))))
          normstructure[submission_id].append(np.nan_to_num(np.mean(np.nan_to_num([w['TextStructure'] for w in fnormdata]))))
    return correctness, coverage, fluency, relevance, structure, \
            normcorrectness, normcoverage, normfluency, normrelevance, normstructure
    
def rank_systems(X, raw_X, name):
    submissions = sorted(X.keys(), key=lambda x: np.mean(X[x]), reverse=True)
    ranking = { s:1 for i, s in enumerate(submissions) }

    for i, subA in enumerate(submissions):
        for j, subB in enumerate(submissions[i+1:]):
            s, pvalue = ranksums(X[subA], X[subB])
            if pvalue < 0.05:
                ranking[subB] = ranking[subA] + 1
            elif ranking[subB] < ranking[submissions[i+1+j-1]] :
                ranking[subB] = ranking[submissions[i+1+j-1]] 

    ranking_ = {}
    for sub in ranking:
        rank = ranking[sub]
        normmean = np.mean(X[sub])
        mean = np.mean(raw_X[sub])
        ranking_[sub] = { 'Ranking': int(rank), name + ' (Z.)': round(normmean, 3), name: round(mean, 3) }

    return ranking_

correctness, coverage, fluency, relevance, structure, \
      normcorrectness, normcoverage, normfluency, normrelevance, normstructure = parse(data, normdata)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


### All Data


In [10]:
pd.DataFrame(rank_systems(normcorrectness, correctness, 'Correctness')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Correctness (Z.),Correctness
Amazon_AI_(Shanghai),1.0,0.248,93.531
Baseline-FORGE2017,2.0,0.13,90.138
Baseline-FORGE2020,1.0,0.19,91.794
bt5,1.0,0.224,93.583
cuni-ufal,1.0,0.161,91.587
CycleGT,2.0,0.071,89.846
DANGNT-SGU,1.0,0.179,92.489
FBConvAI,1.0,0.206,92.7
Huawei_Noahs_Ark_Lab,3.0,-0.389,80.76
NILC,3.0,-0.589,76.702


In [11]:
pd.DataFrame(rank_systems(normcoverage, coverage, 'Coverage')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Coverage (Z.),Coverage
Amazon_AI_(Shanghai),1.0,0.222,94.393
Baseline-FORGE2017,2.0,0.127,92.066
Baseline-FORGE2020,1.0,0.17,92.892
bt5,2.0,0.161,93.836
cuni-ufal,2.0,0.155,93.291
CycleGT,3.0,0.023,91.231
DANGNT-SGU,1.0,0.259,95.315
FBConvAI,2.0,0.151,93.169
Huawei_Noahs_Ark_Lab,4.0,-0.31,84.743
NILC,4.0,-0.477,81.605


In [12]:
pd.DataFrame(rank_systems(normfluency, fluency, 'Fluency')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Fluency (Z.),Fluency
Amazon_AI_(Shanghai),1.0,0.326,90.286
Baseline-FORGE2017,4.0,-0.143,80.941
Baseline-FORGE2020,3.0,0.011,82.43
bt5,2.0,0.218,88.688
cuni-ufal,2.0,0.185,87.642
CycleGT,3.0,0.072,84.82
DANGNT-SGU,4.0,-0.161,78.594
FBConvAI,1.0,0.327,90.837
Huawei_Noahs_Ark_Lab,5.0,-0.369,75.205
NILC,5.0,-0.408,74.851


In [13]:
pd.DataFrame(rank_systems(normrelevance, relevance, 'Relevance')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Relevance (Z.),Relevance
Amazon_AI_(Shanghai),1.0,0.214,95.196
Baseline-FORGE2017,2.0,0.113,92.588
Baseline-FORGE2020,1.0,0.161,93.784
bt5,1.0,0.184,95.22
cuni-ufal,1.0,0.164,94.555
CycleGT,1.0,0.125,93.37
DANGNT-SGU,1.0,0.185,94.856
FBConvAI,2.0,0.117,93.898
Huawei_Noahs_Ark_Lab,3.0,-0.425,85.265
NILC,3.0,-0.499,83.522


In [14]:
pd.DataFrame(rank_systems(normstructure, structure, 'Text Structure')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Text Structure (Z.),Text Structure
Amazon_AI_(Shanghai),1.0,0.305,92.951
Baseline-FORGE2017,2.0,-0.064,85.737
Baseline-FORGE2020,2.0,0.039,87.4
bt5,1.0,0.236,91.914
cuni-ufal,1.0,0.208,90.752
CycleGT,2.0,0.045,87.879
DANGNT-SGU,3.0,-0.203,83.501
FBConvAI,1.0,0.319,93.089
Huawei_Noahs_Ark_Lab,3.0,-0.373,80.219
NILC,3.0,-0.402,80.463


### Domain Type 1

In [15]:
fnormdata = [w for w in normdata if w['domain'] == 'type1']
fdata = [w for w in data if w['domain'] == 'type1']
correctness, coverage, fluency, relevance, structure, \
  normcorrectness, normcoverage, normfluency, normrelevance, normstructure = parse(fdata, fnormdata)


In [16]:
pd.DataFrame(rank_systems(normcorrectness, correctness, 'Correctness')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Correctness (Z.),Correctness
Amazon_AI_(Shanghai),1.0,0.295,93.691
Baseline-FORGE2017,2.0,0.042,87.608
Baseline-FORGE2020,1.0,0.226,93.593
bt5,1.0,0.312,94.843
cuni-ufal,1.0,0.273,93.886
CycleGT,2.0,0.062,88.633
DANGNT-SGU,2.0,0.14,90.772
FBConvAI,1.0,0.261,93.472
Huawei_Noahs_Ark_Lab,2.0,0.08,90.269
NILC,1.0,0.212,93.071


In [17]:
pd.DataFrame(rank_systems(normcoverage, coverage, 'Coverage')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Coverage (Z.),Coverage
Amazon_AI_(Shanghai),1.0,0.258,94.09
Baseline-FORGE2017,2.0,0.065,90.253
Baseline-FORGE2020,1.0,0.28,95.296
bt5,1.0,0.196,94.46
cuni-ufal,1.0,0.257,94.941
CycleGT,3.0,-0.137,88.386
DANGNT-SGU,1.0,0.239,94.367
FBConvAI,1.0,0.178,93.543
Huawei_Noahs_Ark_Lab,2.0,0.101,92.173
NILC,1.0,0.225,94.448


In [18]:
pd.DataFrame(rank_systems(normfluency, fluency, 'Fluency')).T.sort_index(axis=0, key=lambda x: x.str.lower())

Unnamed: 0,Ranking,Fluency (Z.),Fluency
Amazon_AI_(Shanghai),1.0,0.308,87.75
Baseline-FORGE2017,3.0,-0.406,75.037
Baseline-FORGE2020,2.0,0.03,82.664
bt5,1.0,0.28,89.892
cuni-ufal,1.0,0.281,89.454
CycleGT,2.0,-0.036,83.287
DANGNT-SGU,3.0,-0.159,79.559
FBConvAI,1.0,0.358,91.654
Huawei_Noahs_Ark_Lab,2.0,0.064,85.111
NILC,2.0,0.155,87.306


In [19]:
pd.DataFrame(rank_systems(normrelevance, relevance, 'Relevance')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Relevance (Z.),Relevance
Amazon_AI_(Shanghai),1.0,0.17,93.586
Baseline-FORGE2017,2.0,-0.043,89.568
Baseline-FORGE2020,1.0,0.153,94.568
bt5,1.0,0.222,95.167
cuni-ufal,1.0,0.203,94.87
CycleGT,1.0,0.125,92.12
DANGNT-SGU,1.0,0.164,93.596
FBConvAI,2.0,0.112,93.111
Huawei_Noahs_Ark_Lab,2.0,0.011,92.222
NILC,1.0,0.266,96.269


In [20]:
pd.DataFrame(rank_systems(normstructure, structure, 'Text Structure')).T.sort_index(axis=0, key=lambda x: x.str.lower())
    

Unnamed: 0,Ranking,Text Structure (Z.),Text Structure
Amazon_AI_(Shanghai),1.0,0.293,91.154
Baseline-FORGE2017,3.0,-0.16,82.892
Baseline-FORGE2020,2.0,0.074,87.04
bt5,1.0,0.264,91.846
cuni-ufal,1.0,0.263,91.429
CycleGT,3.0,-0.121,84.262
DANGNT-SGU,3.0,-0.132,84.691
FBConvAI,1.0,0.326,92.966
Huawei_Noahs_Ark_Lab,2.0,0.067,88.38
NILC,1.0,0.212,91.225


### Domain Type 2

In [21]:
fnormdata = [w for w in normdata if w['domain'] == 'type2']
fdata = [w for w in data if w['domain'] == 'type2']
correctness, coverage, fluency, relevance, structure, \
  normcorrectness, normcoverage, normfluency, normrelevance, normstructure = parse(fdata, fnormdata)


In [22]:
pd.DataFrame(rank_systems(normcorrectness, correctness, 'Correctness')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Correctness (Z.),Correctness
Amazon_AI_(Shanghai),1.0,0.293,94.703
Baseline-FORGE2017,1.0,0.317,91.302
Baseline-FORGE2020,1.0,0.199,92.635
bt5,1.0,0.154,93.239
cuni-ufal,1.0,0.149,91.086
CycleGT,1.0,0.171,92.541
DANGNT-SGU,1.0,0.16,93.459
FBConvAI,1.0,0.201,94.0
Huawei_Noahs_Ark_Lab,2.0,-0.242,84.126
NILC,2.0,-0.563,78.315


In [23]:
pd.DataFrame(rank_systems(normcoverage, coverage, 'Coverage')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Coverage (Z.),Coverage
Amazon_AI_(Shanghai),1.0,0.291,95.532
Baseline-FORGE2017,1.0,0.196,92.207
Baseline-FORGE2020,1.0,0.161,93.36
bt5,1.0,0.158,93.734
cuni-ufal,1.0,0.206,93.937
CycleGT,2.0,0.094,92.703
DANGNT-SGU,1.0,0.23,95.329
FBConvAI,2.0,0.139,93.536
Huawei_Noahs_Ark_Lab,2.0,-0.259,85.041
NILC,3.0,-0.343,84.896


In [24]:
pd.DataFrame(rank_systems(normfluency, fluency, 'Fluency')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Fluency (Z.),Fluency
Amazon_AI_(Shanghai),1.0,0.452,93.365
Baseline-FORGE2017,2.0,0.039,83.604
Baseline-FORGE2020,2.0,0.025,82.126
bt5,1.0,0.318,89.595
cuni-ufal,2.0,0.098,86.559
CycleGT,1.0,0.286,89.189
DANGNT-SGU,2.0,-0.116,78.599
FBConvAI,1.0,0.365,91.599
Huawei_Noahs_Ark_Lab,2.0,-0.221,79.315
NILC,3.0,-0.492,74.36


In [25]:
pd.DataFrame(rank_systems(normrelevance, relevance, 'Relevance')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Relevance (Z.),Relevance
Amazon_AI_(Shanghai),1.0,0.272,96.329
Baseline-FORGE2017,1.0,0.266,93.797
Baseline-FORGE2020,1.0,0.271,96.099
bt5,2.0,0.146,95.351
cuni-ufal,1.0,0.213,94.995
CycleGT,1.0,0.181,95.198
DANGNT-SGU,1.0,0.249,96.658
FBConvAI,2.0,0.169,95.644
Huawei_Noahs_Ark_Lab,3.0,-0.366,85.559
NILC,3.0,-0.299,88.23


In [26]:
pd.DataFrame(rank_systems(normstructure, structure, 'Text Structure')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Text Structure (Z.),Text Structure
Amazon_AI_(Shanghai),1.0,0.348,94.288
Baseline-FORGE2017,2.0,0.003,85.644
Baseline-FORGE2020,2.0,-0.011,88.243
bt5,1.0,0.263,91.766
cuni-ufal,1.0,0.314,93.243
CycleGT,1.0,0.233,92.036
DANGNT-SGU,2.0,-0.245,81.977
FBConvAI,1.0,0.334,94.405
Huawei_Noahs_Ark_Lab,2.0,-0.204,83.383
NILC,3.0,-0.629,78.55


### Domain Type 3

In [27]:
fnormdata = [w for w in normdata if w['domain'] == 'type3']
fdata = [w for w in data if w['domain'] == 'type3']
correctness, coverage, fluency, relevance, structure, \
  normcorrectness, normcoverage, normfluency, normrelevance, normstructure = parse(fdata, fnormdata)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [28]:
pd.DataFrame(rank_systems(normcorrectness, correctness, 'Correctness')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Correctness (Z.),Correctness
Amazon_AI_(Shanghai),1.0,0.201,92.933
Baseline-FORGE2017,2.0,0.105,91.213
Baseline-FORGE2020,1.0,0.163,90.32
bt5,1.0,0.2,92.948
cuni-ufal,2.0,0.096,90.374
CycleGT,2.0,0.035,89.452
DANGNT-SGU,1.0,0.21,93.142
FBConvAI,1.0,0.173,91.669
Huawei_Noahs_Ark_Lab,3.0,-0.743,73.427
NILC,4.0,-1.098,65.856


In [29]:
pd.DataFrame(rank_systems(normcoverage, coverage, 'Coverage')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Coverage (Z.),Coverage
Amazon_AI_(Shanghai),1.0,0.17,94.098
Baseline-FORGE2017,2.0,0.137,93.13
Baseline-FORGE2020,2.0,0.106,91.201
bt5,2.0,0.14,93.492
cuni-ufal,2.0,0.069,91.992
CycleGT,2.0,0.092,92.372
DANGNT-SGU,1.0,0.284,95.897
FBConvAI,2.0,0.14,92.78
Huawei_Noahs_Ark_Lab,4.0,-0.586,80.004
NILC,5.0,-0.97,72.234


In [30]:
pd.DataFrame(rank_systems(normfluency, fluency, 'Fluency')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Fluency (Z.),Fluency
Amazon_AI_(Shanghai),1.0,0.284,90.55
Baseline-FORGE2017,2.0,-0.057,83.473
Baseline-FORGE2020,2.0,-0.007,82.414
bt5,2.0,0.137,87.556
cuni-ufal,1.0,0.163,86.979
CycleGT,2.0,0.048,83.914
DANGNT-SGU,3.0,-0.182,77.992
FBConvAI,1.0,0.293,90.006
Huawei_Noahs_Ark_Lab,4.0,-0.7,67.308
NILC,4.0,-0.721,67.33


In [31]:
pd.DataFrame(rank_systems(normrelevance, relevance, 'Relevance')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Relevance (Z.),Relevance
Amazon_AI_(Shanghai),1.0,0.215,95.713
Baseline-FORGE2017,2.0,0.145,93.948
Baseline-FORGE2020,2.0,0.12,92.312
bt5,1.0,0.177,95.197
cuni-ufal,2.0,0.119,94.172
CycleGT,2.0,0.102,93.368
DANGNT-SGU,1.0,0.172,94.872
FBConvAI,2.0,0.098,93.644
Huawei_Noahs_Ark_Lab,3.0,-0.721,80.822
NILC,4.0,-1.06,73.609


In [32]:
pd.DataFrame(rank_systems(normstructure, structure, 'Text Structure')).T.sort_index(axis=0, key=lambda x: x.str.lower())


Unnamed: 0,Ranking,Text Structure (Z.),Text Structure
Amazon_AI_(Shanghai),1.0,0.295,93.498
Baseline-FORGE2017,2.0,-0.032,87.542
Baseline-FORGE2020,2.0,0.039,87.264
bt5,1.0,0.207,92.019
cuni-ufal,2.0,0.129,89.272
CycleGT,2.0,0.069,88.356
DANGNT-SGU,3.0,-0.229,83.41
FBConvAI,1.0,0.308,92.605
Huawei_Noahs_Ark_Lab,4.0,-0.718,73.808
NILC,4.0,-0.685,74.598
