In [None]:
!pip3 install numpy==1.20.1
!pip3 install pandas==1.2.3
!pip3 install scipy==1.6.2
!pip3 install statsmodels==0.12.2

In [1]:
import copy
import json
import numpy as np
import os
import pandas as pd
from scipy.stats import zscore
from statsmodels.stats.inter_rater import fleiss_kappa
from scipy.stats import ranksums

# Directory that is listed to discover the submissions: every non-hidden,
# non-.json entry is treated as a submission folder containing a `primary.json`.
SYSTEMS_PATH = '../../results/ru'
# JSON file whose 'entries' list provides the category and size of each sample.
REFERENCES_PATH = '../../../references/references-ru.json'

# Parsing the data

In [2]:
# Load the reference entries; the file is closed as soon as it has been parsed.
with open(REFERENCES_PATH, encoding='utf-8') as ref_file:
    rdfs = json.load(ref_file)

# Index the reference entries by their first key (the sample id) once, instead of
# scanning the whole list for every rated sample. If a sample id occurs more than
# once, the first entry wins.
entry_by_sample = {}
for ref_entry in rdfs['entries']:
    entry_by_sample.setdefault(next(iter(ref_entry)), ref_entry)

# One folder per submission; hidden files and stray .json files are skipped.
# Sorting makes the running `id` assignment independent of the directory listing order.
sys_files = sorted(
    w for w in os.listdir(SYSTEMS_PATH)
    if not w.startswith('.') and not w.endswith('.json')
)

# Flatten everything into one record per (submission, sample, worker) rating.
doc_id = 1
data = []
for sys_file in sys_files:
    results_path = os.path.join(SYSTEMS_PATH, sys_file, 'primary.json')
    with open(results_path, encoding='utf-8') as results_file:
        results = json.load(results_file)
    submission_id = sys_file

    for sample_id, ratings_by_worker in results.items():
        # Raises KeyError naming the sample id if the references do not contain it.
        entry = entry_by_sample[sample_id]
        for worker_id, assign in ratings_by_worker.items():
            inp = {
                'id': doc_id,
                'sample_id': sample_id,
                'submission_id': submission_id,
                'worker_id': worker_id,
                'category': entry[sample_id]['category'],
                'size': entry[sample_id]['size'],
            }
            # The worker's answers (the rating fields) are merged into the record.
            inp.update(assign)
            data.append(inp)
            doc_id += 1

# Inter-rater Agreement
## Fleiss' Kappa

Discretize the ratings into 5 categories.

In [3]:
n_cat, max_range = 5, 100  # number of categories, upper bound used to scale the ratings

criteria = ['Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure']

# Copy every rating record and replace each criterion by its bin.
# For a rating in [0, max_range] the bin is zero-based: 0 .. n_cat - 1.
data_discretized = []
for row in data:
    row_ = copy.copy(row)
    for criterion in criteria:
        row_[criterion] = int((n_cat * row[criterion]) / (max_range + 1))
    data_discretized.append(row_)

data_discretized = sorted(data_discretized, key=lambda x: x['id'])

Computing the Fleiss' Kappa agreements

In [4]:
# One table row per rated item, i.e. per (submission, sample) pair.
assignments = sorted(set((w['submission_id'], w['sample_id']) for w in data_discretized))
row_index = {assignment: i for i, assignment in enumerate(assignments)}

correctness = np.zeros((len(assignments), n_cat))
coverage = np.zeros((len(assignments), n_cat))
fluency = np.zeros((len(assignments), n_cat))
relevance = np.zeros((len(assignments), n_cat))
structure = np.zeros((len(assignments), n_cat))

# Count, for every item, how many workers chose each category.
# The discretized ratings are zero-based (0 .. n_cat - 1), so they index the
# columns directly; subtracting 1 would send category 0 to the last column
# through negative indexing.
for rating in data_discretized:
    i = row_index[(rating['submission_id'], rating['sample_id'])]
    correctness[i, rating['Correctness']] += 1
    coverage[i, rating['DataCoverage']] += 1
    fluency[i, rating['Fluency']] += 1
    relevance[i, rating['Relevance']] += 1
    structure[i, rating['TextStructure']] += 1

# NOTE(review): fleiss_kappa presumably expects the same number of raters for
# every item -- confirm against the statsmodels documentation.
pd.DataFrame({"Fleiss' Kappa": {
    'Correctness': fleiss_kappa(correctness),
    'Data Coverage': fleiss_kappa(coverage),
    'Fluency': fleiss_kappa(fluency),
    'Relevance': fleiss_kappa(relevance),
    'Text Structure': fleiss_kappa(structure),
}}).round(3)

Unnamed: 0,Fleiss' Kappa
Correctness,0.244
Data Coverage,0.435
Fluency,0.156
Relevance,0.125
Text Structure,0.132


# Human Evaluation

Results of the human evaluation for the participating systems according to original ratings of correctness, data coverage, fluency, relevance and text structure.

In [5]:
df = pd.DataFrame(data)

# Select the rating columns with a list: indexing a groupby with a bare tuple of
# column names is deprecated in pandas (it triggers a FutureWarning).
submissions = df.groupby("submission_id")[["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]]
# Mean and standard deviation per submission, best mean correctness first.
submissions.agg(['mean', 'std']).sort_values(by=('Correctness', 'mean'), ascending=False).round(3)

  submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]


Unnamed: 0_level_0,Correctness,Correctness,DataCoverage,DataCoverage,Fluency,Fluency,Relevance,Relevance,TextStructure,TextStructure
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
submission_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
bt5,95.594,11.577,95.63,9.961,93.088,14.143,95.385,11.676,95.745,11.601
FBConvAI,90.779,17.901,92.339,17.709,90.248,17.853,93.491,17.44,93.764,14.286
WebNLG-2020-reference,90.63,18.217,94.0,14.474,89.021,20.07,93.636,14.852,92.082,17.564
cuni-ufal,90.382,20.7,93.155,16.98,92.921,13.779,93.306,17.177,96.073,11.017
med,88.585,21.181,82.23,21.431,88.252,21.491,92.224,16.161,91.309,18.479
Huawei_Noahs_Ark_Lab,87.033,23.697,86.448,21.397,85.679,23.897,91.761,19.421,89.515,21.019
OSU_Neural_NLG,84.83,25.781,82.836,23.884,88.558,21.273,90.433,20.299,92.958,17.266
Baseline-FORGE2020,80.83,25.715,93.191,17.387,84.691,21.115,91.294,19.648,87.645,21.338


# Human Evaluation (Z-Scores)

Results of the human evaluation for the participating systems according to normalized z-scores for correctness, data coverage, fluency, relevance and text structure.

In [6]:
criteria = ['Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure']

# Group the ratings by worker in a single pass; row order within a worker is kept.
rows_by_worker = {}
for row in data:
    rows_by_worker.setdefault(row['worker_id'], []).append(row)

normdata = []
# Sorted so that the order of `normdata` does not depend on set iteration order.
for worker_id in sorted(rows_by_worker):
    fdata = rows_by_worker[worker_id]

    # Standardise each criterion over all ratings given by this worker.
    # zscore yields NaN when a worker's ratings have zero variance; later cells
    # map those NaNs to 0 with np.nan_to_num.
    zscores = {criterion: zscore([w[criterion] for w in fdata]) for criterion in criteria}

    for i, row in enumerate(fdata):
        row_ = copy.copy(row)
        for criterion in criteria:
            row_[criterion] = zscores[criterion][i]
        normdata.append(row_)

df = pd.DataFrame(normdata)

# Select the rating columns with a list: indexing a groupby with a bare tuple of
# column names is deprecated in pandas (it triggers a FutureWarning).
submissions = df.groupby("submission_id")[["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]]
submissions.agg(['mean', 'std']).sort_values(by=('Correctness', 'mean'), ascending=False).round(3)

  submissions = df.groupby("submission_id")["Correctness", "DataCoverage", "Fluency", "Relevance", "TextStructure"]


Unnamed: 0_level_0,Correctness,Correctness,DataCoverage,DataCoverage,Fluency,Fluency,Relevance,Relevance,TextStructure,TextStructure
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
submission_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
bt5,0.34,0.586,0.312,0.534,0.232,0.613,0.174,0.696,0.219,0.658
WebNLG-2020-reference,0.109,0.794,0.23,0.707,0.022,0.96,0.065,0.814,-0.005,0.969
cuni-ufal,0.101,0.951,0.204,0.763,0.213,0.691,0.077,0.891,0.218,0.69
FBConvAI,0.08,0.833,0.133,0.88,0.063,0.837,0.027,0.967,0.079,0.808
med,0.021,1.009,-0.47,1.202,-0.06,1.221,-0.022,1.001,-0.077,1.195
Huawei_Noahs_Ark_Lab,-0.084,1.111,-0.189,1.114,-0.174,1.198,-0.06,1.11,-0.183,1.211
OSU_Neural_NLG,-0.181,1.179,-0.422,1.325,-0.05,1.104,-0.182,1.242,0.019,0.998
Baseline-FORGE2020,-0.387,1.213,0.2,0.87,-0.247,1.098,-0.079,1.134,-0.27,1.192


In [7]:
submission_ids = sorted(list(set([w['submission_id'] for w in normdata])))
sample_ids = sorted(list(set([w['sample_id'] for w in normdata])), key=lambda x: int(x))

# Group the normalised ratings by (submission, sample) once instead of rescanning
# `normdata` for every pair.
ratings_by_item = {}
for w in normdata:
    ratings_by_item.setdefault((w['submission_id'], w['sample_id']), []).append(w)

finaldata = []
for submission_id in submission_ids:
    for sample_id in sample_ids:
        fdata = ratings_by_item.get((submission_id, sample_id), [])

        # Pairs without any rating are left out of the export.
        if len(fdata) > 0:
            item = {
                'submission_id': submission_id,
                'size': fdata[0]['size'],
                'sample_id': sample_id,
                'category': fdata[0]['category'],
            }
            # Mean z-score over the workers, with NaN z-scores counted as 0.
            for criterion in ('Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure'):
                item[criterion] = float(np.nan_to_num(np.mean(np.nan_to_num([w[criterion] for w in fdata]))))
            finaldata.append(item)

# The context manager guarantees the file is flushed and closed after writing.
with open('../../results/ru/russian_humeval_data_all_teams.json', 'w', encoding='utf-8') as out_file:
    json.dump(finaldata, out_file, separators=(',', ':'), indent=4)


# Statistical Testing

## Wilcoxon rank-sum significance test

In [8]:
def parse(data, normdata):
    """Aggregate worker ratings into per-sample means for every submission.

    Parameters
    ----------
    data : list of dict
        Raw rating records. Each record must provide 'submission_id',
        'sample_id' (a string convertible to int) and the keys 'Correctness',
        'DataCoverage', 'Fluency', 'Relevance' and 'TextStructure'.
    normdata : list of dict
        Records with the same keys holding the normalised (z-score) ratings.
        NaN values are counted as 0 when averaging.

    Returns
    -------
    tuple of 10 dict
        ``(correctness, coverage, fluency, relevance, structure,
        normcorrectness, normcoverage, normfluency, normrelevance,
        normstructure)``. Each dict maps a submission id to the list of
        per-sample means, ordered by numeric sample id.

    Notes
    -----
    A (submission, sample) pair without any rating contributes no value.
    Averaging an empty list would yield NaN, which would then propagate into
    every mean and rank-sum test computed from the list. This matches the
    JSON export in this notebook, which also leaves unrated pairs out.
    """
    criteria = ('Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure')

    def group_by_item(rows):
        # Map (submission_id, sample_id) -> records, preserving record order.
        grouped = {}
        for row in rows:
            grouped.setdefault((row['submission_id'], row['sample_id']), []).append(row)
        return grouped

    # Group once up front instead of rescanning both lists for every pair.
    raw_by_item = group_by_item(data)
    norm_by_item = group_by_item(normdata)

    submission_ids = sorted(set(w['submission_id'] for w in data))
    sample_ids = sorted(set(w['sample_id'] for w in data), key=int)

    raw = {criterion: {sub: [] for sub in submission_ids} for criterion in criteria}
    norm = {criterion: {sub: [] for sub in submission_ids} for criterion in criteria}

    for submission_id in submission_ids:
        for sample_id in sample_ids:
            fdata = raw_by_item.get((submission_id, sample_id))
            if fdata:
                for criterion in criteria:
                    raw[criterion][submission_id].append(np.mean([w[criterion] for w in fdata]))

            fnormdata = norm_by_item.get((submission_id, sample_id))
            if fnormdata:
                # Average the z-scores of the workers who rated this sample,
                # with NaNs set to zero.
                for criterion in criteria:
                    norm[criterion][submission_id].append(
                        np.mean(np.nan_to_num([w[criterion] for w in fnormdata])))

    return tuple(raw[criterion] for criterion in criteria) + \
        tuple(norm[criterion] for criterion in criteria)
    
def rank_systems(X, raw_X, name):
    """Rank submissions using pairwise Wilcoxon rank-sum tests.

    Submissions are ordered by the mean of their values in ``X`` (highest
    first) and all start at rank 1. Every submission is then compared with
    each submission that follows it in that order:

    * if the rank-sum p-value is below 0.05, the later submission is given the
      rank of the earlier one plus 1;
    * otherwise the later submission is raised, if needed, to the rank of the
      submission immediately before it in the ordering, so ranks never
      decrease along the ordering.

    Parameters
    ----------
    X : dict
        Submission id -> list of normalised scores; used for ordering and testing.
    raw_X : dict
        Submission id -> list of raw scores; only its mean is reported.
    name : str
        Label of the criterion, used for the names of the score columns.

    Returns
    -------
    dict
        Submission id -> ``{'Ranking': int, name + ' (Z.)': mean of X,
        name: mean of raw_X}``, with both means rounded to 3 decimals.
    """
    ordered = sorted(X, key=lambda sub: np.mean(X[sub]), reverse=True)
    ranking = dict.fromkeys(ordered, 1)

    for pos, better in enumerate(ordered):
        for offset, worse in enumerate(ordered[pos + 1:]):
            # `worse` sits at index pos + 1 + offset, so its predecessor in the
            # ordering is at index pos + offset.
            predecessor = ordered[pos + offset]
            _, pvalue = ranksums(X[better], X[worse])
            if pvalue < 0.05:
                ranking[worse] = ranking[better] + 1
            elif ranking[worse] < ranking[predecessor]:
                ranking[worse] = ranking[predecessor]

    return {
        sub: {
            'Ranking': int(rank),
            name + ' (Z.)': round(np.mean(X[sub]), 3),
            name: round(np.mean(raw_X[sub]), 3),
        }
        for sub, rank in ranking.items()
    }

# Per-sample mean ratings for every submission: first the five raw criteria,
# then the same five criteria computed from the normalised ratings.
(
    correctness, coverage, fluency, relevance, structure,
    normcorrectness, normcoverage, normfluency, normrelevance, normstructure,
) = parse(data, normdata)


### All Data


In [9]:
# Correctness ranking, one row per submission, sorted case-insensitively by name.
(
    pd.DataFrame(rank_systems(normcorrectness, correctness, 'Correctness'))
    .T
    .sort_index(axis=0, key=lambda labels: labels.str.lower())
)

Unnamed: 0,Ranking,Correctness (Z.),Correctness
Baseline-FORGE2020,4.0,-0.387,80.83
bt5,1.0,0.34,95.594
cuni-ufal,2.0,0.101,90.382
FBConvAI,2.0,0.08,90.779
Huawei_Noahs_Ark_Lab,2.0,-0.084,87.033
med,2.0,0.021,88.585
OSU_Neural_NLG,3.0,-0.181,84.83
WebNLG-2020-reference,2.0,0.109,90.63


In [10]:
# Data coverage ranking, one row per submission, sorted case-insensitively by name.
(
    pd.DataFrame(rank_systems(normcoverage, coverage, 'Coverage'))
    .T
    .sort_index(axis=0, key=lambda labels: labels.str.lower())
)

Unnamed: 0,Ranking,Coverage (Z.),Coverage
Baseline-FORGE2020,1.0,0.2,93.191
bt5,1.0,0.312,95.63
cuni-ufal,1.0,0.203,93.155
FBConvAI,1.0,0.133,92.339
Huawei_Noahs_Ark_Lab,2.0,-0.189,86.448
med,3.0,-0.467,82.23
OSU_Neural_NLG,2.0,-0.422,82.836
WebNLG-2020-reference,1.0,0.23,94.0


In [11]:
# Fluency ranking, one row per submission, sorted case-insensitively by name.
(
    pd.DataFrame(rank_systems(normfluency, fluency, 'Fluency'))
    .T
    .sort_index(axis=0, key=lambda labels: labels.str.lower())
)

Unnamed: 0,Ranking,Fluency (Z.),Fluency
Baseline-FORGE2020,3.0,-0.247,84.691
bt5,1.0,0.232,93.088
cuni-ufal,1.0,0.213,92.921
FBConvAI,2.0,0.063,90.248
Huawei_Noahs_Ark_Lab,3.0,-0.174,85.679
med,2.0,-0.06,88.252
OSU_Neural_NLG,2.0,-0.05,88.558
WebNLG-2020-reference,2.0,0.022,89.021


In [12]:
# Relevance ranking, one row per submission, sorted case-insensitively by name.
(
    pd.DataFrame(rank_systems(normrelevance, relevance, 'Relevance'))
    .T
    .sort_index(axis=0, key=lambda labels: labels.str.lower())
)


Unnamed: 0,Ranking,Relevance (Z.),Relevance
Baseline-FORGE2020,2.0,-0.079,91.294
bt5,1.0,0.174,95.385
cuni-ufal,1.0,0.077,93.306
FBConvAI,2.0,0.027,93.491
Huawei_Noahs_Ark_Lab,2.0,-0.06,91.761
med,2.0,-0.022,92.224
OSU_Neural_NLG,2.0,-0.182,90.433
WebNLG-2020-reference,2.0,0.065,93.636


In [13]:
# Text structure ranking, one row per submission, sorted case-insensitively by name.
(
    pd.DataFrame(rank_systems(normstructure, structure, 'Text Structure'))
    .T
    .sort_index(axis=0, key=lambda labels: labels.str.lower())
)
    

Unnamed: 0,Ranking,Text Structure (Z.),Text Structure
Baseline-FORGE2020,3.0,-0.27,87.645
bt5,1.0,0.219,95.745
cuni-ufal,1.0,0.218,96.073
FBConvAI,2.0,0.079,93.764
Huawei_Noahs_Ark_Lab,3.0,-0.183,89.515
med,2.0,-0.077,91.309
OSU_Neural_NLG,2.0,0.019,92.958
WebNLG-2020-reference,2.0,-0.005,92.082
