# Load Dependencies

In [4]:
# Clone the WebNLG GenerationEval repository into the current working directory.
!git clone https://github.com/WebNLG/GenerationEval.git

Cloning into 'GenerationEval'...
remote: Enumerating objects: 279, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (141/141), done.[K
remote: Total 279 (delta 171), reused 235 (delta 135), pack-reused 0[K
Receiving objects: 100% (279/279), 2.80 MiB | 19.53 MiB/s, done.
Resolving deltas: 100% (171/171), done.


In [5]:
import os

# Enter the cloned repository so that the relative paths used by later cells
# (install script, metrics/, references/) resolve against it.
# Guarded so that re-running this cell does not attempt to descend into
# GenerationEval/GenerationEval and fail.
if os.path.basename(os.getcwd()) != 'GenerationEval':
  os.chdir('GenerationEval')

In [6]:
# Run the installer script via a relative path, so the working directory must
# already be the directory that contains install_dependencies.sh.
!./install_dependencies.sh

Collecting nltk==3.5
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |▎                               | 10kB 25.0MB/s eta 0:00:01[K     |▌                               | 20kB 16.6MB/s eta 0:00:01[K     |▊                               | 30kB 14.1MB/s eta 0:00:01[K     |█                               | 40kB 13.2MB/s eta 0:00:01[K     |█▏                              | 51kB 8.2MB/s eta 0:00:01[K     |█▍                              | 61kB 7.8MB/s eta 0:00:01[K     |█▋                              | 71kB 8.7MB/s eta 0:00:01[K     |█▉                              | 81kB 9.6MB/s eta 0:00:01[K     |██                              | 92kB 9.3MB/s eta 0:00:01[K     |██▎                             | 102kB 8.0MB/s eta 0:00:01[K     |██▌                             | 112kB 8.0MB/s eta 0:00:01[K     |██▊                             | 122kB 8.0MB/s eta 0:00:01[K    

In [7]:
# Extend the module search path with the BLEURT directories (relative to the
# current working directory) before the imports below are resolved.
import sys
sys.path.append('metrics/bleurt/bleurt')
sys.path.append('metrics/bleurt')
import copy
import codecs
# NOTE: binds the name `eval` to a module (presumably GenerationEval's eval.py,
# found via the working directory), shadowing Python's built-in eval() here.
import eval
import json
import numpy as np
# NOTE: `os` is imported again here; harmless, it is already loaded.
import os
import pandas as pd
from scipy.stats import zscore
from statsmodels.stats.inter_rater import fleiss_kappa
from scipy.stats import ranksums

# Download NLTK's 'punkt' tokenizer data.
import nltk
nltk.download('punkt')

# Input locations, all relative to the current working directory.
# HUMANEVAL_PATH: JSON with the human-evaluation records.
HUMANEVAL_PATH = '../../../human-evaluation/eval-scripts/en/english_data.json'
# REFERENCES_PATH: JSON with the reference lexicalisations per sample.
REFERENCES_PATH = '../../../human-evaluation/eval-scripts/en/references.json'
# SUBMISSIONS_PATH: directory holding one sub-directory per submission id.
SUBMISSIONS_PATH = '../../../../submissions/rdf2text/en'

  import pandas.util.testing as tm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Reading Files

In [8]:
# READ FILES
# Parse the reference lexicalisations and the human-evaluation records.
# Context managers close each handle as soon as the JSON has been parsed, and
# the encoding is explicit instead of depending on the platform locale.
with open(REFERENCES_PATH, encoding='utf-8') as f:
  rdfs = json.load(f)
with open(HUMANEVAL_PATH, encoding='utf-8') as f:
  data = json.load(f)

# Drop the rows that belong to the 'WebNLG-2020-reference' submission.
data = [w for w in data if w['submission_id'] != 'WebNLG-2020-reference']

In [9]:
# Distinct submissions and samples that occur in the human-evaluation records.
# Sample ids are ordered numerically rather than lexicographically.
submission_ids = sorted({w['submission_id'] for w in data})
sample_ids = sorted({w['sample_id'] for w in data}, key=int)

# Read every submission's primary output file, split into lines.
# The file name varies between submissions, so the known variants are tried in
# order; if none exists the last variant is kept so that open() raises for it.
hypothesis = {}
for submission_id in submission_ids:
  candidates = [os.path.join(SUBMISSIONS_PATH, submission_id, name)
                for name in ('primary.en', 'primary.en.tsv', 'primary.en.txt')]
  path = next((p for p in candidates if os.path.exists(p)), candidates[-1])

  # Explicit UTF-8 so decoding does not depend on the platform locale.
  with open(path, encoding='utf-8') as f:
    hypothesis[submission_id] = f.read().split('\n')

# Index the reference entries by their first key (the sample id) once, instead
# of scanning the whole list for every row. setdefault keeps the first entry
# seen for an id, matching a first-match linear search.
entries_by_id = {}
for entry in rdfs['entries']:
  entries_by_id.setdefault(list(entry.keys())[0], entry)

# Attach the system output ('hyp') and the reference texts ('refs') to each row.
for row in data:
  submission_id, sample_id = row['submission_id'], row['sample_id']
  entry = entries_by_id[sample_id]  # KeyError if the sample has no reference entry

  # The sample id is used as a 1-based line number into the submission file.
  row['hyp'] = hypothesis[submission_id][int(sample_id)-1]
  row['refs'] = [w['lex'] for w in entry[sample_id]['lexicalisations']]

# BLEU, ChrF++, TER

In [10]:
# BLEU, CHRF++, TER
NUM_REFS = 5

# The scratch directory for reference files does not change between rows, so
# create it once instead of checking on every iteration.
os.makedirs('references', exist_ok=True)

for i, row in enumerate(data):
  if i % 100 == 0:
    print('Progress: ', round(i / len(data), 2))
  hyp, references = row['hyp'], row['refs']
  with codecs.open('hypothesis', 'w', 'utf-8') as f:
    f.write(hyp)

  # eval.run is called with num_refs=NUM_REFS, so always write that many
  # reference files; slots without a real reference get the text 'EMPTY'.
  for j in range(NUM_REFS):
    with codecs.open('references/reference' + str(j), 'w', 'utf-8') as f:
      f.write(references[j] if j < len(references) else 'EMPTY')

  try:
    results = eval.run(refs_path='references/reference', hyps_path='hypothesis', num_refs=NUM_REFS, lng='en', metrics='bleu,ter,chrf++')
    row['bleu'] = results['bleu']
    row['bleu_nltk'] = results['bleu_nltk']
    row['ter'] = results['ter']
    row['chrf++'] = results['chrf++']
  except Exception as e:
    # Best-effort: keep going when a single row fails, but only for ordinary
    # errors (a bare `except:` would also swallow KeyboardInterrupt) and report
    # which row failed and why.
    # NOTE(review): 0 is the fallback for every metric, TER included - confirm
    # that this sentinel is what the downstream correlation should see.
    row['bleu'] = 0
    row['bleu_nltk'] = 0
    row['ter'] = 0
    row['chrf++'] = 0
    print('ERROR', i, repr(e))

# Persist the scores; the context manager guarantees the file is flushed and closed.
with open('final.json', 'w') as f:
  json.dump(data, f)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COM

# METEOR

In [11]:
# METEOR
NUM_REFS = 5

hyps = []
linear_references = []
for i, row in enumerate(data):
  hyp, references = row['hyp'], row['refs']
  hyps.append(hyp)

  for j in range(NUM_REFS):
    if j < len(references):
      linear_references.append(references[j])
    else:
      linear_references.append('')

if not os.path.exists('references'):
  os.mkdir('references')
  
with codecs.open('hypothesis', 'w', 'utf-8') as f:
  f.write('\n'.join(hyps))

with codecs.open('references/references', 'w', 'utf-8') as f:
  f.write('\n'.join(linear_references))

!java -Xmx5G -jar metrics/meteor-1.5/meteor-1.5.jar hypothesis references/references -l en -norm -r 5 > out

meteor = []
with open('out') as f:
  doc = f.read().split('\n')
  for row in doc:
    if 'Segment' in row:
      score = float(row.split('\t')[-1])
      meteor.append(score)

for i, row in enumerate(data):
  data[i]['meteor'] = meteor[i]

json.dump(data, open('final.json', 'w'))

# BERT

In [18]:
# Install a pinned transformers release.
# NOTE(review): presumably needed by the bert_score cell that follows - confirm.
# Consider `%pip install` so the package lands in the running kernel's environment.
!pip3 install transformers==3.5.1

Collecting transformers==3.5.1
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 7.6 MB/s 
Collecting tokenizers==0.9.3
  Downloading tokenizers-0.9.3-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 28.6 MB/s 
Collecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 58.4 MB/s 
Installing collected packages: tokenizers, sentencepiece, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.1
    Uninstalling tokenizers-0.10.1:
      Successfully uninstalled tokenizers-0.10.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.4.2
    Uninstalling transformers-4.4.2:
      Successfully uninstalled transformers-4.4.2
Successfully installed sentencepiece-0.1.91 tokenizers-0.9.3 transformers-3.5.1


In [19]:
# BERT
from bert_score import score

NUM_REFS = 5

# Gather, in row order, each hypothesis and its list of references.
hyps, references = [], []
for w in data:
  hyps.append(w['hyp'])
  references.append(w['refs'])

P, R, F1 = score(hyps, references, lang='en')
P, R, F1 = list(P), list(R), list(F1)

# Convert each score to a plain Python float before storing it on its row.
for i in range(len(data)):
  entry = data[i]
  entry['bert_precision'] = float(P[i])
  entry['bert_recall'] = float(R[i])
  entry['bert_f1'] = float(F1[i])


AttributeError: ignored

# BLEURT

In [16]:
# BLEURT
from metrics.bleurt.bleurt import score as bleurt_score
NUM_REFS = 5

# Pair every hypothesis with each of its (at most NUM_REFS) references.
# Only real references are scored, so a padding string can never be picked as
# the best match; the 'EMPTY' placeholder is kept solely for rows without any
# reference, so that every row still receives a score.
refs, cands, group_sizes = [], [], []
for row in data:
  row_refs = row['refs'][:NUM_REFS] or ['EMPTY']
  refs.extend(row_refs)
  cands.extend([row['hyp']] * len(row_refs))
  group_sizes.append(len(row_refs))

checkpoint = "metrics/bleurt/bleurt-base-128"
scorer = bleurt_score.BleurtScorer(checkpoint)
pair_scores = scorer.score(refs, cands)

# Collapse the flat pair scores back to one value per row: the best score over
# that row's references.
scores, start = [], 0
for size in group_sizes:
  scores.append(max(pair_scores[start:start + size]))
  start += size

for row, best in zip(data, scores):
  row['bleurt'] = best

INFO:tensorflow:Reading checkpoint metrics/bleurt/bleurt-base-128.


INFO:tensorflow:Reading checkpoint metrics/bleurt/bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Performs basic checks...


INFO:tensorflow:Performs basic checks...


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Loading model...


INFO:tensorflow:Loading model...


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


# Correlation

In [22]:
from scipy.stats import pearsonr
metrics = ['bleu_nltk', 'meteor', 'chrf++', 'ter', 'bert_f1', 'bleurt',
           'Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure']

# A metric can only be correlated when every row carries it; an earlier cell
# that failed leaves its keys absent. Report and skip such metrics instead of
# aborting the whole table with a KeyError.
available = [m for m in metrics if all(m in w for w in data)]
missing = [m for m in metrics if m not in available]
if missing:
  print('Skipping metrics missing from data:', ', '.join(missing))

# Extract each metric's column once rather than rebuilding it for every pair.
columns = {m: [w[m] for w in data] for m in available}

# Pearson correlation (coefficient, p-value) for every unordered pair of metrics.
for i, m1 in enumerate(available):
  for m2 in available[i+1:]:
    c, p = pearsonr(columns[m1], columns[m2])
    print(m1, 'x', m2, ':', round(c, 2), round(p, 2))

bleu_nltk x meteor : 0.74 0.0
bleu_nltk x chrf++ : 0.86 0.0
bleu_nltk x ter : -0.83 0.0
bleu_nltk x bleurt : 0.63 0.0
bleu_nltk x Correctness : 0.36 0.0
bleu_nltk x DataCoverage : 0.28 0.0
bleu_nltk x Fluency : 0.4 0.0
bleu_nltk x Relevance : 0.3 0.0
bleu_nltk x TextStructure : 0.37 0.0
meteor x chrf++ : 0.81 0.0
meteor x ter : -0.72 0.0
meteor x bleurt : 0.65 0.0
meteor x Correctness : 0.32 0.0
meteor x DataCoverage : 0.3 0.0
meteor x Fluency : 0.34 0.0
meteor x Relevance : 0.26 0.0
meteor x TextStructure : 0.31 0.0
chrf++ x ter : -0.81 0.0
chrf++ x bleurt : 0.71 0.0
chrf++ x Correctness : 0.43 0.0
chrf++ x DataCoverage : 0.4 0.0
chrf++ x Fluency : 0.41 0.0
chrf++ x Relevance : 0.36 0.0
chrf++ x TextStructure : 0.39 0.0
ter x bleurt : -0.73 0.0
ter x Correctness : -0.39 0.0
ter x DataCoverage : -0.31 0.0
ter x Fluency : -0.44 0.0
ter x Relevance : -0.34 0.0
ter x TextStructure : -0.4 0.0
bleurt x Correctness : 0.57 0.0
bleurt x DataCoverage : 0.51 0.0
bleurt x Fluency : 0.55 0.0
bleur