# Load Dependencies

In [4]:
# Fetch the WebNLG GenerationEval evaluation scripts into ./GenerationEval.
!git clone https://github.com/WebNLG/GenerationEval.git

Cloning into 'GenerationEval'...
remote: Enumerating objects: 279, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (141/141), done.[K
remote: Total 279 (delta 171), reused 235 (delta 135), pack-reused 0[K
Receiving objects: 100% (279/279), 2.80 MiB | 20.80 MiB/s, done.
Resolving deltas: 100% (171/171), done.


In [5]:
import os

# Enter the cloned repository so the relative paths used by later cells resolve.
# Guarded so that re-running this cell is a no-op instead of failing (or descending
# into a nested directory) once the working directory is already the checkout.
if os.path.basename(os.getcwd()) != 'GenerationEval':
  os.chdir('GenerationEval')

In [6]:
# Run the repository's dependency installer; the relative path assumes the working
# directory is the GenerationEval checkout (set by the os.chdir cell above).
!./install_dependencies.sh

Collecting nltk==3.5
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |▎                               | 10kB 17.8MB/s eta 0:00:01[K     |▌                               | 20kB 25.3MB/s eta 0:00:01[K     |▊                               | 30kB 26.5MB/s eta 0:00:01[K     |█                               | 40kB 22.3MB/s eta 0:00:01[K     |█▏                              | 51kB 24.6MB/s eta 0:00:01[K     |█▍                              | 61kB 20.8MB/s eta 0:00:01[K     |█▋                              | 71kB 20.8MB/s eta 0:00:01[K     |█▉                              | 81kB 18.7MB/s eta 0:00:01[K     |██                              | 92kB 18.0MB/s eta 0:00:01[K     |██▎                             | 102kB 18.4MB/s eta 0:00:01[K     |██▌                             | 112kB 18.4MB/s eta 0:00:01[K     |██▊                             | 122kB 18.4MB/s eta 0:00:01

In [9]:
# Put the BLEURT directories shipped under metrics/ on the import path
# (presumably required by the repo's evaluation code — TODO confirm).
import sys
sys.path.append('metrics/bleurt/bleurt')
sys.path.append('metrics/bleurt')
import copy
import codecs
# NOTE: `eval` here is a module found on the import path (presumably GenerationEval's
# eval.py, since the working directory is the checkout); it shadows the builtin eval().
import eval
import json
import numpy as np
import os
import pandas as pd
from scipy.stats import ranksums

import nltk
# Download the 'punkt' tokenizer data for NLTK.
nltk.download('punkt')

# Input locations, relative to the current working directory
# (the GenerationEval checkout after the os.chdir cell above).
HUMANEVAL_PATH = '../../../human-evaluation/eval-scripts/ru/russian_data.json'
REFERENCES_PATH = '../../../human-evaluation/eval-scripts/ru/references.json'
SUBMISSIONS_PATH = '../../../../submissions/rdf2text/ru'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Files

In [10]:
# READ FILES
# Open both inputs through context managers so the handles are closed deterministically,
# and decode explicitly as UTF-8 rather than relying on the platform default encoding.
with open(REFERENCES_PATH, encoding='utf-8') as f:
  rdfs = json.load(f)

with open(HUMANEVAL_PATH, encoding='utf-8') as f:
  humaneval_rows = json.load(f)

# Keep only rows that belong to actual submissions: rows whose submission_id is
# 'WebNLG-2020-reference' are excluded from the analysis.
data = [w for w in humaneval_rows if w['submission_id'] != 'WebNLG-2020-reference']

In [12]:
# Attach to every human-evaluation row the system output ('hyp') and the reference
# texts ('refs') it should be scored against.

submission_ids = sorted({w['submission_id'] for w in data})
sample_ids = sorted({w['sample_id'] for w in data}, key=int)

# File names a submission may use for its primary output, in order of preference.
HYPOTHESIS_FILENAMES = ('primary.ru', 'primary.ru.tsv', 'primary.ru.txt')

# submission_id -> list of output lines (line k holds the output for sample k+1).
hypothesis = {}
for submission_id in submission_ids:
  candidates = [os.path.join(SUBMISSIONS_PATH, submission_id, name)
                for name in HYPOTHESIS_FILENAMES]
  # First existing candidate wins; fall back to the last one so that a submission
  # with no output file still fails loudly with FileNotFoundError on open().
  path = next((p for p in candidates if os.path.exists(p)), candidates[-1])

  # Decode explicitly as UTF-8 instead of relying on the platform default.
  with open(path, encoding='utf-8') as f:
    hypothesis[submission_id] = f.read().split('\n')

# Index the reference entries once by their first key (the sample id) instead of
# scanning the whole list for every row. setdefault keeps the first entry when an
# id occurs more than once, matching a first-match linear search.
entries_by_sample_id = {}
for entry in rdfs['entries']:
  first_key = next(iter(entry), None)
  if first_key is not None:
    entries_by_sample_id.setdefault(first_key, entry)

for row in data:
  submission_id, sample_id = row['submission_id'], row['sample_id']
  entry = entries_by_sample_id[sample_id]

  # sample ids are 1-based, hypothesis lines are 0-based.
  row['hyp'] = hypothesis[submission_id][int(sample_id)-1]
  row['refs'] = [w['lex'] for w in entry[sample_id]['lexicalisations']]

# BLEU, ChrF++, TER

In [13]:
# BLEU, CHRF++, TER
# Score every row individually: the hypothesis and its references are written to the
# files read by eval.run, and the resulting scores are stored back on the row.
NUM_REFS = 7
SEGMENT_METRICS = ('bleu', 'bleu_nltk', 'ter', 'chrf++')

# Loop-invariant: make sure the output directory exists once, up front.
os.makedirs('references', exist_ok=True)

for i, row in enumerate(data):
  if i % 100 == 0:
    print('Progress: ', round(i / len(data), 2))
  hyp, references = row['hyp'], row['refs']
  with codecs.open('hypothesis', 'w', 'utf-8') as f:
    f.write(hyp)

  # Exactly NUM_REFS reference files are written per row; missing references are
  # padded with the placeholder text 'EMPTY'.
  for j in range(NUM_REFS):
    with codecs.open('references/reference' + str(j), 'w', 'utf-8') as f:
      f.write(references[j] if j < len(references) else 'EMPTY')

  try:
    results = eval.run(refs_path='references/reference', hyps_path='hypothesis', num_refs=NUM_REFS, lng='ru', metrics='bleu,ter,chrf++')
    scores = {metric: results[metric] for metric in SEGMENT_METRICS}
  except Exception as err:
    # Best effort: keep going, but report which row failed and why instead of
    # swallowing everything (a bare except would also trap KeyboardInterrupt).
    # NOTE(review): 0 is the worst BLEU/chrF++ but the best TER, so failed rows are
    # scored inconsistently — consider excluding them from the correlation instead.
    scores = dict.fromkeys(SEGMENT_METRICS, 0)
    print('ERROR', row['submission_id'], row['sample_id'], repr(err))

  row.update(scores)

# Persist the scores; the context manager guarantees the file is flushed and closed.
with open('final.json', 'w') as f:
  json.dump(data, f)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COM

# METEOR

In [14]:
# METEOR
NUM_REFS = 7

hyps = []
linear_references = []
for i, row in enumerate(data):
  hyp, references = row['hyp'], row['refs']
  hyps.append(hyp)

  for j in range(NUM_REFS):
    if j < len(references):
      linear_references.append(references[j])
    else:
      linear_references.append('')

if not os.path.exists('references'):
  os.mkdir('references')
  
with codecs.open('hypothesis', 'w', 'utf-8') as f:
  f.write('\n'.join(hyps))

with codecs.open('references/references', 'w', 'utf-8') as f:
  f.write('\n'.join(linear_references))

!java -Xmx5G -jar metrics/meteor-1.5/meteor-1.5.jar hypothesis references/references -l ru -norm -r 7 > out

meteor = []
with open('out') as f:
  doc = f.read().split('\n')
  for row in doc:
    if 'Segment' in row:
      score = float(row.split('\t')[-1])
      meteor.append(score)

for i, row in enumerate(data):
  data[i]['meteor'] = meteor[i]

json.dump(data, open('final.json', 'w'))

# BERTScore

In [15]:
!pip3 install transformers==3.5.1

Collecting transformers==3.5.1
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 28.6 MB/s eta 0:00:01[K     |▌                               | 20 kB 33.3 MB/s eta 0:00:01[K     |▊                               | 30 kB 28.2 MB/s eta 0:00:01[K     |█                               | 40 kB 31.7 MB/s eta 0:00:01[K     |█▎                              | 51 kB 28.6 MB/s eta 0:00:01[K     |█▌                              | 61 kB 31.5 MB/s eta 0:00:01[K     |█▊                              | 71 kB 19.4 MB/s eta 0:00:01[K     |██                              | 81 kB 20.4 MB/s eta 0:00:01[K     |██▎                             | 92 kB 18.9 MB/s eta 0:00:01[K     |██▌                             | 102 kB 18.9 MB/s eta 0:00:01[K     |██▊                             | 112 kB 18.9 MB/s eta 0:00:01[K     |███                             | 122 kB 18.9 MB/s eta 0:00:01[K     |███▎                            | 133 k

In [17]:
# BERTScore
# Score every hypothesis against its list of references and store precision,
# recall and F1 on the corresponding row.
from bert_score import score

NUM_REFS = 7

hyps = [row['hyp'] for row in data]
references = [row['refs'] for row in data]

# score() returns three aligned sequences (precision, recall, F1), one value per hypothesis.
P, R, F1 = (list(values) for values in score(hyps, references, lang='ru'))

for i, row in enumerate(data):
  # Convert to plain Python floats before storing them on the row.
  row.update({
    'bert_precision': float(P[i]),
    'bert_recall': float(R[i]),
    'bert_f1': float(F1[i]),
  })


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




AttributeError: ignored

# Correlation

In [18]:
# Pairwise Pearson correlation between the automatic metrics and the human ratings.
from scipy.stats import pearsonr
metrics = ['bleu_nltk', 'meteor', 'chrf++', 'ter', 'bert_f1',
           'Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure']

# Fail fast with an explicit message when a metric was never computed (e.g. because
# an earlier scoring cell raised), instead of a bare KeyError half-way through.
missing = [m for m in metrics if any(m not in w for w in data)]
if missing:
  raise KeyError('metrics missing from data (did an earlier cell fail?): ' + ', '.join(missing))

# Extract each metric's column once rather than rebuilding it for every pair.
columns = {m: [w[m] for w in data] for m in metrics}

for i, m1 in enumerate(metrics):
  for m2 in metrics[i+1:]:
    # c is the correlation coefficient, p the associated p-value.
    c, p = pearsonr(columns[m1], columns[m2])
    print(m1, 'x', m2, ':', round(c, 2), round(p, 2))

bleu_nltk x meteor : -0.25 0.0
bleu_nltk x chrf++ : -0.24 0.0
bleu_nltk x ter : -0.33 0.0
bleu_nltk x Correctness : -0.02 0.56
bleu_nltk x DataCoverage : 0.06 0.09
bleu_nltk x Fluency : 0.11 0.0
bleu_nltk x Relevance : -0.02 0.59
bleu_nltk x TextStructure : 0.07 0.06
meteor x chrf++ : 0.41 0.0
meteor x ter : 0.02 0.57
meteor x Correctness : -0.02 0.59
meteor x DataCoverage : -0.02 0.55
meteor x Fluency : -0.04 0.24
meteor x Relevance : 0.01 0.73
meteor x TextStructure : -0.05 0.19
chrf++ x ter : 0.04 0.29
chrf++ x Correctness : -0.09 0.01
chrf++ x DataCoverage : 0.01 0.75
chrf++ x Fluency : -0.13 0.0
chrf++ x Relevance : -0.03 0.37
chrf++ x TextStructure : -0.14 0.0
ter x Correctness : 0.08 0.03
ter x DataCoverage : 0.08 0.03
ter x Fluency : -0.04 0.29
ter x Relevance : 0.04 0.3
ter x TextStructure : 0.02 0.62
Correctness x DataCoverage : 0.5 0.0
Correctness x Fluency : 0.44 0.0
Correctness x Relevance : 0.56 0.0
Correctness x TextStructure : 0.43 0.0
DataCoverage x Fluency : 0.33 0.0
