# Load Dependencies

In [4]:
# Clone the WebNLG GenerationEval repository into the current working directory.
# Later cells change into this checkout, run its install script and call its metric code.
!git clone https://github.com/WebNLG/GenerationEval.git

Cloning into 'GenerationEval'...
remote: Enumerating objects: 279, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (141/141), done.[K
remote: Total 279 (delta 171), reused 235 (delta 135), pack-reused 0[K
Receiving objects: 100% (279/279), 2.80 MiB | 7.76 MiB/s, done.
Resolving deltas: 100% (171/171), done.


In [5]:
import os

# Move into the cloned checkout so that the relative paths used by later cells resolve.
# Guarded so that re-running this cell (when the kernel is already inside the
# checkout) is a no-op instead of failing on a non-existent nested directory.
if os.path.basename(os.getcwd()) != 'GenerationEval':
  os.chdir('GenerationEval')

In [6]:
# Run the installer script shipped with the repository. The path is relative, so this
# relies on the working directory being the GenerationEval checkout.
!./install_dependencies.sh

Collecting nltk==3.5
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.7MB/s 
[?25hCollecting pyter3==0.3
  Downloading https://files.pythonhosted.org/packages/2c/98/a9da2bdf07ef1e27d947fd13e9c9714fcc554ae30aaae92a35e0534b6722/pyter3-0.3-py3-none-any.whl
Collecting razdel==0.5.0
  Downloading https://files.pythonhosted.org/packages/15/2c/664223a3924aa6e70479f7d37220b3a658765b9cfe760b4af7ffdc50d38f/razdel-0.5.0-py3-none-any.whl
Collecting tabulate==0.8.7
  Downloading https://files.pythonhosted.org/packages/c4/f4/770ae9385990f5a19a91431163d262182d3203662ea2b5739d0fcfc080f1/tabulate-0.8.7-py3-none-any.whl
Collecting bert-score==0.3.5
[?25l  Downloading https://files.pythonhosted.org/packages/50/8c/70244d0f351176c9984643e2037edc18b9124491b47ad19191039a18d2bd/bert_score-0.3.5-py3-none-any.whl (52kB)
[K     |████████████████████████████████|

In [15]:
import sys
# Extend the module search path with the BLEURT directories (relative to the working directory).
sys.path.append('metrics/bleurt/bleurt')
sys.path.append('metrics/bleurt')
import copy
import codecs
# NOTE: this binds the name `eval` to a module, hiding Python's builtin eval() for the
# rest of the notebook. Later cells call eval.run(...) on it.
# Presumably this is GenerationEval's eval.py, found via the working directory - confirm.
import eval
import json
import numpy as np
import os
import pandas as pd
from scipy.stats import ranksums

import nltk
# Download the NLTK 'punkt' resource.
nltk.download('punkt')

# Input locations; all are relative to the current working directory.
# HUMANEVAL_PATH: JSON file with the evaluation records (loaded with json.load below).
HUMANEVAL_PATH = '../../../human-evaluation/eval-scripts/ru/russian_data.json'
# REFERENCES_PATH: XML file with the reference texts (parsed with ElementTree below).
REFERENCES_PATH = '../../../human-evaluation/eval-scripts/ru/references.xml'
# SUBMISSIONS_PATH: directory holding one sub-directory per submission id.
SUBMISSIONS_PATH = '../../../../submissions/rdf2text/ru'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Files

In [26]:
# READ FILES
import xml.etree.ElementTree as ET

# Build `lexicalizations`: entry id -> list of reference texts.
# The id is the entry's 'eid' attribute with the substring 'Id' removed, so it can be
# matched against the 'sample_id' values of the evaluation records.
root = ET.parse(REFERENCES_PATH).getroot()
lexicalizations = {}
for entry_xml in root.find('entries').findall('entry'):
  eid = entry_xml.attrib['eid'].replace('Id', '')
  # An element without text yields None; skip those so that later cells, which write
  # every reference to a file, only ever see strings.
  lexicalizations[eid] = [
      lex_xml.text for lex_xml in entry_xml.findall('lex') if lex_xml.text is not None
  ]

# Load the evaluation records. The file is opened in a `with` block so the handle is
# closed deterministically, and decoded explicitly as UTF-8 rather than relying on the
# platform default.
with open(HUMANEVAL_PATH, encoding='utf-8') as f:
  data = json.load(f)

# Drop the rows whose submission id is the reference entry itself.
data = [w for w in data if w['submission_id'] != 'WebNLG-2020-reference']

In [30]:
# Attach to every evaluation record the system output ('hyp') and its references ('refs').

# How many (hypothesis, references) pairs to echo as a sanity check; printing every
# record floods the cell output.
PREVIEW_COUNT = 5

# File names under which a submission's primary output may be stored, tried in order.
CANDIDATE_FILENAMES = ('primary.ru', 'primary.ru.tsv', 'primary.ru.txt')

submission_ids = sorted({w['submission_id'] for w in data})
sample_ids = sorted({w['sample_id'] for w in data}, key=int)

# hypothesis: submission id -> list of lines of that submission's output file.
hypothesis = {}
for submission_id in submission_ids:
  candidates = [
      os.path.join(SUBMISSIONS_PATH, submission_id, name) for name in CANDIDATE_FILENAMES
  ]
  path = next((p for p in candidates if os.path.exists(p)), None)
  if path is None:
    # Fail with the full list of attempted paths instead of an error that only
    # mentions the last candidate.
    raise FileNotFoundError(
        'No primary output found for submission ' + submission_id
        + '; tried: ' + ', '.join(candidates))

  # Decode explicitly as UTF-8: the texts are Russian and must not depend on the
  # platform default encoding.
  with open(path, encoding='utf-8') as f:
    hypothesis[submission_id] = f.read().split('\n')

for i, row in enumerate(data):
  submission_id, sample_id = row['submission_id'], row['sample_id']

  # sample_id is used as a 1-based line number into the submission file.
  row['hyp'] = hypothesis[submission_id][int(sample_id) - 1]
  row['refs'] = lexicalizations[sample_id]

  if i < PREVIEW_COUNT:
    print(row['hyp'])
    print(row['refs'])
    print()

Bandeja paisa, который происходит из колумбийской кухни, содержит авокадо. Он находится в регионе Пайса.
['Часть колумбийской, кухни бандеха паиса - это традиционное блюда региона Пайса. Одним из главных ингредиентов в этом блюде является авокадо.', 'Авокадо - один из ингредиентов в блюде бандеха паиса, которое является блюдом колумбийской кухни. Оно родом из региона Пайса.', 'Авокадо - один из ингредиентов в блюде бандеха паиса, который представляет собой типичную колумбийскую кухню из региона Паиса.']

Стюарт Паркер - менеджер A.F.C. Блэкпул. A.F.C. Блэкпул играет в свои домашние игры в Блэкпуле, а он - за футбольный клуб округа Стокпорт. Консервативная партия является лидером Блэкпула.
['Стюарт Патрик, играющий за Стокпорт Каунти, был тренером ФК "Блэкпул". Клуб также находится в Блэкпуле. Консервативная партия Великобритании является лидером Блэкпула.', 'ФК “Блэкпул” в Блэкпуле управляется Стюартом Паркером, ранее игравшим за ФК "Стокпорт Каунти". Лидером Блэкпула является Консерва

# BLEU, ChrF++, TER

In [31]:
# BLEU, CHRF++, TER
# Scores every record individually: the hypothesis and its references are written to
# scratch files, which eval.run then reads.
NUM_REFS = 7

# Keys copied from the result of eval.run into each record.
SCORE_KEYS = ('bleu', 'bleu_nltk', 'ter', 'chrf++')

# Create the scratch directory once, instead of re-checking it for every record.
os.makedirs('references', exist_ok=True)

for i, row in enumerate(data):
  if i % 100 == 0:
    print('Progress: ', round(i / len(data), 2))
  hyp, references = row['hyp'], row['refs']
  with codecs.open('hypothesis', 'w', 'utf-8') as f:
    f.write(hyp)

  # Always write exactly NUM_REFS files; slots beyond the available references are
  # filled with the placeholder text 'EMPTY'.
  for j in range(NUM_REFS):
    with codecs.open('references/reference' + str(j), 'w', 'utf-8') as f:
      f.write(references[j] if j < len(references) else 'EMPTY')

  try:
    results = eval.run(refs_path='references/reference', hyps_path='hypothesis', num_refs=NUM_REFS, lng='ru', metrics='bleu,ter,chrf++')
    scores = {key: results[key] for key in SCORE_KEYS}
  except Exception as err:
    # Catch Exception rather than using a bare except, so that interrupting the kernel
    # still stops this long loop, and report which record failed and why.
    # NOTE(review): 0 is also stored for 'ter' on failure; if lower TER means better,
    # failed rows look like perfect scores - confirm whether they should be excluded.
    scores = {key: 0 for key in SCORE_KEYS}
    print('ERROR', i, repr(err))
  row.update(scores)

# Persist the annotated records; the `with` block guarantees the file is flushed and closed.
with open('final.json', 'w') as f:
  json.dump(data, f)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COMPUTE TER...
FINISHING TO COMPUTE TER...
STARTING TO PARSE INPUTS...
FINISHING TO PARSE INPUTS...
STARTING TO COMPUTE BLEU...
FINISHING TO COMPUTE BLEU...
STARTING TO COMPUTE CHRF++...
FINISHING TO COMPUTE CHRF++...
STARTING TO COM

# METEOR

In [32]:
# METEOR
NUM_REFS = 7

hyps = []
linear_references = []
for i, row in enumerate(data):
  hyp, references = row['hyp'], row['refs']
  hyps.append(hyp)

  for j in range(NUM_REFS):
    if j < len(references):
      linear_references.append(references[j])
    else:
      linear_references.append('')

if not os.path.exists('references'):
  os.mkdir('references')
  
with codecs.open('hypothesis', 'w', 'utf-8') as f:
  f.write('\n'.join(hyps))

with codecs.open('references/references', 'w', 'utf-8') as f:
  f.write('\n'.join(linear_references))

!java -Xmx5G -jar metrics/meteor-1.5/meteor-1.5.jar hypothesis references/references -l ru -norm -r 7 > out

meteor = []
with open('out') as f:
  doc = f.read().split('\n')
  for row in doc:
    if 'Segment' in row:
      score = float(row.split('\t')[-1])
      meteor.append(score)

for i, row in enumerate(data):
  data[i]['meteor'] = meteor[i]

json.dump(data, open('final.json', 'w'))

# BERT

In [None]:
!pip3 install transformers==3.5.1

Collecting transformers==3.5.1
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 28.6 MB/s eta 0:00:01[K     |▌                               | 20 kB 33.3 MB/s eta 0:00:01[K     |▊                               | 30 kB 28.2 MB/s eta 0:00:01[K     |█                               | 40 kB 31.7 MB/s eta 0:00:01[K     |█▎                              | 51 kB 28.6 MB/s eta 0:00:01[K     |█▌                              | 61 kB 31.5 MB/s eta 0:00:01[K     |█▊                              | 71 kB 19.4 MB/s eta 0:00:01[K     |██                              | 81 kB 20.4 MB/s eta 0:00:01[K     |██▎                             | 92 kB 18.9 MB/s eta 0:00:01[K     |██▌                             | 102 kB 18.9 MB/s eta 0:00:01[K     |██▊                             | 112 kB 18.9 MB/s eta 0:00:01[K     |███                             | 122 kB 18.9 MB/s eta 0:00:01[K     |███▎                            | 133 k

In [33]:
# BERT
from bert_score import score

NUM_REFS = 7

hyps = [w['hyp'] for w in data]
references = [w['refs'] for w in data]
  
P, R, F1 = score(hyps, references, lang='ru')
P, R, F1 = list(P), list(R), list(F1)

for i, row in enumerate(data):
  data[i]['bert_precision'] = float(P[i])
  data[i]['bert_recall'] = float(R[i])
  data[i]['bert_f1'] = float(F1[i])


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




AttributeError: ignored

# Correlation

In [38]:
from scipy.stats import pearsonr

# Record fields to correlate pairwise ('bert_f1' is intentionally left out).
# The first four are the automatic scores added by the cells above; the rest are
# presumably the human-evaluation criteria stored in the records - confirm.
metrics = [
    'bleu_nltk', 'meteor', 'chrf++', 'ter',
    'Correctness', 'DataCoverage', 'Fluency', 'Relevance', 'TextStructure',
]

# Print Pearson's r and its p-value (both rounded to 2 decimals) for every unordered pair.
for i, m1 in enumerate(metrics):
  x = [w[m1] for w in data]
  for m2 in metrics[i + 1:]:
    y = [w[m2] for w in data]
    c, p = pearsonr(x, y)
    print(m1, 'x', m2, ':', round(c, 2), round(p, 2))

bleu_nltk x meteor : 0.86 0.0
bleu_nltk x chrf++ : 0.86 0.0
bleu_nltk x ter : -0.86 0.0
bleu_nltk x Correctness : 0.26 0.0
bleu_nltk x DataCoverage : 0.18 0.0
bleu_nltk x Fluency : 0.22 0.0
bleu_nltk x Relevance : 0.15 0.0
bleu_nltk x TextStructure : 0.23 0.0
meteor x chrf++ : 0.93 0.0
meteor x ter : -0.83 0.0
meteor x Correctness : 0.34 0.0
meteor x DataCoverage : 0.29 0.0
meteor x Fluency : 0.27 0.0
meteor x Relevance : 0.18 0.0
meteor x TextStructure : 0.27 0.0
chrf++ x ter : -0.84 0.0
chrf++ x Correctness : 0.37 0.0
chrf++ x DataCoverage : 0.33 0.0
chrf++ x Fluency : 0.26 0.0
chrf++ x Relevance : 0.19 0.0
chrf++ x TextStructure : 0.26 0.0
ter x Correctness : -0.28 0.0
ter x DataCoverage : -0.21 0.0
ter x Fluency : -0.28 0.0
ter x Relevance : -0.16 0.0
ter x TextStructure : -0.25 0.0
Correctness x DataCoverage : 0.5 0.0
Correctness x Fluency : 0.44 0.0
Correctness x Relevance : 0.56 0.0
Correctness x TextStructure : 0.43 0.0
DataCoverage x Fluency : 0.33 0.0
DataCoverage x Relevance