Run this notebook with your predictions to generate your BLEU, CHRF, SacreBLEU, BERTScore scores.

In [1]:
import jieba
import json
import torch
from bert_score import score
from rouge_chinese import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PREDICTION_FILE = '10k-transformer-scratch.json'
REFERENCE_FILE = 'iwslt2017-en-zh-test.zh'

In [3]:
predictions = []
reference = []

with open(f'./predictions/{PREDICTION_FILE}', 'r') as f:
    raw = f.read()
    pdict = json.loads(raw)
    if "predicted" in pdict:
        predictions = [*predictions, *pdict['predicted']]

with open(f'../tokenisation/sentencepiece_custom/{REFERENCE_FILE}', 'r') as f:
    reference = [*reference, *(f.readlines())]

assert len(predictions) == len(reference), \
    'Received a wrong number of predictions. ' + \
    'Ensure that you have generated predictions for the whole test set. \n\n' + \
    f'Predictions Length: {len(predictions)}, Expected: {len(reference)}' 

refs = [
    predictions
]

sys = reference


In [9]:
# BLEU-4 (1/2/3/4)
bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4)
bleu = bleu.corpus_score(sys, refs)
print('BLEU-4:')
print(bleu)
print('-' * 50)

# CHRF
print("CHRF")
chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# CHRF++
print("CHRF++")
chrf = CHRF(word_order=2, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# TER
print("TER")
ter = TER(asian_support=True, normalized=True)
print(ter.corpus_score(sys, refs))
print(50*'-')

# Rouge
print("Rouge")

def get_tok(sent):
    return ' '.join(jieba.lcut(sent))

rouge = Rouge()
scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=True)
print(json.dumps(scores, indent=2))
print(50*'-')


# BERTScore
print("BERT")

P, R, F1 = score(predictions, reference, lang='zh') # default model for zh is bert-base-chinese
print(f'Precision: {P.mean().item()} | Recall: {R.mean().item()} | F1: {F1.mean().item()}') # Precision, Recall and F1

BLEU-4:
BLEU = 0.00 0.4/0.0/0.0/0.0 (BP = 1.000 ratio = 1.595 hyp_len = 245077 ref_len = 153638)
--------------------------------------------------
CHRF
chrF0 = 0.06
--------------------------------------------------
CHRF++
chrF0++ = 0.07
--------------------------------------------------
TER
TER = 159.30
--------------------------------------------------
Rouge
{
  "rouge-1": {
    "r": 0.008580685836678557,
    "p": 0.04643817990408226,
    "f": 0.013119041696869171
  },
  "rouge-2": {
    "r": 0.0,
    "p": 0.0,
    "f": 0.0
  },
  "rouge-l": {
    "r": 0.007944229065617996,
    "p": 0.007436969845068488,
    "f": 0.007347664149318669
  }
}
BERT
Precision: tensor([0.6356, 0.5078, 0.5312,  ..., 0.4325, 0.4538, 0.6079]) | Recall: tensor([0.4865, 0.4416, 0.5072,  ..., 0.4064, 0.4372, 0.5179]) | F1: tensor([0.5512, 0.4724, 0.5189,  ..., 0.4190, 0.4453, 0.5593])
