Run this notebook with your predictions to generate your BLEU, CHRF, SacreBLEU, BERTScore scores.

In [1]:
import jieba
import json
import torch
from bert_score import score
from rouge_chinese import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PREDICTION_FILE = '10k-transformer-scratch.json'
PREDICTION_FILE = '230k-transformer-scratch.json'
REFERENCE_FILE = 'iwslt2017-en-zh-test.zh'

In [3]:
def get_report():
    predictions = []
    reference = []

    with open(f'./predictions/{PREDICTION_FILE}', 'r') as f:
        raw = f.read()
        pdict = json.loads(raw)
        if "predicted" in pdict:
            predictions = [*predictions, *pdict['predicted']]

    with open(f'../tokenisation/sentencepiece_custom/{REFERENCE_FILE}', 'r') as f:
        reference = [*reference, *(f.readlines())]

    assert len(predictions) == len(reference), \
        'Received a wrong number of predictions. ' + \
        'Ensure that you have generated predictions for the whole test set. \n\n' + \
        f'Predictions Length: {len(predictions)}, Expected: {len(reference)}' 

    refs = [
        predictions
    ]

    sys = reference

    # BLEU-4 (1/2/3/4)
    bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4)
    bleu = bleu.corpus_score(sys, refs)
    print('BLEU-4:')
    print(bleu)
    print('-' * 50)

    # CHRF
    print("CHRF")
    chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
    print(chrf.corpus_score(sys, refs))
    print(50*'-')

    # CHRF++
    print("CHRF++")
    chrf = CHRF(word_order=2, beta=0, eps_smoothing=False)
    print(chrf.corpus_score(sys, refs))
    print(50*'-')

    # TER
    print("TER")
    ter = TER(asian_support=True, normalized=True)
    print(ter.corpus_score(sys, refs))
    print(50*'-')

    # Rouge
    print("Rouge")

    def get_tok(sent):
        return ' '.join(jieba.lcut(sent))

    rouge = Rouge()
    scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=True)
    print(json.dumps(scores, indent=2))
    print(50*'-')


    # BERTScore
    print("BERT")

    P, R, F1 = score(predictions, reference, lang='zh') # default model for zh is bert-base-chinese
    print(f'Precision: {P.mean().item()} | Recall: {R.mean().item()} | F1: {F1.mean().item()}') # Precision, Recall and F1

In [4]:
get_report()

BLEU-4:
BLEU = 12.89 39.5/18.8/8.6/4.3 (BP = 1.000 ratio = 1.227 hyp_len = 245077 ref_len = 199738)
--------------------------------------------------
CHRF
chrF0 = 12.59
--------------------------------------------------
CHRF++
chrF0++ = 9.57
--------------------------------------------------
TER


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/pc/n3qxl__51zq83lxk2mhxs1x00000gn/T/jieba.cache


TER = 89.74
--------------------------------------------------
Rouge


Loading model cost 0.310 seconds.
Prefix dict has been built successfully.


{
  "rouge-1": {
    "r": 0.31928522104761126,
    "p": 0.4030361162215033,
    "f": 0.34884086250448565
  },
  "rouge-2": {
    "r": 0.0946124355671617,
    "p": 0.11718380109144616,
    "f": 0.10186365049152646
  },
  "rouge-l": {
    "r": 0.28955896126228486,
    "p": 0.35421947634495726,
    "f": 0.3101049318388194
  }
}
--------------------------------------------------
BERT
Precision: 0.7506004571914673 | Recall: 0.710690975189209 | F1: 0.729341983795166
