Run this notebook with your predictions to generate your BLEU, CHRF, SacreBLEU, BERTScore scores.

In [1]:
import jieba
import json
import torch
from bert_score import score
from rouge_chinese import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PREDICTION_FILE = '10k-transformer-scratch.json'
# PREDICTION_FILE = '10k-rnn-baseline-spm-bi.json'
PREDICTION_FILE = '230k-transformer-bert.json'
REFERENCE_FILE = 'iwslt2017-en-zh-test.zh'

In [9]:
def get_report():
    predictions = []
    reference = []

    with open(f'./predictions/{PREDICTION_FILE}', 'r') as f:
        raw = f.read()
        pdict = json.loads(raw)
        if "predicted" in pdict:
            predictions = [*predictions, *pdict['predicted']]

    with open(f'../tokenisation/data/{REFERENCE_FILE}', 'r') as f:
        reference = [*reference, *(f.readlines())]

    assert len(predictions) == len(reference), \
        'Received a wrong number of predictions. ' + \
        'Ensure that you have generated predictions for the whole test set. \n\n' + \
        f'Predictions Length: {len(predictions)}, Expected: {len(reference)}' 

    predictions = list(map(lambda s: s or '⁇', predictions))

    refs = [[r] for r in reference]
    sys = predictions

    # BLEU-4 (1/2/3/4)
    bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4)
    bleu = bleu.corpus_score(sys, refs)
    print('BLEU-4:')
    print(bleu)
    print('-' * 50)

    # CHRF
    print("CHRF")
    chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
    print(chrf.corpus_score(sys, refs))
    print(50*'-')

    # CHRF++
    print("CHRF++")
    chrf = CHRF(word_order=2, beta=0, eps_smoothing=False)
    print(chrf.corpus_score(sys, refs))
    print(50*'-')

    # TER
    print("TER")
    ter = TER(asian_support=True, normalized=True)
    print(ter.corpus_score(sys, refs))
    print(50*'-')

    # Rouge
    print("Rouge")

    def get_tok(sent):
        return ' '.join(jieba.lcut(sent))

    rouge = Rouge()
    scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=True)
    print(json.dumps(scores, indent=2))
    print(50*'-')


    # BERTScore
    print("BERT")

    P, R, F1 = score(predictions, reference, lang='zh') # default model for zh is bert-base-chinese
    print(f'Precision: {P.mean().item()} | Recall: {R.mean().item()} | F1: {F1.mean().item()}') # Precision, Recall and F1

In [10]:
get_report()

BLEU-4:
BLEU = 24.60 100.0/57.1/15.4/4.2 (BP = 1.000 ratio = 1.000 hyp_len = 15 ref_len = 15)
--------------------------------------------------
CHRF
chrF0 = 20.11
--------------------------------------------------
CHRF++
chrF0++ = 15.09
--------------------------------------------------
TER
TER = 41.90
--------------------------------------------------
Rouge
{
  "rouge-1": {
    "r": 0.20487647385045615,
    "p": 0.3473601641298524,
    "f": 0.2499311787478755
  },
  "rouge-2": {
    "r": 0.0334416581210893,
    "p": 0.05322732074366437,
    "f": 0.03970221581262693
  },
  "rouge-l": {
    "r": 0.18791112899427534,
    "p": 0.2872260627331141,
    "f": 0.22032848392283083
  }
}
--------------------------------------------------
BERT
Precision: 0.6814401745796204 | Recall: 0.614592432975769 | F1: 0.6455613970756531
