Run this notebook with your predictions to generate your BLEU, CHRF, SacreBLEU, BERTScore scores.

In [81]:
import jieba
import json
import torch
from bert_score import score
from rouge_chinese import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER

In [82]:
PREDICTION_FILE = '10k-rnn-baseline-ft-bi.json' # rnn-baseline-spm-uni
REFERENCE_FILE = 'iwslt2017-en-zh-test.zh'

In [83]:
predictions = []
reference = []

with open(f'./predictions/{PREDICTION_FILE}', 'r', encoding='utf-8') as f:
    raw = f.read()
    pdict = json.loads(raw)
    if "predicted" in pdict:
        predictions = [*predictions, *pdict['predicted']]

with open(f'../tokenisation/sentencepiece_custom/{REFERENCE_FILE}', 'r',encoding='utf-8') as f:
    reference = [*reference, *(f.readlines())]

assert len(predictions) == len(reference), \
    'Received a wrong number of predictions. ' + \
    'Ensure that you have generated predictions for the whole test set. \n\n' + \
    f'Predictions Length: {len(predictions)}, Expected: {len(reference)}' 

refs = [
    predictions
]

sys = reference


In [84]:
sys = [str(s) if not isinstance(s, str) else s for s in sys]
refs = [[str(r) if not isinstance(r, str) else r for r in ref] for ref in refs]

## Overall Mean

In [85]:
# BLEU-4 (1/2/3/4)
bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4)
bleu = bleu.corpus_score(sys, refs)
print('BLEU-4:')
print(bleu)
print('-' * 50)

# CHRF
print("CHRF")
chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# CHRF++
print("CHRF++")
chrf = CHRF(word_order=2, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# TER
print("TER")
ter = TER(asian_support=True, normalized=True)
print(ter.corpus_score(sys, refs))
print(50*'-')

# Rouge
print("Rouge")

def get_tok(sent):
    sent = str(sent)
    return ' '.join(jieba.lcut(sent))

rouge = Rouge()
scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=True)
print(json.dumps(scores, indent=2))
print(50*'-')


# BERTScore
print("BERT")
# run this if your strings got corrupted 
# predictions = [str(p) for p in predictions]
# reference = [str(r) for r in reference]
P, R, F1 = score(predictions, reference, lang='zh') # default model for zh is bert-base-chinese
print(f'Precision: {P.mean().item()} | Recall: {R.mean().item()} | F1: {F1.mean().item()}') # Precision, Recall and F1

BLEU-4:
BLEU = 0.00 3.7/0.0/0.0/0.0 (BP = 1.000 ratio = 13.895 hyp_len = 245077 ref_len = 17638)
--------------------------------------------------
CHRF
chrF0 = 0.61
--------------------------------------------------
CHRF++
chrF0++ = 0.46
--------------------------------------------------
TER
TER = 1336.18
--------------------------------------------------
Rouge
{
  "rouge-1": {
    "r": 0.04634896887117157,
    "p": 0.6718329629196397,
    "f": 0.08543896200682037
  },
  "rouge-2": {
    "r": 4.442661086220743e-05,
    "p": 0.000935781962802667,
    "f": 8.391199518847937e-05
  },
  "rouge-l": {
    "r": 0.05770398644967713,
    "p": 0.49870111514017207,
    "f": 0.10075674648956627
  }
}
--------------------------------------------------
BERT
Precision: 0.5123193860054016 | Recall: 0.4146314859390259 | F1: 0.45802053809165955


## Save Individual Scores

In [86]:
# BLEU
bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4) # effective_order=True
bleu_scores = [bleu.sentence_score(hypothesis, [reference]).score for hypothesis, reference in zip(sys, refs[0])]

# CHRF
chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
chrf_scores = [chrf.sentence_score(hypothesis, [reference]).score for hypothesis, reference in zip(sys, refs[0])]

# CHRF++
chrf_plus = CHRF(word_order=2, beta=0, eps_smoothing=False)
chrf_plus_scores = [chrf_plus.sentence_score(hypothesis, [reference]).score for hypothesis, reference in zip(sys, refs[0])]

# TER
ter = TER(asian_support=True, normalized=True)
ter_scores = [ter.sentence_score(hypothesis, [reference]).score for hypothesis, reference in zip(sys, refs[0])]

# ROUGE 
rouge = Rouge()
r_scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=False)

# BERTSCORE 
P, R, F1 = score(predictions, reference, lang='zh')
precision_scores = P.tolist()
recall_scores = R.tolist()
f1_scores = F1.tolist()


scores = {
    'BLEU': bleu_scores,
    'CHRF': chrf_scores,
    'CHRF++': chrf_plus_scores,
    'TER': ter_scores,
    'ROUGE':r_scores,
    'BERTSCORE_P' : precision_scores,
    'BERTSCORE_R' : recall_scores,
    'BERTSCORE_F1' : f1_scores
}

with open(f'individual_scores\{PREDICTION_FILE[:-5]}.json', 'w') as f:
    json.dump(scores, f, indent=4)


It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 