Run this notebook with your predictions to generate your BLEU, CHRF, SacreBLEU, BERTScore scores.

In [14]:
import jieba
import json
import torch
from bert_score import score
from rouge_chinese import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER

In [15]:
PREDICTION_FILE = '10k-rnn-baseline-spm-bi.json'
REFERENCE_FILE = 'iwslt2017-en-zh-test.zh'

In [16]:
predictions = []
reference = []

with open(f'./predictions/{PREDICTION_FILE}', 'r', encoding='utf-8') as f:
    raw = f.read()
    pdict = json.loads(raw)
    if "predicted" in pdict:
        predictions = [*predictions, *pdict['predicted']]

with open(f'../tokenisation/data/{REFERENCE_FILE}', 'r',encoding='utf-8') as f:
    reference = [*reference, *(f.readlines())]

assert len(predictions) == len(reference), \
    'Received a wrong number of predictions. ' + \
    'Ensure that you have generated predictions for the whole test set. \n\n' + \
    f'Predictions Length: {len(predictions)}, Expected: {len(reference)}' 

refs = [[r] for r in reference]
sys = [str(p).rstrip() for p in predictions]

## Corpus Score for BLEU , CHRF , CHRF++ , TER and mean for ROUGE and BERTSCORE

In [17]:
# predicted , reference
sys[0] , refs[0]

('在 过 去 的 年 前 , , , , , , , , , ,   ⁇     ⁇     ⁇',
 ['几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”\n'])

In [18]:
# BLEU-4 (1/2/3/4)
bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4) 
bleu = bleu.corpus_score(sys, refs)  
print('BLEU-4:')
print(bleu)
print('-' * 50)

# CHRF
print("CHRF")
chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# CHRF++
print("CHRF++")
chrf = CHRF(word_order=2, beta=0, eps_smoothing=False)
print(chrf.corpus_score(sys, refs))
print(50*'-')

# TER
print("TER")
ter = TER(asian_support=True, normalized=True)
print(ter.corpus_score(sys, refs))
print(50*'-')

# Rouge
print("Rouge")
def get_tok(sent):
    sent = str(sent)
    return ' '.join(jieba.lcut(sent))
rouge = Rouge()
scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=True)
print(json.dumps(scores, indent=2))
print(50*'-')

# BERTScore
print("BERT")
# run this if your strings got corrupted 
predictions = [str(p) for p in predictions]
reference = [str(r) for r in reference]
P, R, F1 = score(predictions, reference, lang='zh') # default model for zh is bert-base-chinese
print(f'Precision: {P.mean().item()} | Recall: {R.mean().item()} | F1: {F1.mean().item()}') # Precision, Recall and F1

BLEU-4:
BLEU = 20.61 73.7/33.3/11.8/6.2 (BP = 1.000 ratio = 1.000 hyp_len = 19 ref_len = 19)
--------------------------------------------------
CHRF
chrF0 = 10.17
--------------------------------------------------
CHRF++
chrF0++ = 12.50
--------------------------------------------------
TER


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache


TER = 55.87
--------------------------------------------------
Rouge


Loading model cost 0.677 seconds.
Prefix dict has been built successfully.


{
  "rouge-1": {
    "r": 0.07791557855036035,
    "p": 0.21827346056100105,
    "f": 0.10602943437603676
  },
  "rouge-2": {
    "r": 0.001971258989469778,
    "p": 0.0037221272248760826,
    "f": 0.002327314834782995
  },
  "rouge-l": {
    "r": 0.077905867277176,
    "p": 0.09546040670909742,
    "f": 0.08279716574836206
  }
}
--------------------------------------------------
BERT
Precision: 0.5506570339202881 | Recall: 0.5108605027198792 | F1: 0.5291464328765869


## Save Individual Scores

In [20]:
refs = [[r] for r in reference]
sys = [str(p).rstrip() for p in predictions]

In [25]:
# BLEU
bleu = BLEU(smooth_method='exp', tokenize='zh', max_ngram_order=4, effective_order=True)
bleu_scores = [bleu.sentence_score(h, r).score for h, r in zip(sys, refs)]

# CHRF
chrf = CHRF(word_order=0, beta=0, eps_smoothing=False)
chrf_scores = [chrf.sentence_score(h, r).score for h, r in zip(sys, refs)]

# CHRF++
chrf_plus = CHRF(word_order=2, beta=0, eps_smoothing=False)
chrf_plus_scores = [chrf_plus.sentence_score(h, r).score for h, r in zip(sys, refs)]

# TER
ter = TER(asian_support=True, normalized=True)
ter_scores = [ter.sentence_score(h, r).score for h, r in zip(sys, refs)]

# ROUGE 
rouge = Rouge()
r_scores = rouge.get_scores(list(map(get_tok, predictions)), list(map(get_tok, reference)), avg=False)

# BERTSCORE 
P, R, F1 = score(predictions, reference, lang='zh')
precision_scores = P.tolist()
recall_scores = R.tolist()
f1_scores = F1.tolist()


scores = {
    'BLEU': bleu_scores,
    'CHRF': chrf_scores,
    'CHRF++': chrf_plus_scores,
    'TER': ter_scores,
    'ROUGE':r_scores,
    'BERTSCORE_P' : precision_scores,
    'BERTSCORE_R' : recall_scores,
    'BERTSCORE_F1' : f1_scores
}

with open(f'individual_scores\{PREDICTION_FILE[:-5]}.json', 'w') as f:
    json.dump(scores, f, indent=4)


In [23]:
refs[0]

['几年前，在TED大会上， Peter Skillman 介绍了一个设计挑战 叫做“棉花糖挑战”\n']