In [27]:
import pandas as pd
import nltk

nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
import os


class Evaluator:
    """Compute BLEU scores for translation predictions stored in a DataFrame.

    Every scoring method reads two string columns from the frame it is given:
    ``target`` (used as the single reference) and ``translation`` (used as the
    hypothesis). Both are tokenized with ``word_tokenize`` before scoring.
    """

    # Name of the CSV file (no extension) written inside ``save_path``.
    _OUTPUT_FILE_NAME = 'df_prediction_with_BLEU'

    def calculate_bleu_score(self, df_prediction, smoothing_function=None):
        """Add a per-row sentence-level ``BLEU`` column to ``df_prediction``.

        The frame is modified in place and also returned.

        Args:
            df_prediction: DataFrame with ``target`` and ``translation`` columns.
            smoothing_function: Optional callable forwarded to
                ``sentence_bleu`` (e.g. a ``SmoothingFunction`` method). The
                default ``None`` keeps NLTK's unsmoothed behaviour.

        Returns:
            The same DataFrame object, now with a float ``BLEU`` column.

        Raises:
            KeyError: If ``target`` or ``translation`` is missing.
        """
        scores = [
            sentence_bleu(
                [word_tokenize(reference)],
                word_tokenize(hypothesis),
                smoothing_function=smoothing_function,
            )
            for reference, hypothesis in zip(
                df_prediction['target'], df_prediction['translation']
            )
        ]
        # Assign positionally as a float array: writing cell by cell through
        # index labels breaks on duplicate labels, and seeding the column with
        # the integer 0 forces an implicit int -> float upcast.
        df_prediction['BLEU'] = pd.Series(scores, dtype='float64').to_numpy()
        return df_prediction

    def _save_predictions(self, df_prediction, save_path):
        """Write ``df_prediction`` as CSV to ``save_path/df_prediction_with_BLEU``."""
        if save_path:
            # exist_ok avoids the check-then-create race of exists()+makedirs().
            os.makedirs(save_path, exist_ok=True)
        # os.path.join works with or without a trailing separator on save_path;
        # an empty save_path resolves to the current working directory.
        output_file = os.path.join(save_path, self._OUTPUT_FILE_NAME)
        df_prediction.to_csv(output_file, sep=',')

    def bleu_score_from_dataframe(self, dataframe, save_path='/data/', smoothing_function=None):
        """Score a copy of ``dataframe`` and save the result as CSV.

        Args:
            dataframe: DataFrame with ``target`` and ``translation`` columns;
                it is copied, so the caller's frame is left untouched.
            save_path: Directory that receives ``df_prediction_with_BLEU``;
                created if it does not exist.
            smoothing_function: Optional callable forwarded to ``sentence_bleu``.

        Returns:
            The scored copy, including the ``BLEU`` column.
        """
        df_prediction = self.calculate_bleu_score(
            dataframe.copy(), smoothing_function=smoothing_function
        )
        self._save_predictions(df_prediction, save_path)
        return df_prediction

    def bleu_score_from_file_path(self, prediction_file_path, sep=',', encoding='utf-8', save_path='/data/',
                                  smoothing_function=None):
        """Load predictions from a CSV file, score them and save the result.

        Args:
            prediction_file_path: Path of the CSV file to read.
            sep: Field separator of the input file.
            encoding: Text encoding of the input file.
            save_path: Directory that receives ``df_prediction_with_BLEU``;
                created if it does not exist.
            smoothing_function: Optional callable forwarded to ``sentence_bleu``.

        Returns:
            The loaded DataFrame with the added ``BLEU`` column.
        """
        df_prediction = pd.read_csv(prediction_file_path, sep=sep, encoding=encoding)
        df_prediction = self.calculate_bleu_score(
            df_prediction, smoothing_function=smoothing_function
        )
        self._save_predictions(df_prediction, save_path)
        return df_prediction

    def calculate_corpus_bleu(self, df_translation, smoothing_function=None):
        """Return the corpus-level BLEU over all rows of ``df_translation``.

        Args:
            df_translation: DataFrame with ``target`` and ``translation`` columns.
            smoothing_function: Optional callable forwarded to ``corpus_bleu``.

        Returns:
            The value returned by ``corpus_bleu`` for the whole frame.
        """
        # corpus_bleu expects, per hypothesis, a list of tokenized references;
        # each row contributes exactly one reference here.
        list_of_references = [
            [word_tokenize(sentence)] for sentence in df_translation['target'].values
        ]
        hypotheses = [
            word_tokenize(sentence) for sentence in df_translation['translation'].values
        ]
        return corpus_bleu(
            list_of_references, hypotheses, smoothing_function=smoothing_function
        )

    def calculate_mean_bleu(self, df_prediction):
        """Return the arithmetic mean of the existing ``BLEU`` column."""
        return df_prediction.loc[:, 'BLEU'].mean()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Score the Helsinki-NLP predictions file; the scored frame is also written
# to /content/df_prediction_with_BLEU by bleu_score_from_file_path.
predictions_csv_path = '/content/drive/MyDrive/Colab Notebooks/predictions_Helsinki-NLP_10-03-2023 (1).csv'
trans = Evaluator()
df_translation = trans.bleu_score_from_file_path(
    predictions_csv_path,
    save_path='/content/',
)
df_translation.head(2)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,source,target,translation,BLEU
0,"""We now have 4-month-old mice that are non-dia...","""Abbiamo topi di quattro mesi che prima erano ...","""Ora abbiamo topi di 4 mesi che non sono diabe...",3.737747e-78
1,"Dr. Ehud Ur, professor of medicine at Dalhousi...","Lo studio è ancora in fase iniziale, come dich...","Dr. Ehud Ur, professore di medicina presso l'U...",0.2274466


In [30]:
# Corpus-level BLEU over every row of the scored frame.
corpus_level_bleu = trans.calculate_corpus_bleu(df_translation)
corpus_level_bleu

0.2560285427662942

In [31]:
import numpy as np

# Cross-check: the mean of the per-sentence BLEU column computed directly with
# NumPy should equal what Evaluator.calculate_mean_bleu reports.
sentence_bleu_values = df_translation['BLEU'].values
print(np.mean(sentence_bleu_values))

mean_bleu_from_evaluator = trans.calculate_mean_bleu(df_translation)
print(mean_bleu_from_evaluator)

0.22117357052480296
0.22117357052480296
