In [10]:
import json
import os

import evaluate
import transformers

In [19]:
# Locations of the input corpora, all derived from a single root folder.
# Local run: keep the line below. On Colab, comment it out ...
root_path = '..'
# ... and enable this Drive-mounted root instead.
# root_path = './drive/MyDrive/HandsOn-NLP/'
data_path = '/'.join((root_path, 'data'))
french_texts_path = '/'.join((data_path, 'chia_criteria_french'))
chat_gpt_translations_path = '/'.join((data_path, 'chia_criteria_french_chat_gpt'))

In [20]:
# List the French translation files.
# os.listdir gives no ordering guarantee, so the names are sorted to keep the
# sentence order identical across runs. Sub-directories (e.g. notebook
# checkpoint folders) are skipped because the reading loop can only open
# regular files.
french_texts = sorted(
    name for name in os.listdir(french_texts_path)
    if os.path.isfile(os.path.join(french_texts_path, name))
)

In [29]:
# Collect every usable sentence from the French files into one flat list.
# A line is kept when it has at least 3 and fewer than 500 whitespace-separated
# words; the lower bound also discards blank lines. Kept lines are stored
# unmodified (trailing newline included).
all_sentences = []

for file_name in french_texts:
    # Explicit encoding so accented characters decode the same way on every
    # platform (assumes the corpus files are UTF-8 — TODO confirm).
    with open(f'{french_texts_path}/{file_name}', 'r', encoding='utf-8') as f:
        all_sentences.extend(line for line in f if 3 <= len(line.split()) < 500)

In [30]:
# Score all collected sentences with the `perplexity` metric from `evaluate`,
# using the Helsinki-NLP/opus-mt-en-fr checkpoint and no start token.
perplexity = evaluate.load("perplexity", module_type="metric")

results = perplexity.compute(
    predictions=all_sentences,
    model_id='Helsinki-NLP/opus-mt-en-fr',
    add_start_token=False,
)

  0%|          | 0/730 [00:00<?, ?it/s]

In [None]:
# Write the perplexity output to perplexity_results.json under data_path.
# `json` is imported with the other modules in the notebook's first cell, so
# the later save cells do not depend on this one having been run.
with open(f'{data_path}/perplexity_results.json', 'w') as f:
    json.dump(results, f)

**BLEU evaluation**

We are going to use some translations from *ChatGPT* as references to compute the BLEU score of the translations used in our task.

In [None]:
# Build the reference (ChatGPT) and candidate sentence lists used by the
# metrics below. Only files present in the ChatGPT folder are used; the file
# with the same name under french_texts_path supplies the candidates.
def _read_sentences(path, min_words=3, max_words=500):
    """Return the lines of `path` with min_words <= word count < max_words.

    Words are whitespace-separated tokens, so blank lines are dropped too.
    Kept lines are returned unmodified (trailing newline included).
    """
    # Explicit encoding so accented characters decode the same way on every
    # platform (assumes the corpus files are UTF-8 — TODO confirm).
    with open(path, 'r', encoding='utf-8') as f:
        return [line for line in f if min_words <= len(line.split()) < max_words]


# Sorted for a reproducible sentence order; sub-directories are skipped
# because only regular files can be opened.
files_gpt = sorted(
    name for name in os.listdir(chat_gpt_translations_path)
    if os.path.isfile(os.path.join(chat_gpt_translations_path, name))
)

all_sentences_gpt = []
all_sentences_to_evaluate = []
for file_name in files_gpt:
    references = _read_sentences(f'{chat_gpt_translations_path}/{file_name}')
    candidates = _read_sentences(f'{french_texts_path}/{file_name}')
    # Both sides are filtered independently, and the metrics pair sentences by
    # position. A count mismatch in one file would shift every following pair,
    # so stop here and name the offending file.
    if len(references) != len(candidates):
        raise ValueError(
            f'{file_name}: {len(references)} reference sentences but '
            f'{len(candidates)} sentences to evaluate after filtering'
        )
    all_sentences_gpt.extend(references)
    all_sentences_to_evaluate.extend(candidates)

In [None]:
# BLEU of the task translations, scored against the ChatGPT sentences.
bleu = evaluate.load("bleu")
results_bleu = bleu.compute(
    references=all_sentences_gpt,
    predictions=all_sentences_to_evaluate,
)

In [None]:
# Write the BLEU output to bleu_results.json under data_path.
bleu_results_file = f'{data_path}/bleu_results.json'
with open(bleu_results_file, 'w') as f:
    json.dump(results_bleu, f)

**TER (Translation Edit Rate) evaluation**

We will use the same approach as for the BLEU score, with the *ChatGPT* translations as the reference.

In [None]:
# TER of the task translations, scored against the ChatGPT sentences.
ter = evaluate.load("ter")
results_ter = ter.compute(
    references=all_sentences_gpt,
    predictions=all_sentences_to_evaluate,
)

In [None]:
# Write the TER output to ter_results.json under data_path.
ter_results_file = f'{data_path}/ter_results.json'
with open(ter_results_file, 'w') as f:
    json.dump(results_ter, f)

**METEOR (Metric for Evaluation of Translation with Explicit ORdering)**

We will use the same approach as for the BLEU score, with the *ChatGPT* translations as the reference.

In [None]:
# METEOR of the task translations, scored against the ChatGPT sentences.
# The metric id must be "meteor": loading "ter" here would recompute TER and
# store it under the METEOR name.
meteor = evaluate.load("meteor")
results_meteor = meteor.compute(predictions=all_sentences_to_evaluate, references=all_sentences_gpt)

In [None]:
# Write the METEOR output to meteor_results.json under data_path.
meteor_results_file = f'{data_path}/meteor_results.json'
with open(meteor_results_file, 'w') as f:
    json.dump(results_meteor, f)