In [1]:
# Install/upgrade the evaluation dependencies used below: Hugging Face `evaluate`,
# sacreBLEU and Unbabel COMET.
# NOTE(review): -U pulls the latest releases with no version pin, so metric values
# may not be reproducible across runs — consider pinning versions.
!pip install -U evaluate sacrebleu unbabel-comet



In [2]:
from google.colab import drive
import pandas as pd

# Make Google Drive visible inside the Colab runtime, then read the evaluation
# set (source/reference sentence pairs) from it into a DataFrame.
drive.mount('/content/drive')
evaluation_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/evaluation_df.csv',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer


# Checkpoint directory (on Google Drive) holding the fine-tuned M2M100 weights
# together with the tokenizer files.
model_save_path = '/content/drive/MyDrive/models-finetuning-1/finetuned-m2m100-hr-to-nl-mmt/checkpoint-46095'

# Restore the tokenizer and the model from that same directory.
tokenizer = M2M100Tokenizer.from_pretrained(model_save_path)
model = M2M100ForConditionalGeneration.from_pretrained(model_save_path)

print('Model and tokenizer have been loaded successfully.')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model and tokenizer have been loaded successfully.


In [4]:
import torch
# Prefer the GPU when one is available; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Move the weights to the chosen device and switch to evaluation mode. Both calls
# return the module itself, so the cell still ends by displaying the model repr.
model.to(device).eval()

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm

In [5]:
def translate_croatian_to_dutch_batch(texts):
    """Translate a batch of Croatian sentences to Dutch.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        texts: List of Croatian source sentences.

    Returns:
        A list of decoded Dutch translations, one per input sentence and in the
        same order. An empty batch returns an empty list without calling the model.
    """
    # Nothing to translate: skip the tokenizer/model entirely. A plain string is
    # deliberately not treated as "empty" so it is passed through as before.
    if not isinstance(texts, str) and len(texts) == 0:
        return []

    tokenizer.src_lang = "hr"
    # Pad to the longest sentence in the batch and truncate anything over 512 tokens.
    encoded_hr = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(model.device)  # follow the model's device instead of assuming CUDA is present
    # Forcing the first generated token to the "nl" language id selects Dutch output.
    generated_tokens = model.generate(
        **encoded_hr, forced_bos_token_id=tokenizer.get_lang_id("nl")
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [6]:
from tqdm.auto import tqdm
from sacrebleu.metrics import BLEU

# Sentence-level BLEU scorer. effective_order=True limits the n-gram order to what
# a short segment can actually contain; without it, segments with fewer than four
# tokens score ~0 even when they match the reference exactly.
bleu = BLEU(effective_order=True)

# Accumulators: one translation and one BLEU score per row of evaluation_df
croatian_to_dutch_translations = []
croatian_to_dutch_bleu_scores = []

batch_size = 8  # Number of sentences translated per model call
for i in tqdm(range(0, len(evaluation_df), batch_size)):
    batch = evaluation_df.iloc[i:i+batch_size]

    # Translate this batch from Croatian to Dutch
    translated_dutch_batch = translate_croatian_to_dutch_batch(batch['Croatian'].tolist())
    croatian_to_dutch_translations.extend(translated_dutch_batch)

    # Score every translation against its single Dutch reference
    for translated_dutch, original_dutch in zip(translated_dutch_batch, batch['Dutch'].tolist()):
        dutch_score = bleu.sentence_score(translated_dutch, [original_dutch])
        croatian_to_dutch_bleu_scores.append(dutch_score.score)

# Attach the translations and their per-sentence BLEU scores to the DataFrame
evaluation_df['Translated Dutch'] = croatian_to_dutch_translations
evaluation_df['Croatian to Dutch BLEU'] = croatian_to_dutch_bleu_scores

  0%|          | 0/279 [00:00<?, ?it/s]



In [7]:
# Show the DataFrame to inspect the newly added 'Translated Dutch' and
# 'Croatian to Dutch BLEU' columns.
evaluation_df

Unnamed: 0.1,Unnamed: 0,Dutch,Croatian,Translated Dutch,Croatian to Dutch BLEU
0,0,Op maandag kondigden wetenschappers van de Sta...,U ponedjeljak su znanstvenici s Medicinskog fa...,Op maandag kondigden wetenschappers van de Uni...,20.552056
1,1,Hoofdonderzoekers zeggen dat dit kan leiden to...,Voditelji istraživanja izjavili su da bi ovo o...,Onderzoekers zeiden dat deze ontdekking het mo...,18.530390
2,2,De JAS 39C Gripen stortte rond 09.30 uur lokal...,JAS 39C Gripen srušio se na pistu i eksplodira...,JAS 39C Gripen crashte op de baan en explodeer...,13.181313
3,3,De piloot werd geïdentificeerd als majoor Dilo...,Pilot je identificiran kao zapovjednik eskadri...,De piloot werd geïdentificeerd als de commanda...,33.260250
4,4,De lokale media meldt dat er tijdens een actie...,Lokalni mediji izvješćuju da je došlo do prevr...,Lokale media melden dat er een brandweervoertu...,10.079037
...,...,...,...,...,...
2221,212,Mijn naam is Andrea.,Moje ime je Andrea.,Mijn naam is Andrea.,100.000000
2222,213,Jupiter is een planeet.,Jupiter je planet.,Jupiter is een planeet.,100.000000
2223,214,Venus is een planeet.,Venera je planet.,Venus is een planeet.,100.000000
2224,215,God is een olifant.,Bog je slon.,God is een olifant.,100.000000


In [8]:
import pandas as pd
from nltk.translate.meteor_score import single_meteor_score
from nltk import word_tokenize
import nltk
from tqdm import tqdm

# Download the NLTK resources used below: Punkt models for tokenization and
# WordNet, which NLTK's METEOR implementation uses by default.
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases load the Punkt models from this package
nltk.download('wordnet')

# One METEOR score per row of evaluation_df
croatian_to_dutch_meteor_scores = []

# NOTE(review): NLTK's METEOR defaults to an English stemmer and English WordNet,
# so for Dutch it presumably reduces mostly to exact-match alignment — confirm
# this is acceptable for the reported numbers.
sentence_pairs = zip(evaluation_df['Dutch'], evaluation_df['Translated Dutch'])
for reference_text, hypothesis_text in tqdm(sentence_pairs, total=evaluation_df.shape[0]):
    # Both sides are Dutch, so tokenize with the Dutch Punkt model rather than
    # the English default.
    original_dutch = word_tokenize(reference_text, language='dutch')
    translated_dutch = word_tokenize(hypothesis_text, language='dutch')

    # single_meteor_score takes the reference first, then the hypothesis
    meteor_score = single_meteor_score(original_dutch, translated_dutch)
    croatian_to_dutch_meteor_scores.append(meteor_score)

# Add the METEOR scores to DataFrame
evaluation_df['Croatian to Dutch METEOR'] = croatian_to_dutch_meteor_scores

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 2226/2226 [00:03<00:00, 559.74it/s] 


In [9]:
from evaluate import load
from tqdm import tqdm

# Load the COMET metric through Hugging Face `evaluate`. The checkpoint is fetched
# from the Hub on first use (the captured log below shows Unbabel/wmt22-comet-da).
comet_metric = load('comet')

# Function to compute COMET scores in batches
def compute_comet_in_batches(sources, predictions, references, batch_size=100):
    """Score translations with the notebook-level ``comet_metric`` in chunks.

    Args:
        sources: Source-language sentences.
        predictions: Machine translations, aligned with ``sources``.
        references: Reference translations, aligned with ``sources``.
        batch_size: Number of segments passed to each ``comet_metric.compute`` call.

    Returns:
        A flat list with one score per segment, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the three inputs
            do not all have the same length.
    """
    # A negative step would make range() empty and silently return no scores.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # The three inputs are sliced with the same offsets, so they must be aligned.
    if not (len(sources) == len(predictions) == len(references)):
        raise ValueError(
            "sources, predictions and references must have the same length, got "
            f"{len(sources)}, {len(predictions)} and {len(references)}"
        )

    scores = []
    for start in tqdm(range(0, len(sources), batch_size)):
        stop = start + batch_size
        batch_results = comet_metric.compute(
            predictions=predictions[start:stop],
            references=references[start:stop],
            sources=sources[start:stop],
        )
        scores.extend(batch_results["scores"])
    return scores

# Gather the three parallel inputs for scoring the Croatian -> Dutch direction.
# The keys mirror the parameter names of compute_comet_in_batches.
data_cro_to_dutch = dict(
    sources=evaluation_df['Croatian'].tolist(),              # Croatian originals
    predictions=evaluation_df['Translated Dutch'].tolist(),  # model output in Dutch
    references=evaluation_df['Dutch'].tolist(),              # Dutch references
)

# Score every segment, then store the values rounded to five decimals
croatian_to_dutch_scores = compute_comet_in_batches(**data_cro_to_dutch)
evaluation_df['Croatian to Dutch COMET'] = [round(value, 5) for value in croatian_to_dutch_scores]

# Preview the first rows of the updated DataFrame
print(evaluation_df.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
  0%|          | 0/23 [00:00<?, ?it/s]INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VIS

   Unnamed: 0                                              Dutch  \
0           0  Op maandag kondigden wetenschappers van de Sta...   
1           1  Hoofdonderzoekers zeggen dat dit kan leiden to...   
2           2  De JAS 39C Gripen stortte rond 09.30 uur lokal...   
3           3  De piloot werd geïdentificeerd als majoor Dilo...   
4           4  De lokale media meldt dat er tijdens een actie...   

                                            Croatian  \
0  U ponedjeljak su znanstvenici s Medicinskog fa...   
1  Voditelji istraživanja izjavili su da bi ovo o...   
2  JAS 39C Gripen srušio se na pistu i eksplodira...   
3  Pilot je identificiran kao zapovjednik eskadri...   
4  Lokalni mediji izvješćuju da je došlo do prevr...   

                                    Translated Dutch  Croatian to Dutch BLEU  \
0  Op maandag kondigden wetenschappers van de Uni...               20.552056   
1  Onderzoekers zeiden dat deze ontdekking het mo...               18.530390   
2  JAS 39C Gri




In [10]:
# Mean of the per-sentence values stored in each metric column
average_croatian_to_dutch_bleu = evaluation_df['Croatian to Dutch BLEU'].mean()
average_croatian_to_dutch_meteor = evaluation_df['Croatian to Dutch METEOR'].mean()
average_croatian_to_dutch_comet = evaluation_df['Croatian to Dutch COMET'].mean()

# Report the three averages, one per line, under a common heading
print(
    "Translation scores from Croatian to Dutch:",
    f"Average BLEU: {average_croatian_to_dutch_bleu}",
    f"Average METEOR: {average_croatian_to_dutch_meteor}",
    f"Average COMET: {average_croatian_to_dutch_comet}",
    sep="\n",
)

Translation scores from Croatian to Dutch:
Average BLEU: 20.20284578379734
Average METEOR: 0.4724022992653879
Average COMET: 0.824976051212938


In [11]:
# Persist the scored DataFrame to Google Drive. index=False keeps the positional
# index out of the file, so reloading it with read_csv does not add yet another
# "Unnamed: 0" column.
evaluation_df.to_csv('/content/drive/MyDrive/evaluation_df_m2m100_1_nl_hr.csv', index=False)