In [1]:
!pip install -U sacrebleu datasets transformers sentencepiece protobuf accelerate evaluate unbabel-comet

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m102.4/106.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.0 MB/s[0m e

In [2]:
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

In [3]:
from google.colab import drive
# Mount Google Drive so the CSV files and model checkpoint under /content/drive/MyDrive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the parallel training corpus from Google Drive into a DataFrame.
# Later cells read its 'nl' and 'hr' text columns and drop its 'Unnamed: 0' column.
file_path = '/content/drive/MyDrive/combined_df.csv' # Update with your file path
df = pd.read_csv(file_path)

In [5]:
# Remove every row that has a missing value in any column (modifies df in place).
df.dropna(inplace=True)

In [6]:
# Discard the 'Unnamed: 0' column (a leftover index column from the CSV).
# Non-mutating drop with errors='ignore' so the cell can be re-run without raising KeyError
# once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)

# `df` is expected to hold parallel sentences in its 'nl' (source) and 'hr' (target) columns.

# Hold out 5% of the rows for validation; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.05, random_state=42)

# Wrap both splits as Hugging Face Dataset objects.
# The resulting datasets carry an extra '__index_level_0__' column, which is removed after tokenization.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Continue from the checkpoint produced by the earlier nl->hr fine-tuning run.
model_checkpoint = '/content/drive/MyDrive/models-finetuning-1/finetuned-m2m100-nl-to-hr-mmt/checkpoint-46095'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Decoding defaults used by model.generate().
# They are set on generation_config rather than model.config: the library reports generation
# parameters stored in the model config as misplaced ("Non-default generation parameters" in the
# training log), and generation_config is what generate() reads.
model.generation_config.max_length = 64
model.generation_config.early_stopping = True
model.generation_config.num_beams = 5

# Language codes the tokenizer uses for source (Dutch) and target (Croatian) text.
tokenizer.src_lang = "nl"
tokenizer.tgt_lang = "hr"

# Define preprocess function to tokenize inputs and targets
def preprocess_function(examples):
    """Tokenize a batch of Dutch -> Croatian sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with parallel lists of strings under "nl" (source)
            and "hr" (target), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer encoding of the Dutch inputs, with the token ids of the
        Croatian targets stored under "labels". Both sides are truncated to
        64 tokens.
    """
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager:
    # the targets are encoded in target-language mode and returned as "labels".
    return tokenizer(
        examples["nl"],
        text_target=examples["hr"],
        max_length=64,
        truncation=True,
    )

# Tokenize both splits. The raw text columns and the '__index_level_0__' column are dropped
# so that only the fields produced by preprocess_function remain.
columns_to_remove = ['hr', 'nl', '__index_level_0__']
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True, remove_columns=columns_to_remove)
    for split in (train_dataset, val_dataset)
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Hyper-parameters for this run, gathered in one place.
# NOTE(review): learning_rate=1e-9 is extremely small; together with the output_dir name this
# looks like a deliberate "minimal effect" run - confirm before reusing these settings.
training_config = dict(
    output_dir="finetuned-m2m100-minimal-effect",
    # Evaluate and checkpoint once per epoch; keep the checkpoint with the lowest eval loss.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    # Optimisation settings.
    learning_rate=1e-9,
    weight_decay=0.1,
    warmup_steps=0,
    label_smoothing_factor=0,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    # Evaluation reports loss only; no text is generated during training.
    predict_with_generate=False,
    # Logging.
    logging_dir='./logs',
    logging_steps=200,
)
args = Seq2SeqTrainingArguments(**training_config)

# Assemble the trainer from the model, data and settings prepared above.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Training itself is launched from the next cell via trainer.train().

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/53280 [00:00<?, ? examples/s]



Map:   0%|          | 0/2805 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [8]:
# Run fine-tuning with the trainer configured in the previous cell
# (evaluation and checkpointing happen once per epoch, per the training arguments).
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6522,2.548484
2,2.6448,2.546086
3,2.594,2.5457


Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2499, training_loss=2.6288748287400896, metrics={'train_runtime': 425.8856, 'train_samples_per_second': 375.312, 'train_steps_per_second': 5.868, 'total_flos': 4625210719076352.0, 'train_loss': 2.6288748287400896, 'epoch': 3.0})

In [9]:
from google.colab import drive
import pandas as pd

# Mount Google Drive (per the recorded output, this only reports "already mounted" when the
# drive was mounted by an earlier cell).
drive.mount('/content/drive')
# Load the evaluation set; later cells read its 'Dutch' and 'Croatian' columns.
evaluation_df = pd.read_csv('/content/drive/MyDrive/evaluation_df.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
def translate_dutch_to_croatian_batch(texts):
    """Translate a batch of Dutch sentences into Croatian with the loaded model.

    Args:
        texts: List of Dutch source strings.

    Returns:
        List of decoded Croatian strings, one per input, with special tokens removed.
    """
    tokenizer.src_lang = "nl"
    # Place the inputs on whatever device the model currently lives on instead of
    # hard-coding 'cuda', so the function also works on a CPU-only runtime.
    encoded_nl = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(model.device)
    # Forcing the first generated token to the Croatian language id selects the output language.
    generated_tokens = model.generate(**encoded_nl, forced_bos_token_id=tokenizer.get_lang_id("hr"))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [11]:
from tqdm.auto import tqdm
from sacrebleu.metrics import BLEU

bleu = BLEU()

# Accumulators for the model output and the per-sentence BLEU score of every row.
dutch_to_croatian_translations = []
dutch_to_croatian_bleu_scores = []

batch_size = 64  # Number of sentences translated per model call
for i in tqdm(range(0, len(evaluation_df), batch_size)):
    batch = evaluation_df.iloc[i:i+batch_size]

    # Translate this slice of Dutch source sentences into Croatian.
    translated_croatian_batch = translate_dutch_to_croatian_batch(batch['Dutch'].tolist())
    dutch_to_croatian_translations.extend(translated_croatian_batch)

    # Score every hypothesis against its human Croatian reference.
    # (The reference must come from the 'Croatian' column; comparing against the Dutch
    # source would measure overlap with the input rather than translation quality.)
    for translated_croatian, original_croatian in zip(translated_croatian_batch, batch['Croatian'].tolist()):
        croatian_score = bleu.corpus_score([translated_croatian], [[original_croatian]])
        dutch_to_croatian_bleu_scores.append(croatian_score.score)

# Store the translations and their sentence-level BLEU scores next to the source rows.
# NOTE: these are per-sentence scores; their mean is not the same quantity as corpus-level BLEU.
evaluation_df['Translated Croatian'] = dutch_to_croatian_translations
evaluation_df['Dutch to Croatian BLEU'] = dutch_to_croatian_bleu_scores

  0%|          | 0/35 [00:00<?, ?it/s]



In [12]:
evaluation_df

Unnamed: 0.1,Unnamed: 0,Dutch,Croatian,Translated Croatian,Dutch to Croatian BLEU
0,0,Op maandag kondigden wetenschappers van de Sta...,U ponedjeljak su znanstvenici s Medicinskog fa...,U ponedjeljak su znanstvenici sa Sveučilišta S...,8.280336
1,1,Hoofdonderzoekers zeggen dat dit kan leiden to...,Voditelji istraživanja izjavili su da bi ovo o...,Glavni istraživači kažu da to može dovesti do ...,5.341814
2,2,De JAS 39C Gripen stortte rond 09.30 uur lokal...,JAS 39C Gripen srušio se na pistu i eksplodira...,JAS 39C Gripen srušio se oko 09 30 lokalnog vr...,34.854268
3,3,De piloot werd geïdentificeerd als majoor Dilo...,Pilot je identificiran kao zapovjednik eskadri...,Pilot je identificiran kao majstor Dilokrit Pa...,44.124845
4,4,De lokale media meldt dat er tijdens een actie...,Lokalni mediji izvješćuju da je došlo do prevr...,Lokalni mediji izvješćuju da je tijekom akcije...,19.468125
...,...,...,...,...,...
2221,212,Mijn naam is Andrea.,Moje ime je Andrea.,Moje ime je Andrea.,100.000000
2222,213,Jupiter is een planeet.,Jupiter je planet.,Jupiter je planet.,100.000000
2223,214,Venus is een planeet.,Venera je planet.,Venera je planet.,100.000000
2224,215,God is een olifant.,Bog je slon.,Bog je slon.,100.000000


In [17]:
import pandas as pd
from nltk.translate.meteor_score import single_meteor_score
from nltk import word_tokenize
import nltk
from tqdm import tqdm

# Fetch the NLTK data packages used by word_tokenize and single_meteor_score below.
nltk.download('punkt')
nltk.download('wordnet')

# evaluation_df provides the human reference in 'Croatian' and the model output in
# 'Translated Croatian'; both are word-tokenized before scoring.
reference_texts = evaluation_df['Croatian']
hypothesis_texts = evaluation_df['Translated Croatian']

# One METEOR score per row: tokenized reference first, tokenized hypothesis second.
dutch_to_croatian_meteor_scores = [
    single_meteor_score(word_tokenize(reference_text), word_tokenize(hypothesis_text))
    for reference_text, hypothesis_text in tqdm(
        zip(reference_texts, hypothesis_texts), total=evaluation_df.shape[0]
    )
]

# Store the scores alongside the other metrics.
evaluation_df['Dutch to Croatian METEOR'] = dutch_to_croatian_meteor_scores

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 2226/2226 [00:02<00:00, 985.70it/s] 


In [18]:
from evaluate import load
from tqdm import tqdm

# Load the COMET metric through the `evaluate` library.
# In the recorded run log this resolves to the Unbabel/wmt22-comet-da checkpoint.
comet_metric = load('comet')

# Function to compute COMET scores in batches
def compute_comet_in_batches(sources, predictions, references, batch_size=100):
    """Score translations with the loaded COMET metric, one slice at a time.

    Args:
        sources: Source-language sentences.
        predictions: Machine translations, aligned with ``sources``.
        references: Reference translations, aligned with ``sources``.
        batch_size: Number of sentences passed to the metric per call.

    Returns:
        A flat list with the entries of every call's "scores" result, in input order.
    """
    all_scores = []
    for start in tqdm(range(0, len(sources), batch_size)):
        window = slice(start, start + batch_size)
        outcome = comet_metric.compute(
            predictions=predictions[window],
            references=references[window],
            sources=sources[window],
        )
        all_scores += outcome["scores"]
    return all_scores

# evaluation_df must already contain the 'Dutch', 'Translated Croatian' and 'Croatian' columns.

# Map each COMET input role to the DataFrame column that supplies it:
# Dutch originals are the sources, model output the predictions, human Croatian the references.
comet_role_to_column = {
    "sources": 'Dutch',
    "predictions": 'Translated Croatian',
    "references": 'Croatian',
}
data_dutch_to_croatian = {
    role: evaluation_df[column].tolist()
    for role, column in comet_role_to_column.items()
}

# Score all rows; the dict keys match the parameter names of compute_comet_in_batches.
dutch_to_croatian_scores = compute_comet_in_batches(**data_dutch_to_croatian)

# Keep five decimals per sentence score.
evaluation_df['Dutch to Croatian COMET'] = list(
    map(lambda score: round(score, 5), dutch_to_croatian_scores)
)

# Show the first rows of the updated DataFrame
print(evaluation_df.head())

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
  0%|          | 0/23 [00:00<?, ?it/s]INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VIS

   Unnamed: 0                                              Dutch  \
0           0  Op maandag kondigden wetenschappers van de Sta...   
1           1  Hoofdonderzoekers zeggen dat dit kan leiden to...   
2           2  De JAS 39C Gripen stortte rond 09.30 uur lokal...   
3           3  De piloot werd geïdentificeerd als majoor Dilo...   
4           4  De lokale media meldt dat er tijdens een actie...   

                                            Croatian  \
0  U ponedjeljak su znanstvenici s Medicinskog fa...   
1  Voditelji istraživanja izjavili su da bi ovo o...   
2  JAS 39C Gripen srušio se na pistu i eksplodira...   
3  Pilot je identificiran kao zapovjednik eskadri...   
4  Lokalni mediji izvješćuju da je došlo do prevr...   

                                 Translated Croatian  Dutch to Croatian BLEU  \
0  U ponedjeljak su znanstvenici sa Sveučilišta S...                8.280336   
1  Glavni istraživači kažu da to može dovesti do ...                5.341814   
2  JAS 39C Gri




In [19]:
# Mean of each per-sentence metric column stored on evaluation_df.
metric_columns = (
    'Dutch to Croatian BLEU',
    'Dutch to Croatian METEOR',
    'Dutch to Croatian COMET',
)
(
    average_dutch_to_croatian_bleu,
    average_dutch_to_croatian_meteor,
    average_dutch_to_croatian_comet,
) = (evaluation_df[column].mean() for column in metric_columns)

# Report the three averages, one per line.
report_lines = [
    "Translation scores from Dutch to Croatian:",
    f"Average BLEU: {average_dutch_to_croatian_bleu}",
    f"Average METEOR: {average_dutch_to_croatian_meteor}",
    f"Average COMET: {average_dutch_to_croatian_comet}",
]
print("\n".join(report_lines))

Translation scores from Dutch to Croatian:
Average BLEU: 18.14793389972014
Average METEOR: 0.422967240265338
Average COMET: 0.8420753459119497


In [20]:
# Persist the scored evaluation set to Drive.
# index=False keeps the positional index out of the file; writing it would make another
# 'Unnamed: 0' column when the CSV is read back (the frame already carries one).
evaluation_df.to_csv('/content/drive/MyDrive/evaluation_df_m2m100_nl_hr.csv', index=False)