In [1]:
!pip install -U sacrebleu datasets transformers sentencepiece protobuf accelerate evaluate unbabel-comet

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m102.4/106.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf
  Downloading protobuf-5.26.0-cp37-abi3-manylinux2014_x86_64.whl (302 kB)


In [2]:
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

In [3]:
from google.colab import drive
# Mount Google Drive so that the CSV and checkpoint paths under
# /content/drive/MyDrive used by the following cells can be resolved.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the combined training corpus from Google Drive into a DataFrame.
# Later cells expect it to provide the 'hr' and 'nl' text columns.
file_path = '/content/drive/MyDrive/combined_df.csv'  # Update with your file path
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [6]:
# Remove the index column left over from an earlier CSV export.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)

# `df` must expose the parallel-text columns 'hr' (source) and 'nl' (target).

# Hold out 5% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.05)

# Wrap each pandas split in a Hugging Face Dataset object.
train_dataset, val_dataset = (Dataset.from_pandas(split) for split in (train_df, val_df))

# Initialize tokenizer and model from the previously fine-tuned checkpoint
model_checkpoint = '/content/drive/MyDrive/models-finetuning-1/finetuned-m2m100-hr-to-nl-mmt/checkpoint-46095'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Generation settings (used by model.generate() during evaluation).
# They are set on `generation_config` rather than on `model.config`: putting
# them on the model config triggers the "Non-default generation parameters"
# warning every time a checkpoint is saved.
# NOTE(review): when the checkpoint directory ships its own
# generation_config.json, values placed on `model.config` are presumably not
# picked up by generate() at all - confirm against the installed transformers version.
model.generation_config.max_length = 64
model.generation_config.early_stopping = True
model.generation_config.num_beams = 5

# Specify source and target languages
tokenizer.src_lang = "hr"
tokenizer.tgt_lang = "nl"

# Define preprocess function to tokenize inputs and targets
def preprocess_function(examples):
    """Tokenize one batch of Croatian sources and Dutch targets.

    Args:
        examples: Batch mapping passed by ``Dataset.map(..., batched=True)``;
            the values under the "hr" and "nl" keys are tokenized as source
            and target text respectively.

    Returns:
        The tokenizer output for the "hr" texts, with the token ids of the
        "nl" texts stored under the "labels" key. Both sides are truncated
        to at most 64 tokens.
    """
    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager: the tokenizer switches to target-language mode for the
    # targets and stores their input ids under "labels" in a single call.
    return tokenizer(
        examples["hr"],
        text_target=examples["nl"],
        max_length=64,
        truncation=True,
    )

# Tokenize both splits in batches and drop the raw text / pandas index columns
raw_columns = ['hr', 'nl', '__index_level_0__']
train_dataset = train_dataset.map(preprocess_function, remove_columns=raw_columns, batched=True)
val_dataset = val_dataset.map(preprocess_function, remove_columns=raw_columns, batched=True)

# Collator that assembles the seq2seq batches for the trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments, gathered in a dict first so they are easy to inspect.
# Note the deliberately tiny learning rate (1e-9) used for this run.
training_config = {
    "output_dir": "finetuned-m2m100-minimal-effect",
    # Evaluate and checkpoint once per epoch
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    # Optimisation
    "learning_rate": 1e-9,
    "per_device_train_batch_size": 64,
    "per_device_eval_batch_size": 64,
    "weight_decay": 0.1,
    "num_train_epochs": 3,
    "predict_with_generate": False,
    "fp16": True,
    # Logging
    "logging_dir": './logs',
    "logging_steps": 200,
    "warmup_steps": 0,
    # Keep the checkpoint with the lowest validation loss
    "load_best_model_at_end": True,
    "metric_for_best_model": "loss",
    "greater_is_better": False,
    "label_smoothing_factor": 0,
}
args = Seq2SeqTrainingArguments(**training_config)

# Assemble the trainer from the model, arguments, datasets and collator above.
# Training itself is launched from the next cell.
trainer_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/53280 [00:00<?, ? examples/s]



Map:   0%|          | 0/2805 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [8]:
# Run fine-tuning with the arguments configured in the previous cell
# (3 epochs, learning_rate=1e-9, evaluation and checkpointing once per epoch).
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.5565,2.422888
2,2.5442,2.422034
3,2.5408,2.421889


Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
Non-default generation parameters: {'max_length': 64, 'early_stopping': True, 'num_beams': 5}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2499, training_loss=2.5355303009875825, metrics={'train_runtime': 418.9766, 'train_samples_per_second': 381.501, 'train_steps_per_second': 5.965, 'total_flos': 4104834726494208.0, 'train_loss': 2.5355303009875825, 'epoch': 3.0})

In [9]:
from google.colab import drive
import pandas as pd

# Ensure Google Drive is mounted, then read the evaluation set from it
drive.mount('/content/drive')
evaluation_path = '/content/drive/MyDrive/evaluation_df.csv'
evaluation_df = pd.read_csv(evaluation_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
def translate_croatian_to_dutch_batch(texts):
    """Translate a batch of Croatian sentences into Dutch with the global model.

    Args:
        texts: Sequence of Croatian strings. An empty sequence is allowed.

    Returns:
        A list with one decoded Dutch string per input text, in input order,
        with special tokens stripped.
    """
    # Nothing to translate: skip the tokenizer and the model entirely.
    if len(texts) == 0:
        return []

    tokenizer.src_lang = "hr"
    # Inference only: make sure dropout is disabled even right after training.
    model.eval()
    # Inputs are truncated at 512 tokens here (the training cell used 64).
    # Tensors follow the model's device instead of assuming a GPU is present.
    encoded_hr = tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(model.device)
    # Force the decoder to start with the Dutch language token.
    generated_tokens = model.generate(**encoded_hr, forced_bos_token_id=tokenizer.get_lang_id("nl"))
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [12]:
from tqdm.auto import tqdm
from sacrebleu.metrics import BLEU

# Corpus-level scorer, kept under its original name.
bleu = BLEU()
# Sentence-level scorer. effective_order=True only uses the n-gram orders that
# actually exist in a segment; without it, a hypothesis shorter than four
# tokens scores ~0 even when it matches the reference exactly.
sentence_bleu = BLEU(effective_order=True)

# Prepare lists for translations and BLEU scores
croatian_to_dutch_translations = []
croatian_to_dutch_bleu_scores = []

batch_size = 64  # Set the batch size for processing
for i in tqdm(range(0, len(evaluation_df), batch_size)):
    batch = evaluation_df.iloc[i:i+batch_size]

    # Translate the Croatian sentences of this batch into Dutch
    translated_dutch_batch = translate_croatian_to_dutch_batch(batch['Croatian'].tolist())

    # Extend the translation list with the translated batch
    croatian_to_dutch_translations.extend(translated_dutch_batch)

    # Score every translation against its single Dutch reference
    croatian_to_dutch_bleu_scores.extend(
        sentence_bleu.sentence_score(translated_dutch, [original_dutch]).score
        for translated_dutch, original_dutch in zip(translated_dutch_batch, batch['Dutch'].tolist())
    )

# Update DataFrame with translated Dutch and per-sentence BLEU scores
evaluation_df['Translated Dutch'] = croatian_to_dutch_translations
evaluation_df['Croatian to Dutch BLEU'] = croatian_to_dutch_bleu_scores

  0%|          | 0/35 [00:00<?, ?it/s]

In [13]:
# Display the evaluation frame, now including the 'Translated Dutch' and
# 'Croatian to Dutch BLEU' columns added by the previous cell.
evaluation_df

Unnamed: 0.1,Unnamed: 0,Dutch,Croatian,Translated Dutch,Croatian to Dutch BLEU
0,0,Op maandag kondigden wetenschappers van de Sta...,U ponedjeljak su znanstvenici s Medicinskog fa...,Op maandag kondigden wetenschappers van de Uni...,20.552056
1,1,Hoofdonderzoekers zeggen dat dit kan leiden to...,Voditelji istraživanja izjavili su da bi ovo o...,Onderzoekers zeiden dat deze ontdekking het mo...,18.530390
2,2,De JAS 39C Gripen stortte rond 09.30 uur lokal...,JAS 39C Gripen srušio se na pistu i eksplodira...,JAS 39C Gripen crashte op de baan en explodeer...,13.181313
3,3,De piloot werd geïdentificeerd als majoor Dilo...,Pilot je identificiran kao zapovjednik eskadri...,De piloot werd geïdentificeerd als de commanda...,33.260250
4,4,De lokale media meldt dat er tijdens een actie...,Lokalni mediji izvješćuju da je došlo do prevr...,Lokale media melden dat er een brandweervoertu...,10.079037
...,...,...,...,...,...
2221,212,Mijn naam is Andrea.,Moje ime je Andrea.,Mijn naam is Andrea.,100.000000
2222,213,Jupiter is een planeet.,Jupiter je planet.,Jupiter is een planeet.,100.000000
2223,214,Venus is een planeet.,Venera je planet.,Venus is een planeet.,100.000000
2224,215,God is een olifant.,Bog je slon.,God is een olifant.,100.000000


In [14]:
import pandas as pd
from nltk.translate.meteor_score import single_meteor_score
from nltk import word_tokenize
import nltk
from tqdm import tqdm

# Download the NLTK resources used below: Punkt tokenizer models and WordNet
nltk.download('punkt')
nltk.download('wordnet')

# One METEOR score per evaluation row
croatian_to_dutch_meteor_scores = []

# evaluation_df must contain the reference translations in 'Dutch' and the
# model output in 'Translated Dutch'; both sides of the comparison are Dutch.
# NOTE(review): NLTK's METEOR presumably falls back to English stemming and
# WordNet synonyms by default, so for Dutch it mostly rewards exact matches -
# confirm before comparing against published METEOR numbers.
for _, row in tqdm(evaluation_df.iterrows(), total=evaluation_df.shape[0]):
    # Tokenize with the Dutch Punkt model instead of the English default
    reference_tokens = word_tokenize(row['Dutch'], language='dutch')
    hypothesis_tokens = word_tokenize(row['Translated Dutch'], language='dutch')

    # single_meteor_score expects (reference, hypothesis), both pre-tokenized
    croatian_to_dutch_meteor_scores.append(
        single_meteor_score(reference_tokens, hypothesis_tokens)
    )

# Add the METEOR scores to the DataFrame
evaluation_df['Croatian to Dutch METEOR'] = croatian_to_dutch_meteor_scores

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
100%|██████████| 2226/2226 [00:04<00:00, 541.48it/s]


In [15]:
from evaluate import load
from tqdm import tqdm

# Load the COMET metric through `evaluate`. In the recorded run below this
# downloaded the Unbabel/wmt22-comet-da checkpoint (about 2.3 GB).
comet_metric = load('comet')

# Function to compute COMET scores in batches
def compute_comet_in_batches(sources, predictions, references, batch_size=100):
    """Score translations with the global COMET metric, one slice at a time.

    Args:
        sources: Source-language sentences.
        predictions: Machine translations, aligned index-by-index with ``sources``.
        references: Reference translations, aligned index-by-index with ``sources``.
        batch_size: Number of segments sent to ``comet_metric.compute`` per call.

    Returns:
        A flat list with the per-segment scores, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive or the three inputs do
            not have the same length.
    """
    # A negative step would make range() empty and silently return no scores.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Misaligned inputs would pair sentences with the wrong references.
    if not (len(sources) == len(predictions) == len(references)):
        raise ValueError(
            "sources, predictions and references must have the same length, got "
            f"{len(sources)}, {len(predictions)} and {len(references)}"
        )

    scores = []
    for start in tqdm(range(0, len(sources), batch_size)):
        stop = start + batch_size
        batch_results = comet_metric.compute(
            predictions=predictions[start:stop],
            references=references[start:stop],
            sources=sources[start:stop],
        )
        scores.extend(batch_results["scores"])
    return scores

# evaluation_df must already hold the 'Croatian', 'Translated Dutch' and 'Dutch' columns.

# Inputs for Croatian -> Dutch COMET scoring, keyed by the scorer's argument names
data_cro_to_dutch = dict(
    sources=evaluation_df['Croatian'].tolist(),  # original Croatian sentences
    predictions=evaluation_df['Translated Dutch'].tolist(),  # model output in Dutch
    references=evaluation_df['Dutch'].tolist(),  # reference Dutch sentences
)

# Score all segments, then store the values rounded to five decimals
croatian_to_dutch_scores = compute_comet_in_batches(**data_cro_to_dutch)
evaluation_df['Croatian to Dutch COMET'] = [round(value, 5) for value in croatian_to_dutch_scores]

# Show the first rows of the updated DataFrame
print(evaluation_df.head())

Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
  0%|          | 0/23 [00:00<?, ?it/s]INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:L

   Unnamed: 0                                              Dutch  \
0           0  Op maandag kondigden wetenschappers van de Sta...   
1           1  Hoofdonderzoekers zeggen dat dit kan leiden to...   
2           2  De JAS 39C Gripen stortte rond 09.30 uur lokal...   
3           3  De piloot werd geïdentificeerd als majoor Dilo...   
4           4  De lokale media meldt dat er tijdens een actie...   

                                            Croatian  \
0  U ponedjeljak su znanstvenici s Medicinskog fa...   
1  Voditelji istraživanja izjavili su da bi ovo o...   
2  JAS 39C Gripen srušio se na pistu i eksplodira...   
3  Pilot je identificiran kao zapovjednik eskadri...   
4  Lokalni mediji izvješćuju da je došlo do prevr...   

                                    Translated Dutch  Croatian to Dutch BLEU  \
0  Op maandag kondigden wetenschappers van de Uni...               20.552056   
1  Onderzoekers zeiden dat deze ontdekking het mo...               18.530390   
2  JAS 39C Gri




In [16]:
# Mean of each per-sentence metric column
(
    average_croatian_to_dutch_bleu,
    average_croatian_to_dutch_meteor,
    average_croatian_to_dutch_comet,
) = (
    evaluation_df[column].mean()
    for column in ('Croatian to Dutch BLEU', 'Croatian to Dutch METEOR', 'Croatian to Dutch COMET')
)

# Report the averages, one per line
print(
    "Translation scores from Croatian to Dutch:",
    f"Average BLEU: {average_croatian_to_dutch_bleu}",
    f"Average METEOR: {average_croatian_to_dutch_meteor}",
    f"Average COMET: {average_croatian_to_dutch_comet}",
    sep="\n",
)

Translation scores from Croatian to Dutch:
Average BLEU: 20.192258614234717
Average METEOR: 0.47189188112277436
Average COMET: 0.8249622102425876


In [18]:
# Save the scored evaluation set back to Google Drive.
# index=False stops pandas from writing the row index as an extra column;
# each earlier save/load round trip added one more 'Unnamed: 0' column, as the
# 'Unnamed: 0.1' / 'Unnamed: 0' headers of the loaded frame show.
evaluation_df.to_csv('/content/drive/MyDrive/evaluation_df_m2m100_hr_nl.csv', index=False)