In [2]:
import numpy as np
import pandas as pd
import os
import torch
import tensorflow as tf
from datasets import load_dataset

## Load pretrained model from Hgging Face

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  2


In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

## Load the dataset and apply preprocessing

In [7]:
from datasets import load_dataset

dataset = load_dataset("kde4", lang1="en", lang2="fr")

README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

kde4.py:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

The repository for kde4 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/kde4.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Before we start we need to split the data into train and validation sets, but first, we see the dataset is very large so we will take only `80k` samples as our dataset and split accordingly

In [8]:
SEED = 4243
reduced_data = dataset["train"].shuffle(seed=SEED).select(range(80001))
split_dataset = reduced_data.train_test_split(train_size=0.9, seed=SEED)
# renaming  the "test" key of the "split_dataset" as "validation"
split_dataset["validation"] = split_dataset.pop("test")
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 72000
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 8001
    })
})

### Preprocessing 
1. Preparing the DatasetDict object for the model
2. Tokenization using the model's tokenizer

In [9]:
split_dataset['train'][1]

{'id': '121135',
 'translation': {'en': 'Teacher text:', 'fr': 'Texte du professeur & #160;:'}}

In [10]:
# define maximum length of the input sequence
max_len = 100

# define a function to tokenize the text using the model's tokenizer
def tokenize_dataset(sentences):
    # separate the english and french sentences into 2 lists
    english = [sentence['en'] for sentence in sentences['translation']]
    french = [sentence['fr'] for sentence in sentences['translation']]

    # apply the tokenizer
    inputs = tokenizer(english, text_target = french, max_length = max_len, truncation=True)
    
    # return tokenized inputs to be used by the model
    return inputs

In [11]:
# use the method to preprocess the dataset
tokenized_data = split_dataset.map(function=tokenize_dataset, batched=True, remove_columns = split_dataset["train"].column_names)

Map:   0%|          | 0/72000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8001 [00:00<?, ? examples/s]

#### Data Collator
Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train dataset. 
So first we load it using our *tokenizer* and *model*, then we apply it on the data

instantiate the model as a **TF seq2seq model** to be able to apply the collator


In [12]:
from transformers import TFAutoModelForSeq2SeqLM
# instantiate the model as a TF seq2seq model
model = TFAutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

tf_model.h5:   0%|          | 0.00/301M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [13]:
# import the collator
from transformers import DataCollatorForSeq2Seq

# load the data collator
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model, return_tensors="tf")
# apply it to convert the dataset to tf.data.Dataset object
train_dataset = model.prepare_tf_dataset(
    tokenized_data['train'],
    collate_fn = collator,
    shuffle = True,
    batch_size = 32,
)

eval_dataset = model.prepare_tf_dataset(
    tokenized_data['validation'],
    collate_fn = collator,
    shuffle = True, 
    batch_size = 16,
)

## Fine-tuning the model
Notice that we're keeping the number of epochs very low as:
* 1st: 1 epoch can take 1h maybe 2
* 2nd: the model learns in one epoch what a traditional model can learn in 100 (an d even more) epochs



In [30]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback

# define number of epochs
epochs = 3
# nbr of training steps 
train_steps = len(train_dataset)*epochs

# define the optimizer
optimizer, schedule = create_optimizer(
    init_lr = 5e-5,
    num_warmup_steps = 0,
    num_train_steps = train_steps,
    weight_decay_rate = 0.01,
)

# compiling the model
model.compile(optimizer=optimizer)

When we train the model we want the model to be saved on Hugging Face so we need to log to the account and specify the directory it will b saved to

In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir="helsinki-finetuned-en-to-fr",
                             tokenizer=tokenizer)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/kaggle/working/helsinki-finetuned-en-to-fr is already a clone of https://huggingface.co/ranwakhaled/helsinki-finetuned-en-to-fr. Make sure you pull the latest changes with `repo.git_pull()`.


In [32]:
!git config --global user.name "ranwakhaled"

  pid, fd = os.forkpty()


In [33]:
! git config --global user.email "sydneysageivashkov24@gmail.com"

In [34]:
# now we train the model
history = model.fit(train_dataset, 
                   validation_data= eval_dataset,
                   callbacks = [callback],
                   epochs = epochs)

Epoch 1/3

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


Epoch 2/3

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


Epoch 3/3

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}




### Evaluating the model
We use the **BLEU** score available in the `sacreBLEU` library using the test set
#### Loading the fine-tuned model to evaluate it

In [29]:
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("ranwakhaled/helsinki-finetuned-en-to-fr")
model = TFAutoModelForSeq2SeqLM.from_pretrained("ranwakhaled/helsinki-finetuned-en-to-fr")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at ranwakhaled/helsinki-finetuned-en-to-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [30]:
!pip install sacrebleu evaluate

  pid, fd = os.forkpty()


Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.3 portalocker-3.0.0 sacrebleu-2.4.3


In [31]:
import evaluate
from tqdm import tqdm

# load the scareBLEU metric
metric = evaluate.load("sacrebleu")

# create data collator for test set
test_collator = DataCollatorForSeq2Seq(tokenizer= tokenizer,
                                      model=model,
                                      return_tensors = "tf",
                                      pad_to_multiple_of = 128)
# generated tensorflow dataset object
test_dataset = model.prepare_tf_dataset(tokenized_data['validation'],
                                       collate_fn = test_collator,
                                       shuffle=False,
                                       batch_size=4,)
# shuffle dataset 
test_dataset = test_dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
# take the first 200 samples
test_dataset = test_dataset.take(200)

# wrapping the fun in a @tf.function to make compilation faster
#@tf.function(jit_compile=True)
def generate(batch):
    return model.generate(input_ids = batch["input_ids"],
                         attention_mask = batch["attention_mask"],
                         max_new_tokens = 128,)

def compute_metrics():
    # function to compute the bleu score for the dataset
    all_preds = []  # list to store predictions
    all_labels = []  # list to store labels

    # using tqdm we show a progress bar for our evaluation
    for batch, labels in tqdm(test_dataset):
        # get the french translation (prediction)
        predictions = generate(batch)
        # convert tokens to words (decoding the output)
        translations = tokenizer.batch_decode(predictions, akip_special_tokens=True)
        # convert labels to ndarray
        labels = labels.numpy()
        #Replace the -100 tokens with pad_token_id (here, 59513)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        # decode the labels as well
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # strip any extra spaces resulted from post processing
        translations = [trans.strip() for trans in translations]
        decoded_labels = [[label.strip()] for label in decoded_labels]

        all_preds.extend(translations)
        all_labels.extend(decoded_labels)

    # computing the BLEU metric
    BLEU = metric.compute(predictions=all_preds, references = all_labels)
    return BLEU

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [32]:
# use the compute_metrics function to evaluate the model 
metrics = compute_metrics()
print(metrics)

100%|██████████| 200/200 [54:39<00:00, 16.40s/it] 


{'score': 4.799920346318993, 'counts': [5957, 4389, 3371, 2630], 'totals': [82496, 81696, 80896, 80096], 'precisions': [7.220956167571761, 5.3723560517038775, 4.16707871835443, 3.2835597283260087], 'bp': 1.0, 'sys_len': 82496, 'ref_len': 8178}


### Test on an external sentence
Creating the `translate_sentence()` that takes a single sentence and returns the translation

I've noticed that the sentence is translated followed by this string  
`Please take the official translations! You find them here: http: / /europa. eu. int/ eur-lex/ lex/ LexUriServ/ LexUriServ/ LexUriServ. do? uri=CELEX:32001L0059: EN: HTML`   
which could be caused by the training data and could very well be eliminated if we increase training samples however since it doesn't necessarily affect the quality of the translation we'll just remove it in the post processing after the sentence is translated

In [33]:
def translate(sentence):
    batch = tokenizer([sentence], return_tensors='pt')
    generated_ids = model.generate(**batch)
    result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Remove unwanted text
    unwanted_snippet = "Please take the official translations! You find them here:"
    return result.split(unwanted_snippet)[0].strip()

In [28]:
# test the loaded model on a sample
# Marie Curie est la savante la plus intelligente de son époque
# le chat est assit sur le tapis
# j'ai un controle le samedi prochain
# Ella a besoin de passer le DELF pour prouver son niveau
# Emile Zola était un des plus grands supporteur de Dreffus et il s'est engagé pour prouver son innocence

# load the model from hugging face
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("ranwakhaled/helsinki-finetuned-en-to-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("ranwakhaled/helsinki-finetuned-en-to-fr")

sample_text = ["Marie Curie is the smartest scientist of her time",
              'the cat is sitting on the mat', 
             "I have an exam next saturday",
              "Ella needs to take the DELF  to prove her language level",
              "Emile Zola was one of the biggest supporters of Dreffus and was keen on proving his innocence"]
for i in range(len(sample_text)):
    print(f'Example {i}:\nEnglish: {sample_text[i]}\nTranslation: {translate(sample_text[i])}')

Example 1:
English: Marie Curie is the smartest scientist of her time
Translation: Marie Curie est la scientifique la plus intelligente de son temps
Example 1:
English: the cat is sitting on the mat
Translation: le chat est couché sur la grille
Example 1:
English: I have an exam next saturday
Translation: J'ai un examen le samedi prochain
Example 1:
English: Ella needs to take the DELF  to prove her language level
Translation: Ella doit prendre le DELF pour démontrer son niveau de langue
Example 1:
English: Emile Zola was one of the biggest supporters of Dreffus and was keen on proving his innocence
Translation: Émile Zola était l'un des plus grands supporters de Dreffus et avait hâte de démontrer son innocence
