In [101]:
pip install transformers datasets evaluate sacrebleu

Note: you may need to restart the kernel to use updated packages.


# Load a sample dataset and split it into train, validation, and test sets

In [102]:
from datasets import load_dataset

# Fixed seed so the train / validation / test partitions are identical on
# every run (train_test_split shuffles before splitting).
SPLIT_SEED = 42

# NOTE: another en-es dataset may be swapped in here later.
books = load_dataset("opus_books", "en-es")

# Hold out 20% of the corpus as the test split ...
books = books["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# ... then carve 10% of the remaining training data off as the validation split.
books_train_validation_split = books["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

books["train"] = books_train_validation_split["train"]
books["validation"] = books_train_validation_split["test"]

# An example of the dataset

In [103]:
# Peek at the first record of the training split.
books["train"][0]

{'id': '80523',
 'translation': {'en': 'During all this time the necessary works had not been neglected.',
  'es': 'Durante aquel tiempo no se habían descuidado las obras necesarias.'}}

# Import a tokenizer to process language pairs

In [104]:
from transformers import AutoTokenizer

# Base checkpoint name; reused further down when the model itself is loaded.
checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    model_max_length=128,
)

# Preprocess language pairs by tokenizing inputs and targets separately
(since you can’t tokenize Spanish text with a tokenizer pretrained on an English vocabulary)

In [105]:
source_lang, target_lang = "en", "es"
prefix = "translate English to Spanish: "


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch with a "translation" entry holding one dict per
            example, keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences passed through ``text_target``.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    # 128 for t5-small
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

# Apply the preprocess function to the entire dataset

In [106]:
# batched=True because preprocess_function works on a whole batch at once
# (it iterates over examples["translation"]).
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/67298 [00:00<?, ? examples/s]

Map:   0%|          | 0/18694 [00:00<?, ? examples/s]

Map:   0%|          | 0/7478 [00:00<?, ? examples/s]

# Create batches of dict-like objects

In [107]:
from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` receives the checkpoint *name* ("t5-small") here, not
# a loaded model object -- the model is only instantiated in a later cell.
# Confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=checkpoint,
    return_tensors="tf",
)

# Import a metric for evaluation

In [108]:
import evaluate

# TODO: more metrics (only SacreBLEU is loaded for now)
metric = evaluate.load("sacrebleu")

# Create a function that passes your predictions and labels to `metric.compute` to calculate the SacreBLEU score

In [109]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        tuple: ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each
        holding one stripped reference string.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute SacreBLEU and mean generated length for a set of predictions.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. ``preds``
            may itself be a tuple, in which case only its first element is
            used.

    Returns:
        dict: ``{"bleu": ..., "gen_len": ...}`` with both values rounded to
        four decimals.

    Raises:
        Exception: Any error raised while decoding or scoring is printed and
            then re-raised unchanged.
    """
    try:
        preds, labels = eval_preds
        preds = preds[0] if isinstance(preds, tuple) else preds
        pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)

        # -100 entries in the labels are swapped for the pad token id before
        # decoding.
        label_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)
        ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

        scores = metric.compute(predictions=pred_texts, references=ref_texts)
        summary = {"bleu": scores["score"]}

        # Length of each prediction = number of non-pad token ids in it.
        lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        summary["gen_len"] = np.mean(lengths)

        return {name: round(value, 4) for name, value in summary.items()}
    except Exception as e:
        print("Error in compute_metrics:", e)
        # Re-raise after logging so the failure is not swallowed.
        raise

# Transformer finetuning

In [110]:
from transformers import AdamWeightDecay

# Optimizer with weight decay (learning rate 2e-5, decay rate 0.01).
# NOTE: `optimizer` is assigned again with the same arguments right before
# model.compile(...) further down.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [111]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the TensorFlow seq2seq model from the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [112]:
# Build one tf.data pipeline per split, in the order train, test, validation.
# Only the training split is shuffled; batch size and collator are shared.
tf_train_set, tf_test_set, tf_validation_set = (
    model.prepare_tf_dataset(
        tokenized_books[split_name],
        shuffle=(split_name == "train"),
        batch_size=16,
        collate_fn=data_collator,
    )
    for split_name in ("train", "test", "validation")
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [113]:
# Sanity check: print the first batch of each pipeline (train, test, validation).
for dataset in (tf_train_set, tf_test_set, tf_validation_set):
    for batch in dataset.take(1):
        print(batch)

({'input_ids': <tf.Tensor: shape=(16, 119), dtype=int64, numpy=
array([[13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0],
       ...,
       [13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0],
       [13959,  1566,    12, ...,     0,     0,     0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(16, 119), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}, <tf.Tensor: shape=(16, 128), dtype=int64, numpy=
array([[    3,     2,   476, ...,  -100,  -100,  -100],
       [    3,     2, 15046, ...,  -100,  -100,  -100],
       [  180,     2,     6, ...,  -100,  -100,  -100],
       ...,
       [ 1915,    32,   975, ...,  -100,  -100,  -100],

## Metrics

In [114]:
from transformers.keras_callbacks import KerasMetricCallback

# NOTE(review): the model.fit(...) call further down does not pass this
# callback, so it is currently unused in the visible notebook.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_set,
    predict_with_generate=True,
)

# Compile and train the model

In [115]:
# Re-create the optimizer (same settings as the earlier cell) and compile.
# No loss argument is passed to compile().
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

## See the model summary

In [116]:
# Print the per-layer parameter counts.
model.summary()

Model: "tft5_for_conditional_generation_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60,506,624
Trainable params: 60,506,624
Non-trainable params: 0
_________________________________________________________________


In [117]:
import tensorflow as tf
# List the GPUs TensorFlow can see and report how many there are.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [125]:
from tqdm import tqdm

# Collator that pads every batch to a multiple of 128 tokens.
# NOTE(review): nothing else in the visible notebook references this object;
# the evaluation loop iterates tf_validation_set, which was built with
# `data_collator`. Confirm whether it is still needed.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
    pad_to_multiple_of=128,
)

def generate_with_xla(batch):
    """Generate output token ids for one batch with the global ``model``.

    NOTE(review): despite the name, nothing here wraps the call in
    ``tf.function`` or requests XLA compilation; this is a plain
    ``model.generate`` call.

    Args:
        batch: Mapping providing "input_ids" and "attention_mask".

    Returns:
        Whatever ``model.generate`` returns for the batch, with at most 128
        newly generated tokens per sequence.
    """
    generated_ids = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=128,
    )
    return generated_ids
    
def compute_metrics(dataset=None):
    """Generate translations for a batched dataset and score them with SacreBLEU.

    NOTE: from this cell onward this definition replaces the earlier
    ``compute_metrics(eval_preds)`` in the notebook namespace. Objects that
    already captured the earlier function (e.g. ``metric_callback``) keep
    using the old one.

    Args:
        dataset: Iterable yielding ``(batch, labels)`` pairs, where ``batch``
            is accepted by ``generate_with_xla`` and ``labels`` exposes
            ``.numpy()``. Defaults to ``tf_validation_set``.

    Returns:
        dict: ``{"bleu": <SacreBLEU score>}`` computed over the whole dataset.
    """
    if dataset is None:
        dataset = tf_validation_set

    all_preds = []
    all_labels = []

    for batch, labels in tqdm(dataset):
        predictions = generate_with_xla(batch)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # -100 entries in the labels are swapped for the pad token id before
        # decoding.
        labels = labels.numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Reuse the shared helper rather than duplicating the strip/wrap logic.
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    result = metric.compute(predictions=all_preds, references=all_labels)
    return {"bleu": result["score"]}

# BLEU on the validation set before model.fit(...) has been run.
print(compute_metrics())

100%|██████████| 468/468 [1:31:06<00:00, 11.68s/it]


{'bleu': 1.3294719997610709}


In [126]:
# Monitor loss on the validation split while training; the test split is left
# untouched so it can serve as a final held-out check.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1)

# Persist the fine-tuned weights; a later cell reloads them from this directory.
model.save_pretrained("tf_model/")

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# BLEU on the validation set again, this time after the model.fit(...) cell.
print(compute_metrics())

 84%|████████▍ | 395/468 [1:18:16<14:12, 11.68s/it]

In [None]:
# The sentence carries the same task prefix that preprocess_function prepends.
input_text = "translate English to Spanish: Legumes share resources with nitrogen-fixing bacteria."

# The tokenizer is reloaded from the base checkpoint (no model_max_length
# override this time); the model is read from the directory written by
# save_pretrained.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

## Generate the translations

In [None]:
# Tokenize the single prefixed sentence and generate up to 128 tokens.
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

# Decoding ids back to text does not need the (deprecated)
# as_target_tokenizer() context, so decode directly.
print(tokenizer.decode(out[0], skip_special_tokens=True))

# Second Implementation

In [40]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and model are loaded from the same multilingual checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = AutoTokenizer.from_pretrained(mbart_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(mbart_checkpoint)

In [None]:
input_text = "This is a sentence that we want to translate."

# Set the source and target language codes on the tokenizer, then encode the
# sentence as PyTorch tensors.
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "es_XX"
encoded_input = tokenizer(input_text, return_tensors="pt")

## Translate the text

In [None]:
# mBART-50 many-to-many picks the output language from the first generated
# token, so the target language code must be forced as BOS; setting
# tokenizer.tgt_lang alone does not influence generate().
translated_tokens = model.generate(
    **encoded_input,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang),
)

## Tokens to strings

In [None]:
# Decode the first generated sequence, dropping special tokens.
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print(translated_text)

# Third implementation

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Tokenizer and model for the dedicated English -> Spanish Marian checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example usage
input_text = "Hello, how are you?"
# padding=True pads only up to the longest sequence in the batch; the previous
# padding="max_length" padded this single short sentence out to 512 tokens.
# Truncation at 512 tokens is kept for long inputs.
tokenized = tokenizer([input_text], return_tensors='pt', max_length=512, truncation=True, padding=True)
translated = model.generate(**tokenized)
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print(translated_text)