In [76]:
pip install transformers datasets evaluate sacrebleu

Note: you may need to restart the kernel to use updated packages.


# Load a sample dataset and split it into train and test sets

In [77]:
from datasets import load_dataset

# Keep the unsplit corpus under its own name so `books` only ever refers to the
# train/test split used by the rest of the notebook.
raw_books = load_dataset("opus_books", "en-fr")
# A fixed seed makes the 80/20 split (and every example shown below) reproducible
# across kernel restarts.
books = raw_books["train"].train_test_split(test_size=0.2, seed=42)

# An example of the dataset

In [78]:
# Peek at the first training record to see the record layout (an id plus an en/fr pair).
books["train"][0]

{'id': '108121',
 'translation': {'en': 'It might be thought that this was Captain Speedy.',
  'fr': 'Certes, on doit croire que cet homme était le capitaine Speedy !'}}

# Import a tokenizer to process language pairs

In [79]:
from transformers import AutoTokenizer

# Checkpoint name reused by the tokenizer here and by the data collator and model later on.
checkpoint = "t5-small"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Preprocess language pairs by tokenizing inputs and targets separately
(since you can’t tokenize French text with a tokenizer pretrained on an English vocabulary)

In [80]:
# Keys of the source/target sentences inside each "translation" record, and the task
# prefix prepended to every source sentence.
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences supplied through ``text_target``. Both sides are
        truncated to 128 tokens.
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

# Apply the preprocess function to the entire dataset

In [81]:
# batched=True passes preprocess_function a batch of records per call, which is what
# its iteration over examples["translation"] expects.
tokenized_books = books.map(preprocess_function, batched=True)

Map: 100%|██████████| 101668/101668 [00:07<00:00, 12851.16 examples/s]
Map: 100%|██████████| 25417/25417 [00:02<00:00, 12551.21 examples/s]


# Create batches of dict-like objects

In [82]:
from transformers import DataCollatorForSeq2Seq

# Collator later passed as `collate_fn` when building the tf datasets; it returns
# TensorFlow tensors (return_tensors="tf").
# NOTE(review): `model` receives the checkpoint name string, not a loaded model
# object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# Import a metric for evaluation

In [83]:
import evaluate

metric = evaluate.load("sacrebleu")

# Create a function that passes your predictions and labels to `metric.compute` to calculate the SacreBLEU score

In [84]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which each stripped reference is wrapped in its own
        single-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        # Each prediction gets a list of references; here there is exactly one.
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    ignore_index = -100  # padding/ignore sentinel; not a real vocabulary id

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # The sentinel cannot be decoded, so swap it for the pad token on BOTH
    # sides. Predictions may carry it too when differently sized batches are
    # padded together; leaving it in would also inflate gen_len below.
    preds = np.where(preds != ignore_index, preds, tokenizer.pad_token_id)
    labels = np.where(labels != ignore_index, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Transformer finetuning

In [85]:
from transformers import AdamWeightDecay

# Adam optimizer with weight decay.
# NOTE(review): an identical optimizer is constructed again right before
# model.compile further down, which supersedes this one.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [86]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the TensorFlow seq2seq model for the same checkpoint the tokenizer was loaded from.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [87]:
# Training batches: 16 examples each, shuffled, collated by data_collator.
tf_train_set = model.prepare_tf_dataset(
    tokenized_books["train"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=True,
)

# Evaluation batches: same batch size and collator, but not shuffled.
tf_test_set = model.prepare_tf_dataset(
    tokenized_books["test"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=False,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Compile and train the model

In [88]:
# Fresh optimizer for this compile call (same settings as the one created in the earlier cell).
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# No `loss` is passed here; presumably the model falls back on its own internal
# loss — confirm against the Transformers documentation.
model.compile(optimizer=optimizer)

In [89]:
# No visible earlier cell binds the name `tf`, so import it here; otherwise this
# cell raises NameError on a fresh kernel (Restart & Run All).
import tensorflow as tf

# Report how many GPUs TensorFlow can see before starting the training run.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [90]:
# Fine-tune for a single epoch, validating on the batches built from the test split.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=1)
# Persist the fine-tuned model; the inference cell reloads it from this directory.
model.save_pretrained("tf_model/")



In [92]:
# The prompt starts with the same task prefix that was prepended to the inputs during preprocessing.
input_text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

# Reload the tokenizer from the base checkpoint and the model from the directory
# written by save_pretrained, rebinding both names.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [95]:
# Tokenize the single prompt and generate up to 128 output tokens.
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

# Decode the generated ids straight away. The deprecated `as_target_tokenizer()`
# context manager is not needed for decoding, and the notebook already uses its
# replacement (`text_target=`) when tokenizing targets.
print(tokenizer.decode(out[0], skip_special_tokens=True))

tf.Tensor(
[[    0   622     3    40   154  1744  2687 16762    29    17   110 17126
    393   110     3  9305  2229  2593  2210 11488     7     3    26    31
  17694    17    15     5     1]], shape=(1, 29), dtype=int32)
Les légumes partagent les ressources avec les bactéries fixatrices d'azote.
