In [25]:
pip install transformers datasets evaluate sacrebleu

Note: you may need to restart the kernel to use updated packages.


# Load a sample dataset and split to train and test

In [26]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-es") # isws vrw kai allo ena dataset en-es
books = books["train"].train_test_split(test_size=0.2)

# An example of the dataset

In [46]:
books["train"][0]

{'id': '2030',
 'translation': {'en': "The tea things were brought in, and already had Marianne been disappointed more than once by a rap at a neighbouring door, when a loud one was suddenly heard which could not be mistaken for one at any other house, Elinor felt secure of its announcing Willoughby's approach, and Marianne, starting up, moved towards the door.",
  'es': 'Trajeron las cosas para el té, y ya Marianne había tenido más de una decepción ante los golpes en alguna puerta vecina, cuando de repente se escuchó uno muy fuerte que no podía confundirse con alguno en otra casa. Elinor se sintió segura de que anunciaba la llegada de Willoughby, y Marianne, levantándose de un salto, se dirigió hacia la puerta.'}}

# Import a tokenizer to process language pairs

In [28]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Preprocess language pairs by tokenizing inputs and targets separately
(since you can’t tokenize Spanish text with a tokenizer pretrained on an English vocabulary)

In [29]:
source_lang = "en"
target_lang = "es"
prefix = "translate English to Spanish: "

def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) # 128 for t5-small
    return model_inputs

# Apply the preprocess function to the entire dataset

In [30]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/74776 [00:00<?, ? examples/s]

Map:   0%|          | 0/18694 [00:00<?, ? examples/s]

# Create batches of dict-like objects

In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# Import a metric for evaluation

In [32]:
import evaluate

# more metrics
metric = evaluate.load("sacrebleu")

# Create a function that passes your predictions and labels to compute to calculate the SacreBLEU score

In [33]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Transformer finetuning

In [34]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [35]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [57]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_books["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_books["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

TypeError: Dataset argument should be a datasets.Dataset!

## Metrics

In [44]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

NameError: name 'tf_validation_set' is not defined

# Compile and train the model

In [None]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

In [None]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [15]:
history = model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=1, callbacks=[metric_callback])
model.save_pretrained("tf_model/")



## Plot accuracy

In [16]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

NameError: name 'history' is not defined

In [17]:
input_text = "translate English to Spanish: Legumes share resources with nitrogen-fixing bacteria."

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


## Generate the translations

In [18]:
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

tf.Tensor(
[[    0   312  1744  2687     3     9  5444  4922     3    15    40     3
     60     7    23  1259    32     3     9    50     7     3  9305     9
      7    20  2210   291     3 17694    17     9     5     1]], shape=(1, 34), dtype=int32)
Legumes asoció el residuo a las bacas de fixar azota.




# Second Implementation

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [20]:
input_text = "This is a sentence that we want to translate."

tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "es_XX"
encoded_input = tokenizer(input_text, return_tensors="pt")

## Translate the text

In [21]:
translated_tokens = model.generate(**encoded_input)

## Tokens to strings

In [22]:
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print(translated_text)

Questa è una frase che vogliamo tradurre.


# Third implementation

In [17]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example usage
input_text = "Hello, how are you?"
tokenized = tokenizer([input_text], return_tensors='pt', max_length=512, truncation=True, padding="max_length")
translated = model.generate(**tokenized)
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print(translated_text)

Hola, ¿cómo estás?
