In [1]:
pip install transformers datasets evaluate sacrebleu

Note: you may need to restart the kernel to use updated packages.


# Load a sample dataset and split to train and test

In [2]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-es") # isws vrw kai allo ena dataset en-es
books = books["train"].train_test_split(test_size=0.2)

books_train_validation_split = books["train"].train_test_split(test_size=0.1)

books["train"] = books_train_validation_split["train"]
books["validation"] = books_train_validation_split["test"]

# An example of the dataset

In [3]:
books["train"][0]

{'id': '92627',
 'translation': {'en': 'I afterwards learned that it was to discover if we had any diamonds concealed. This practice had been established since time immemorial among those civilized nations that scour the seas. I was informed that the religious Knights of Malta never fail to make this search whenever any Moors of either sex fall into their hands. It is a part of the law of nations, from which they never deviate.',
  'es': 'mas luego supe que era por ver si en aquel sitio habíamos escondido algunos diamantes, y que es estilo establecido de tiempo inmemorial en las naciones civilizadas que andan barriendo los mares, y que los señores religiosos caballeros de Malta nunca le omiten quando apresan á Turcos ó Turcas, porque es ley del derecho de gentes, que nunca ha sido quebrantada.'}}

# Import a tokenizer to process language pairs

In [4]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=128)

# Preprocess language pairs by tokenizing inputs and targets separately
(since you can’t tokenize Spanish text with a tokenizer pretrained on an English vocabulary)

In [5]:
source_lang = "en"
target_lang = "es"
prefix = "translate English to Spanish: "

def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) # 128 for t5-small
    return model_inputs

# Apply the preprocess function to the entire dataset

In [6]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/67298 [00:00<?, ? examples/s]

Map:   0%|          | 0/18694 [00:00<?, ? examples/s]

Map:   0%|          | 0/7478 [00:00<?, ? examples/s]

# Create batches of dict-like objects

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

# Import a metric for evaluation

In [8]:
import evaluate

# more metrics
metric = evaluate.load("sacrebleu")

# Create a function that passes your predictions and labels to compute to calculate the SacreBLEU score

In [9]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    try:
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu": result["score"]}
    
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    except Exception as e:
        print("Error in compute_metrics:", e)
        # Optionally, re-raise the error after logging
        raise

# Transformer finetuning

In [10]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [11]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [12]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_books["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_books["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_books["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Compile and train the model

In [13]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

## See the model summary

In [14]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60,506,624
Trainable params: 60,506,624
Non-trainable params: 0
_________________________________________________________________


In [15]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [16]:
from tqdm import tqdm

generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128
)

def generate_with_xla(batch):
    return model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=128,
    )
    
def compute_metrics():
    all_preds = []
    all_labels = []

    for batch, labels in tqdm(tf_validation_set):
        predictions = generate_with_xla(batch)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = labels.numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    result = metric.compute(predictions=all_preds, references=all_labels)
    return {"bleu": result["score"]}

print(compute_metrics())

100%|██████████| 468/468 [1:17:36<00:00,  9.95s/it]


{'bleu': 0.4436166800441257}


In [17]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=10)
model.save_pretrained("tf_model/")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
print(compute_metrics())

100%|██████████| 468/468 [1:33:22<00:00, 11.97s/it]


{'bleu': 10.883976180425792}


In [19]:
input_text = "translate English to Spanish: He wrote a letter to a friend."

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


## Generate the translations

In [20]:
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

tf.Tensor(
[[    0  1289     3    15     7 12563    32     3     9    73     3  3690
    839     5     1]], shape=(1, 15), dtype=int32)
El escrito a un amigo.




# Second Implementation

In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [22]:
input_text = "Allow raters to post-edit translation and measure difference"

tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "es_XX"
encoded_input = tokenizer(input_text, return_tensors="pt")

## Translate the text

In [23]:
translated_tokens = model.generate(**encoded_input)

## Tokens to strings

In [24]:
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print(translated_text)

អនុញ្ញាត ឲ្យ អ្នក វាយតម្លៃ កែសម្រួល ការ បកប្រែ ក្រោយ និង វាស់ ភាព ខុស គ្នា


# Third implementation

In [25]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example usage
input_text = "Works poorly when comparing different kinds of systems"
tokenized = tokenizer([input_text], return_tensors='pt', max_length=512, truncation=True, padding="max_length")
translated = model.generate(**tokenized)
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
print(translated_text)



Funciona mal al comparar diferentes tipos de sistemas
