In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import TFT5ForConditionalGeneration, AutoTokenizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/Datas/data.csv")

print(df.head())

  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


In [4]:
# Load the T5-Base modele
model = TFT5ForConditionalGeneration.from_pretrained("t5-base", use_safetensors=False, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained("t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [5]:
def preprocessing(english_txt, spanish_txt, max_length = 128):
  inputs = ['translate English to Spanish: ' + x for x in english_txt]

  model_inputs = tokenizer(
      inputs,
      max_length=max_length,
      padding="max_length",
      truncation=True,
      return_tensors="tf")

  labels = tokenizer(
      spanish_txt,
      max_length = max_length,
      padding = "max_length",
      truncation = True,
      return_tensors = "tf")

  labels_ids = labels["input_ids"]
  labels_ids = tf.where(labels_ids == tokenizer.pad_token_id, -100, labels_ids)

  return{
      'input_ids': model_inputs['input_ids'],
      'attention_mask': model_inputs['attention_mask'],
      'labels': labels_ids
  }

In [6]:
train_data = preprocessing(df['english'].tolist(), df['spanish'].tolist())

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


In [18]:
train_data

{'input_ids': <tf.Tensor: shape=(118964, 128), dtype=int32, numpy=
 array([[13959,  1566,    12, ...,     0,     0,     0],
        [13959,  1566,    12, ...,     0,     0,     0],
        [13959,  1566,    12, ...,     0,     0,     0],
        ...,
        [13959,  1566,    12, ...,     0,     0,     0],
        [13959,  1566,    12, ...,     0,     0,     0],
        [13959,  1566,    12, ...,     0,     0,     0]], dtype=int32)>,
 'attention_mask': <tf.Tensor: shape=(118964, 128), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'labels': <tf.Tensor: shape=(118964, 128), dtype=int32, numpy=
 array([[ 3901,     5,     1, ...,  -100,  -100,  -100],
        [20495,    15,     5, ...,  -100,  -100,  -100],
        [ 2964,    63,     9, ...,  -100,  -100,  -100],
        ...,
        [  597,

# Test/Train Split

In [20]:
dataset_size = train_data['input_ids'].shape[0]
split_index = int(0.8 * dataset_size) #80% training

train_dataset = {
    'input_ids': train_data['input_ids'][:split_index],
    'attention_mask': train_data['attention_mask'][:split_index],
    'labels': train_data['labels'][:split_index]
}

val_dataset = {
    'input_ids': train_data['input_ids'][split_index:],
    'attention_mask': train_data['attention_mask'][split_index:],
    'labels': train_data['labels'][split_index:]
}

In [7]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    optimizer=optimizer
)

In [10]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222903552 (850.31 MB)
Trainable params: 222903552 (850.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
history = model.fit(
    x={'input_ids': train_dataset['input_ids'],
       'attention_mask': train_dataset['attention_mask']},
    y=train_dataset['labels'],
    validation_data=(
        {'input_ids': val_dataset['input_ids'],
         'attention_mask': val_dataset['attention_mask']},
        val_dataset['labels']
    ),
    epochs=10,
    batch_size=16
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
model.save_pretrained("my_t5_model")
tokenizer.save_pretrained("my_t5_model")

('my_t5_model/tokenizer_config.json',
 'my_t5_model/special_tokens_map.json',
 'my_t5_model/spiece.model',
 'my_t5_model/added_tokens.json',
 'my_t5_model/tokenizer.json')

In [3]:
def load_model():
    model_path = "./my_t5_model"
    model = TFT5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer

In [7]:
model, tokenizer = load_model()




TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ./my_t5_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [8]:
def translate(text):
  inputs = 'translate English to Spanish: ' + text
  input_ids = tokenizer(inputs, return_tensors = 'tf')

  outputs = model.generate(
    input_ids['input_ids'],
    max_length=128,
    num_beams=5,
    early_stopping=True
  )

  translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return translation

In [9]:
print(translate("You look very beautiful today."))

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Hoy te ves muy bonita.


In [2]:
import evaluate

bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

  from .autonotebook import tqdm as notebook_tqdm
  if not hasattr(np, "object"):
Downloading builder script: 8.15kB [00:00, 15.8MB/s]
Downloading builder script: 7.95kB [00:00, 10.9MB/s]


In [10]:
# A small test batch
predictions = [translate("Hello world"), translate("The dog is big")]
references = ["Hola mundo", "El perro es grande"]

# Calculate BLEU (Word overlap)
bleu_results = bleu.compute(predictions=predictions, references=[[r] for r in references])
print(f"BLEU Score: {bleu_results['score']}")

# Calculate BERTScore (Meaning similarity)
bert_results = bertscore.compute(predictions=predictions, references=references, lang="es")
print(f"BERTScore F1 Mean: {np.mean(bert_results['f1'])}")

BLEU Score: 59.460355750136046


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BERTScore F1 Mean: 0.9093613028526306


In [None]:
testing = []
predictions = [tra]

In [48]:
test_data = [
    # 1. Imperatives (One word commands)
    "Go.",
    "Run.",
    "Wait.",
    "Stop.",

    # 2. Simple Greetings & Questions
    "Hi.",
    "Hello.",
    "Who are you?",
    "How are you?",
    "Where is he?",
    "Can I help you?",

    # 3. Simple Sentences (Subject + Verb + Object)
    "I am happy.",
    "She is my friend.",
    "He acts like a child.",
    "I like to read.",
    "The car is blue.",
    "We need more money.",

    # 4. Slightly Complex (Time/Conditionals)
    "I will go tomorrow.",
    "If you want, I can help.",
    "He speaks English very well.",
    "I have to go to sleep."
]

In [11]:
test_data_complex = [
    # 1. The Subjunctive Challenge (Testing Es/Que/Subj logic)
    "I want you to tell me the truth even if it is difficult.",
    "It is important that we finish the project before the deadline.",
    "I wish you were here to see this result.",
    "If I had known, I would have acted differently.",

    # 2. Idioms & Figurative Language (Testing 'Meaning' vs. 'Literal Translation')
    "Don't beat around the bush; just get to the point.",
    "That car cost me an arm and a leg.",
    "We are all in the same boat, so let's work together.",
    "He decided to call it a day after hours of debugging.",

    # 3. Technical & CS Domain (Testing your Texas State CS background)
    "The neural network uses back-propagation to minimize the loss function.",
    "You need to refactor the code to improve memory efficiency.",
    "The database uses a primary key to ensure data integrity.",
    "Asynchronous functions allow the program to run multiple tasks concurrently.",

    # 4. Long, Compound-Complex Sentences (Testing Attention Mechanisms)
    "Although the initial results were promising, the research team decided to re-verify the data to ensure that no errors had occurred during the collection process.",
    "The person who contacted you yesterday is the same one who will be leading the seminar on artificial intelligence next week.",
    "If you decide to go to the conference in Austin, make sure to bring your laptop so we can finish the presentation on the way.",

    # 5. Ambiguity & Context (Testing Homonyms)
    "I am going to the bank to deposit my check.", # Money bank
    "The fisherman sat on the bank of the river.", # River bank
    "He left his glasses on the table because he couldn't see clearly.",
    "The plant in the corner needs more sunlight to grow."
]

In [50]:
# ==========================================
# RUN BATCH PREDICTION
# ==========================================
print(f"{'ENGLISH INPUT':<30} | {'MODEL TRANSLATION'}")
print("-" * 60)

for text in test_data:
  print(f"{text:<30} | {translate(text)}")

ENGLISH INPUT                  | MODEL TRANSLATION
------------------------------------------------------------
Go.                            | Vete.
Run.                           | Corre.
Wait.                          | Espera.
Stop.                          | Deténganse.
Hi.                            | Hola.
Hello.                         | Hola.
Who are you?                   | Quién sos?
How are you?                   | Cómo estás?
Where is he?                   | Dónde está él?
Can I help you?                | Puedo ayudarte?
I am happy.                    | Soy feliz.
She is my friend.              | Ella es mi amiga.
He acts like a child.          | Él se comporta como un nio.
I like to read.                | Me gusta leer.
The car is blue.               | El auto es azul.
We need more money.            | Necesitamos más dinero.
I will go tomorrow.            | Iré maana.
If you want, I can help.       | Si quieres, puedo ayudar.
He speaks English very well.   | Él habla muy

In [17]:
print(f"{'ENGLISH INPUT':<160} | {'MODEL TRANSLATION'}")
print("-" * 60)

for text in test_data_complex:
  print(f"{text:<160} | {translate(text)}")

ENGLISH INPUT                                                                                                                                                    | MODEL TRANSLATION
------------------------------------------------------------
I want you to tell me the truth even if it is difficult.                                                                                                         | Quiero que me digas la verdad, incluso si es difcil.
It is important that we finish the project before the deadline.                                                                                                  | Es importante que terminemos el proyecto antes de la fecha.
I wish you were here to see this result.                                                                                                                         | Ojalá estuvieras aqu para ver este resultado.
If I had known, I would have acted differently.                                                               