# Train T5 for English → Romanian Translation
# This notebook fine-tunes Google's T5 (t5-base) model using the WMT datasets.



In [18]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
import evaluate
import torch


In [19]:

# Load and Preprocess Dataset
# Choose dataset: use 'wmt14', 'de-en' for English-German or 'wmt16', 'ro-en' for English-Romanian
# Also change the PREFIX and output paths accordingly
DATASET_NAME = "wmt16"
LANG_PAIR = "ro-en"
PREFIX = "translate English to Romanian: "

# Load a small subset for training
raw_dataset = load_dataset(DATASET_NAME, LANG_PAIR, split="train[:200]")



In [20]:

# Load Tokenizer and Model

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Tokenize Dataset

def preprocess(example):
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["ro"]
    model_inputs = tokenizer(input_text, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(target_text, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

filtered = raw_dataset.filter(lambda x: x["translation"]["en"].strip() and x["translation"]["ro"].strip())


# Apply preprocessing
tokenized_dataset = filtered.map(preprocess, remove_columns=raw_dataset.column_names)


Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [21]:

# Set Up Training

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = TrainingArguments(
    output_dir="./t5-finetuned-en-ro", 
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [22]:

# Train the Model

trainer.train()

# Save Model and Tokenizer

model.save_pretrained("./t5-finetuned-en-ro")
tokenizer.save_pretrained("./t5-finetuned-en-ro")
print(" T5 model trained and saved.")


Step,Training Loss


 T5 model trained and saved.


In [23]:

# Evaluate BLEU Score

bleu = evaluate.load("bleu")
model.eval()

predictions = []
references = []
sample_dataset = filtered.select(range(50))

for example in sample_dataset:
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["ro"]

    if not input_text.strip() or not target_text.strip():
        continue

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_length=128)
    prediction = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    if prediction.strip():
        predictions.append(prediction.strip())
        references.append([target_text.strip()])

if not predictions or not references:
    print("⚠️ BLEU evaluation skipped: no valid predictions or references.")
else:
    try:
        bleu_result = bleu.compute(predictions=predictions, references=references)
        print(f"BLEU score on 50 samples: {bleu_result['bleu']:.4f}")
    except ZeroDivisionError:
        print("BLEU score could not be computed due to zero-division (no matching n-grams).")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


⚠️ BLEU evaluation skipped: no valid predictions or references.
