# Train T5 for English → German or English → Romanian Translation
# This notebook fine-tunes Google's T5 (t5-base) model using the WMT datasets.



In [7]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
import evaluate
import torch


In [8]:

# Load and Preprocess Dataset
# Choose dataset: use 'wmt14', 'de-en' for English-German or 'wmt16', 'ro-en' for English-Romanian
# Also change the PREFIX and output paths accordingly
DATASET_NAME = "wmt14"
LANG_PAIR = "de-en"
PREFIX = "translate English to German: "  # Use "translate English to Romanian: " for RO

# Load a small subset for training
raw_dataset = load_dataset(DATASET_NAME, LANG_PAIR, split="train[:200]")



In [9]:

# Load Tokenizer and Model

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Tokenize Dataset

def preprocess(example):
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["de"]  # Change to ["ro"] if using RO
    model_inputs = tokenizer(input_text, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(target_text, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_dataset = raw_dataset.map(preprocess, remove_columns=raw_dataset.column_names)


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:

# Set Up Training

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = TrainingArguments(
    output_dir="./t5-finetuned-en-de",  # Change to ./t5-finetuned-en-ro if using RO
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [11]:

# Train the Model

trainer.train()

# Save Model and Tokenizer

model.save_pretrained("./t5-finetuned-en-de")
tokenizer.save_pretrained("./t5-finetuned-en-de")
print(" T5 model trained and saved.")


Step,Training Loss


 T5 model trained and saved.


In [12]:

# Evaluate BLEU Score

bleu = evaluate.load("bleu")
model.eval()

predictions, references = [], []
sample_dataset = raw_dataset.select(range(50))

for example in sample_dataset:
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["de"]  # Change to ["ro"] for RO

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_length=128)
    prediction = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    predictions.append(prediction.strip())
    references.append([target_text.strip()])

bleu_result = bleu.compute(predictions=predictions, references=references)
print(f"BLEU score on 50 samples: {bleu_result['bleu']:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


BLEU score on 50 samples: 0.2653
