# Train T5 for English → Romanian Translation
# This notebook fine-tunes Google's T5 (t5-small) model using the WMT datasets.



In [58]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
import evaluate
import torch
from evaluate import load



In [59]:

# Load and Preprocess Dataset
# Choose dataset: use 'wmt14', 'de-en' for English-German or 'wmt16', 'ro-en' for English-Romanian
# Also change the PREFIX and output paths accordingly
DATASET_NAME = "wmt16"
LANG_PAIR = "ro-en"
PREFIX = "translate English to Romanian: "

# Load a small subset for training
raw_dataset = load_dataset(DATASET_NAME, LANG_PAIR, split="train[:5000]")
#num_train_epochs=3


In [60]:

# Load Tokenizer and Model

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenize Dataset

def preprocess(example):
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["ro"]
    model_inputs = tokenizer(input_text, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(target_text, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_dataset = raw_dataset.map(preprocess, remove_columns=raw_dataset.column_names)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [61]:

# Set Up Training

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = TrainingArguments(
    output_dir="./t5-finetuned-en-ro", 
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [62]:

# Train the Model

trainer.train()

# Save Model and Tokenizer

#model.save_pretrained("./t5-finetuned-en-ro")
#tokenizer.save_pretrained("./t5-finetuned-en-ro")
# Save Model and Tokenizer
model.save_pretrained("./t5-small-finetuned-en-ro")
tokenizer.save_pretrained("./t5-small-finetuned-en-ro")
print(" T5 model trained and saved.")


Step,Training Loss
500,1.1512
1000,0.1079
1500,0.1003
2000,0.0993
2500,0.0971
3000,0.0935
3500,0.0902


 T5 model trained and saved.


In [63]:
# Evaluate BLEU Score
bleu = load("bleu")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions, references = [], []
sample_dataset = raw_dataset.select(range(50))

for idx, example in enumerate(sample_dataset):
    input_text = PREFIX + example["translation"]["en"]
    target_text = example["translation"]["ro"]

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_length=128)

    prediction = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    print("\n🔹 INPUT:", input_text)
    print("🔸 TARGET:", target_text)
    print("✅ PREDICTION:", prediction)

    if prediction.strip():
        predictions.append(prediction.strip().lower())
        references.append([target_text.strip().lower()])

# Compute BLEU with smoothing if available
if predictions and references:
    try:
        bleu_result = bleu.compute(predictions=predictions, references=references, smooth=True)
        print(f"\n✅ BLEU score on 50 samples: {bleu_result['bleu']:.4f}")
    except ZeroDivisionError:
        print("❌ BLEU score could not be computed due to zero n-gram overlap.")
else:
    print("⚠️ No valid predictions or references to evaluate BLEU.")


🔹 INPUT: translate English to Romanian: Membership of Parliament: see Minutes
🔸 TARGET: Componenţa Parlamentului: a se vedea procesul-verbal
✅ PREDICTION: Componenţa Parlamentului: a se vedea procesul-verbal

🔹 INPUT: translate English to Romanian: Approval of Minutes of previous sitting: see Minutes
🔸 TARGET: Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal
✅ PREDICTION: Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal

🔹 INPUT: translate English to Romanian: Membership of Parliament: see Minutes
🔸 TARGET: Componenţa Parlamentului: a se vedea procesul-verbal
✅ PREDICTION: Componenţa Parlamentului: a se vedea procesul-verbal

🔹 INPUT: translate English to Romanian: Verification of credentials: see Minutes
🔸 TARGET: Verificarea prerogativelor: a se vedea procesul-verbal
✅ PREDICTION: Verificarea prerogativelor: a se vedea procesul-verbal

🔹 INPUT: translate English to Romanian: Documents received: see Minutes
🔸 TARGET: Depuner