In [None]:
!pip install transformers datasets tensorflow
!pip install peft
!pip install evaluate sacrebleu rouge-score



In [None]:
import torch
import numpy as np
import math
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model
import evaluate

In [None]:
from datasets import load_dataset

# Load the full dataset
dataset = load_dataset('wmt16', 'de-en')

# Limit the dataset size (e.g., 500 examples for training and 100 for validation)
num_train_samples = 500
num_eval_samples = 100

# Take subsets of the dataset
train_subset = dataset['train'].select(range(num_train_samples))
eval_subset = dataset['validation'].select(range(num_eval_samples))

# Update the dataset with the subsets
dataset = {
    'train': train_subset,
    'validation': eval_subset
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset['train']['translation'][0]

{'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}

In [None]:
model_name = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.config.pad_token_id = tokenizer.pad_token_id

print("Tokenizer pad_token_id:", tokenizer.pad_token_id)
print("Model pad_token_id:", model.config.pad_token_id)

Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]

Tokenizer pad_token_id: 0
Model pad_token_id: 0


In [None]:
example = {'translation':
           {'de': 'Wiederaufnahme der Sitzungsperiode',
            'en': 'Resumption of the session'}}

In [None]:
def preprocess_function(examples):

    # Add instruction-style prompt (important for T5)
    inputs = [
        "translate English to German: " + ex["en"]
        for ex in examples["translation"]
    ]

    targets = [ex["de"] for ex in examples["translation"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True
    )

    # Tokenize targets
    labels = tokenizer(
        targets,
        max_length=128,
        padding="max_length",
        truncation=True,
        add_special_tokens=True
    )

    # Ignore padding in loss
    labels_ids = labels["input_ids"]

    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100)
         for token in label]
        for label in labels_ids
    ]

    model_inputs["labels"] = labels_ids

    return model_inputs


# Apply preprocessing
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_eval = dataset["validation"].map(preprocess_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets = {
    "train": tokenized_train,
    "validation": tokenized_eval
}

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=100,
    num_train_epochs=4,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    save_strategy="no"
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [None]:
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
10,1.314219,1.088967
20,1.260239,1.088962
30,1.305167,1.088969
40,1.326619,1.088969
50,1.25489,1.088971
60,1.321903,1.088972


TrainOutput(global_step=64, training_loss=1.2968619465827942, metrics={'train_runtime': 28.2664, 'train_samples_per_second': 70.755, 'train_steps_per_second': 2.264, 'total_flos': 68123885568000.0, 'train_loss': 1.2968619465827942, 'epoch': 4.0})

In [None]:
# Evaluate model
eval_results = trainer.evaluate()

print("Evaluation Results:")
print(eval_results)

# Extract evaluation loss
eval_loss = eval_results["eval_loss"]

# Compute perplexity
import math
perplexity = math.exp(eval_loss)

print("Evaluation Loss:", eval_loss)
print("Perplexity:", perplexity)

Evaluation Results:
{'eval_loss': 1.0889724493026733, 'eval_runtime': 0.6053, 'eval_samples_per_second': 165.198, 'eval_steps_per_second': 1.652, 'epoch': 4.0}
Evaluation Loss: 1.0889724493026733
Perplexity: 2.9712194248372366


In [None]:
model.eval()

import torch
import numpy as np

#  Get model device (cuda or cpu)
device = model.device

#  Move inputs to same device as model
input_ids = torch.tensor(tokenized_eval["input_ids"]).to(device)
attention_mask = torch.tensor(tokenized_eval["attention_mask"]).to(device)

#  Generate translations
generated_tokens = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=128
)

#  Move outputs back to CPU before decoding
generated_tokens = generated_tokens.cpu()

decoded_preds = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=True
)

# Prepare true labels
labels = tokenized_eval["labels"]

# Replace -100 back with pad_token_id
labels = np.where(np.array(labels) != -100, labels, tokenizer.pad_token_id)

decoded_labels = tokenizer.batch_decode(
    labels,
    skip_special_tokens=True
)

print("Sample Prediction:")
print("Prediction:", decoded_preds[0])
print("Reference:", decoded_labels[0])

Sample Prediction:
Prediction: In Tokio treffen sich die Premierminister Indiens und Japans
Reference: Die Premierminister Indiens und Japans trafen sich in Tokio.


In [None]:
import evaluate

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

# BLEU
bleu_score = bleu.compute(
    predictions=decoded_preds,
    references=[[label] for label in decoded_labels]
)

# ROUGE
rouge_score = rouge.compute(
    predictions=decoded_preds,
    references=decoded_labels
)

print("\nFinal Evaluation Scores:")
print("BLEU:", bleu_score["score"])
print("ROUGE-1:", rouge_score["rouge1"])
print("ROUGE-2:", rouge_score["rouge2"])
print("ROUGE-L:", rouge_score["rougeL"])


Final Evaluation Scores:
BLEU: 28.537465661787206
ROUGE-1: 0.5756095721639727
ROUGE-2: 0.3556055244844813
ROUGE-L: 0.5266694801795305
