In [None]:
# Cell 1 - Imports
import numpy as np
import nltk
import torch
from datasets import load_dataset
from transformers import (
    PegasusTokenizer,
    PegasusForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from evaluate import load

# Fetch NLTK's "punkt" sentence-tokenizer data (network access may be required).
# NOTE(review): none of the cells shown here call an NLTK sentence splitter, so
# this download looks unused in this notebook - confirm before removing.
nltk.download("punkt")


In [None]:
# Cell 2 - Load CNN/DailyMail dataset (FULL, not subset)
# CNN/DailyMail does not publish a "default" configuration; its configs are the
# version strings "1.0.0", "2.0.0" and "3.0.0". Asking for "default" fails before
# any data is loaded, so request the 3.0.0 release explicitly.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Show the split layout and the number of examples in each split.
print(dataset)
print("Train size:", len(dataset["train"]))
print("Validation size:", len(dataset["validation"]))
print("Test size:", len(dataset["test"]))


In [None]:
# Cell 3 - Load Pegasus tokenizer & model
# Checkpoint identifier shared by the tokenizer and the model so both are
# loaded from the same pretrained weights/vocabulary.
model_name = "google/pegasus-large"

tokenizer = PegasusTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = PegasusForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_name,
)


In [None]:
# Cell 4 - Preprocessing
# Token budgets used when tokenizing: articles (model input) are truncated to
# the first value, highlights (labels) to the second.
max_input_length, max_target_length = 512, 128

def preprocess_function(examples):
    """Tokenize a batch of articles and their highlight summaries.

    Args:
        examples: Batch mapping with an ``"article"`` entry (source texts) and a
            ``"highlights"`` entry (reference summaries).

    Returns:
        The tokenizer output for the articles, truncated to ``max_input_length``
        tokens, with an added ``"labels"`` entry holding the token ids of the
        highlights truncated to ``max_target_length`` tokens.
    """
    inputs = examples["article"]
    targets = examples["highlights"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the summaries as targets through the `text_target` argument
    # instead of the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the preprocessing over every split in batches, dropping the raw
# text/id columns so only the tokenizer outputs and labels remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=["article", "highlights", "id"],
    batched=True,
)


In [None]:
# Cell 5 - Training setup
batch_size = 4  # per-device batch size; raise it if GPU memory allows

# Pegasus is documented as not supporting FP16, so mixed precision is only
# enabled through bfloat16, and only when the current GPU reports support for
# it. Otherwise training stays in full precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

args = Seq2SeqTrainingArguments(
    output_dir="./pegasus_cnn",
    evaluation_strategy="epoch",      # evaluate once at the end of every epoch
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,               # keep at most two checkpoints on disk
    num_train_epochs=1,               # change to more epochs if you have compute
    predict_with_generate=True,       # evaluation metrics need generated text
    fp16=False,                       # FP16 is not supported by Pegasus
    bf16=use_bf16,
    logging_dir="./logs",
    logging_steps=500,
    report_to="none"
)


In [None]:
# Cell 6 - Metrics
# Load both evaluation metrics up front (ROUGE first, then BERTScore) so that
# compute_metrics can reuse the same metric objects on every call.
rouge, bertscore = (load(metric_id) for metric_id in ("rouge", "bertscore"))

def compute_metrics(eval_pred):
    """Score generated summaries against reference summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is taken as the generated token ids.

    Returns:
        dict with ``rouge1``, ``rouge2`` and ``rougeL`` as returned by
        ``rouge.compute`` and ``bertscore_f1``, the mean of the per-example
        BERTScore F1 values.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded by the tokenizer.
    # It can appear in the predictions as well as in the labels (e.g. when
    # batches of different lengths are concatenated), so mask it in both.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # BERTScore
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    # BERTScore yields one F1 per example; report the mean as a plain float.
    result = {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bertscore_f1": float(np.mean(bertscore_result["f1"]))
    }
    return result


In [None]:
# Cell 7 - Trainer
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire the model, training arguments, data splits and metric function together.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],        # full train set
    eval_dataset=tokenized_datasets["validation"],    # full val set
)


In [None]:
# Cell 8 - Training
# Run fine-tuning with the trainer configured above; as the last expression in
# the cell, the call's return value is what the notebook displays.
trainer.train()


In [None]:
# Cell 9 - Evaluate on FULL Test set
# Score the trained model on the held-out test split with the trainer's
# compute_metrics function, then print the resulting metrics.
test_results = trainer.evaluate(
    eval_dataset=tokenized_datasets["test"],
)
print(test_results)