In [None]:
!pip install evaluate

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    PegasusTokenizer,
    PegasusForConditionalGeneration,
    Trainer,
    TrainingArguments
)
import evaluate


In [None]:
# Load configuration "3.0.0" of the CNN/DailyMail dataset (all available splits).
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded below.
# NOTE(review): this is the arXiv Pegasus checkpoint while the data loaded in
# this notebook is CNN/DailyMail — confirm the pairing is intentional.
model_name = "google/pegasus-arxiv"

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
# Enable gradient checkpointing on the loaded model before training
# (presumably to reduce activation memory — confirm against the GPU budget).
model.gradient_checkpointing_enable()

In [None]:
def preprocess(batch, max_input_length=384, max_target_length=96):
    """Tokenize a batch of articles and their reference summaries.

    Both the articles and the highlights are truncated and padded to a fixed
    length, so every example produced has identical tensor shapes.

    Args:
        batch: Mapping with an "article" and a "highlights" entry, each a list
            of strings (the layout `Dataset.map(..., batched=True)` provides).
        max_input_length: Token budget for each article. Defaults to 384, the
            value previously hard-coded (original note: mandatory for a
            5000-example run; presumably a memory limit — confirm).
        max_target_length: Token budget for each summary. Defaults to 96, the
            value previously hard-coded (same original note).

    Returns:
        The tokenizer output for the articles with an extra "labels" entry
        holding the token ids of the summaries. Padding positions in "labels"
        keep the tokenizer's pad id; they are not replaced by -100 here, so
        the stored labels remain directly decodable.
    """
    model_inputs = tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )

    summaries = tokenizer(
        batch["highlights"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )

    model_inputs["labels"] = summaries["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split in batches. The column names of the train split are
# removed from the result, leaving only what `preprocess` returns.
# NOTE(review): all splits are tokenized in full although only fixed-size
# subsets are selected from them afterwards.
tokenized_datasets = dataset.map(
    function=preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


In [None]:
# Training subset: 7000 examples drawn after a seeded shuffle of the train split.
train_subset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(7000))
)

# Validation / test subsets: the first 1400 examples of each split, unshuffled.
val_subset = tokenized_datasets["validation"].select(range(1400))
test_subset = tokenized_datasets["test"].select(range(1400))


In [None]:
# Training configuration.
#
# Mixed precision: the Transformers documentation for Pegasus states that FP16
# is not supported, and fp16 autocast is a known source of NaN/inf losses with
# these checkpoints. bf16 is therefore used when the GPU has native support
# (compute capability >= 8.0); otherwise training runs in full precision.
#
# `eval_steps` is left unset, so with eval_strategy="steps" the evaluation
# interval follows the library default.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,   # reduced
    fp16=False,
    bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
    learning_rate=2e-5,
    num_train_epochs=5,              # sufficient
    logging_steps=100,
    save_steps=500,
    save_total_limit=5,
    report_to="none"
)

In [None]:
!pip install rouge_score
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores from predicted and reference token ids.

    Args:
        eval_pred: A `(predictions, labels)` pair. Both are expected to be
            sequences of token-id sequences; `predictions` may also be a tuple
            whose first element holds the token ids. Raw logits are not
            handled.

    Returns:
        Dict with the "rouge1", "rouge2", "rougeL" and "rougeLsum" scores.
    """
    predictions, labels = eval_pred

    # Some trainers return a tuple of outputs; the ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    def _decodable(sequences):
        # Negative ids (the -100 ignore index used for masked/padded
        # positions) are not valid vocabulary entries and make the tokenizer
        # raise, so drop them before decoding.
        return [[int(tok) for tok in seq if tok >= 0] for seq in sequences]

    decoded_preds = tokenizer.batch_decode(
        _decodable(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _decodable(labels), skip_special_tokens=True
    )

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
        "rougeLsum": result["rougeLsum"]
    }

In [None]:
def mask_padded_labels(features):
    """Collate fixed-length features and hide label padding from the loss.

    The tokenized examples carry the tokenizer's pad id in the padded
    positions of "labels". Left as-is, those positions are scored by the
    cross-entropy loss, so the model is trained (and evaluated) on predicting
    padding. Replacing them with -100 — the index the loss ignores — restricts
    the loss to real summary tokens.

    Args:
        features: List of examples, each a dict of equally sized lists of
            token ids that includes a "labels" entry.

    Returns:
        Dict of stacked tensors, with label padding replaced by -100.
    """
    batch = {
        key: torch.tensor([feature[key] for feature in features])
        for key in features[0]
    }
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    return batch


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    tokenizer=tokenizer,
    data_collator=mask_padded_labels,
)

# No metric function is passed to the Trainer, so the periodic evaluation on
# `val_subset` does not run generation or ROUGE; summaries are generated and
# scored separately once training has finished.

trainer.train()


In [None]:
import torch

def evaluate_rouge_manually(dataset, model, tokenizer, max_samples=200,
                            max_length=64, num_beams=4):
    """Generate summaries one example at a time and collect decoded texts.

    Args:
        dataset: Indexable collection of examples, each providing
            "input_ids", "attention_mask" and "labels" as lists of token ids.
        model: Model exposing `eval()`, `device` and `generate(...)`.
        tokenizer: Tokenizer exposing `decode(ids, skip_special_tokens=...)`.
        max_samples: Upper bound on the number of examples processed. It is
            capped at `len(dataset)`; `None` processes every example.
        max_length: `max_length` forwarded to `model.generate`.
        num_beams: `num_beams` forwarded to `model.generate`.

    Returns:
        A `(preds, refs)` pair of equally long lists: generated summaries and
        the decoded reference summaries.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    preds, refs = [], []

    # Never index past the end of the dataset.
    available = len(dataset)
    n_samples = available if max_samples is None else min(max_samples, available)

    with torch.no_grad():
        for i in range(n_samples):
            example = dataset[i]

            # Add the batch dimension expected by generate().
            input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(model.device)
            attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(model.device)

            summary_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams
            )

            preds.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

            # Negative ids (the -100 ignore index) are not decodable; drop them.
            label_ids = [tok for tok in example["labels"] if tok >= 0]
            refs.append(tokenizer.decode(label_ids, skip_special_tokens=True))

            # Release per-example tensors before the next generation call.
            del input_ids, attention_mask, summary_ids
            torch.cuda.empty_cache()

    return preds, refs


In [None]:
# Generate summaries for up to 200 examples of the test subset and collect the
# decoded references alongside them.
predictions, references = evaluate_rouge_manually(
    test_subset, model, tokenizer, max_samples=200
)

# Score the generated summaries against the references with ROUGE.
results = rouge.compute(predictions=predictions, references=references)

print(results)
