In [None]:
pip install evaluate



In [None]:
pip install rouge_score



In [None]:
import os
import re
import html
from typing import Dict, List
from dataclasses import dataclass
import numpy as np
import evaluate
from datasets import load_dataset, DatasetDict
import time
import matplotlib.pyplot as plt
import torch
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [None]:
# Checkpoint to fine-tune; the T5_MODEL_NAME environment variable overrides the default.
model_name = os.environ.get("T5_MODEL_NAME", "t5-base")
# Configuration name passed to load_dataset together with "cnn_dailymail".
dataset_config = "3.0.0"

In [None]:
# Sequence lengths (in tokens)
# Articles are truncated to this length when tokenized.
max_source_length = 512
# Highlights are truncated to this length when tokenized into labels.
max_target_length = 128
# Generation length limit used for evaluation and prediction.
val_max_target_length = 128

In [None]:
# Batch / optimization
# These values are forwarded unchanged to Seq2SeqTrainingArguments further down.
per_device_train_batch_size = 4
per_device_eval_batch_size = 2
# Passed as gradient_accumulation_steps: with a train batch of 4 this gives
# 4 * 4 = 16 examples per device per optimizer step.
grad_accum_steps = 4
num_train_epochs = 3
learning_rate = 1e-5
weight_decay = 0.01
warmup_ratio = 0.03
# Passed as lr_scheduler_type.
lr_scheduler = "cosine"

In [None]:
# Mixed precision
# "fp16" or "bf16" switches on the matching training-argument flag;
# any other value leaves both flags False.
mixed_precision = "fp16"

In [None]:
# Preprocessing knobs (all counted in characters, not tokens)
MIN_SRC_CHARS = 120    # shortest article accepted by the length filter
MIN_TGT_CHARS = 10     # shortest highlights text accepted by the length filter
MAX_SRC_CHARS = 5000   # articles are sliced to this many characters

In [None]:
# Forwarded as num_proc to every dataset map()/filter() call in this notebook.
NUM_PROC = None
# Beam count used when generating summaries for evaluation and prediction.
gen_num_beams = 4

In [None]:
# Process-wide settings read by the tokenizers library and the PyTorch CUDA allocator.
# NOTE(review): these are set after torch has been imported; confirm the allocator
# setting is still picked up in the target environment.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Boilerplate stripped from article and highlight text.
# Every pattern is anchored with ^ and compiled with re.MULTILINE, so it can
# match at the start of any line; re.IGNORECASE makes the match case-insensitive.
BOILERPLATE_PATTERNS = [
    r"^Editor.?s Note:.*$",   # editor's note line (any single apostrophe character, or none)
    r"^READ:\s.*$",           # "READ: ..." teaser line
    r"^WATCH:\s.*$",          # "WATCH: ..." teaser line
    # "(CNN)" dateline followed by any run of dashes. The dateline is commonly
    # written "(CNN) -- "; matching a single optional dash left "- " behind.
    r"^\(CNN\)\s*[-–—]*\s*",
]
BP_RE = [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in BOILERPLATE_PATTERNS]
# Any http:// or https:// URL up to the next whitespace character.
URL_RE = re.compile(r"https?://\S+")

In [None]:
def clean_text(t: str) -> str:
    """Normalize one raw text string.

    Steps, in order: decode HTML entities, remove URLs, remove every
    boilerplate match, turn non-breaking spaces into plain spaces, then
    collapse each whitespace run to a single space and trim both ends.
    """
    cleaned = URL_RE.sub("", html.unescape(t))
    for boilerplate_re in BP_RE:
        cleaned = boilerplate_re.sub("", cleaned)
    cleaned = cleaned.replace("\u00A0", " ")
    return re.sub(r"\s+", " ", cleaned).strip()

In [None]:
def cleaner_batch(batch):
    """Run clean_text over the "article" and "highlights" lists of a batch.

    Returns a dict with those two keys, each mapped to the cleaned strings
    in their original order.
    """
    return {
        column: [clean_text(text) for text in batch[column]]
        for column in ("article", "highlights")
    }

In [None]:
# Load the dataset under the configured version; `raw` is indexed by split
# name ("train", "validation", "test") in the cells that follow.
print("Loading CNN/DailyMail...")
raw = load_dataset("cnn_dailymail", dataset_config)

Loading CNN/DailyMail...


In [None]:
# --- Define desired sizes ---
train_size = 15000
validation_size = 3000
test_size = 2000

# --- Select subsets ---
# Each entry: (split name in `raw`, word used in the progress message, row count).
# The first `row count` rows of each split are taken, in dataset order.
_subset_plan = [
    ("train", "training", train_size),
    ("validation", "validation", validation_size),
    ("test", "testing", test_size),
]
_selected = {}
for _split, _label, _count in _subset_plan:
    print(f"Selecting {_count} {_label} examples...")
    _selected[_split] = raw[_split].select(range(_count))

train_subset = _selected["train"]
validation_subset = _selected["validation"]
test_subset = _selected["test"]

# --- Create a new DatasetDict with the subsets ---
raw_subset = DatasetDict(_selected)

Selecting 2500 training examples...
Selecting 500 validation examples...
Selecting 500 testing examples...


In [None]:
# --- Verify the sizes ---
# Printing the DatasetDict lists the features and row count of every split.
print("\nNew subset sizes:")
print(raw_subset)


New subset sizes:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 500
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 500
    })
})


In [None]:
def len_filter(ex):
    """Return True only when both texts reach their minimum character counts.

    The article is checked against MIN_SRC_CHARS and the highlights against
    MIN_TGT_CHARS.
    """
    if len(ex["article"]) < MIN_SRC_CHARS:
        return False
    return len(ex["highlights"]) >= MIN_TGT_CHARS

In [None]:
# Drop examples whose article or highlights fall below the character minimums.
# This runs on the text as loaded, before any clipping or cleaning.
raw_subset = raw_subset.filter(len_filter, num_proc=NUM_PROC)

In [None]:
def preclip_batch(batch):
    """Cut every article in the batch down to its first MAX_SRC_CHARS characters.

    The highlights list is returned as-is.
    """
    clipped = []
    for article in batch["article"]:
        clipped.append(article[:MAX_SRC_CHARS])
    return {"article": clipped, "highlights": batch["highlights"]}

In [None]:
# Clip first, then clean: the character cap is applied to the uncleaned article,
# so a cleaned article can end up shorter than MAX_SRC_CHARS.
raw_subset = raw_subset.map(preclip_batch, batched=True, num_proc=NUM_PROC, desc="Pre-clipping articles")
raw_subset = raw_subset.map(cleaner_batch, batched=True, num_proc=NUM_PROC, desc="Cleaning text")

In [None]:
def non_empty(ex):
    """Return True when both the article and the highlights have at least one character."""
    return all(len(ex[field]) > 0 for field in ("article", "highlights"))

# Cleaning can reduce a text to an empty string, so drop any example left empty.
raw_subset = raw_subset.filter(non_empty, num_proc=NUM_PROC)

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint name.
print(f"Loading model/tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Task prefix prepended to every article before tokenization.
prefix = "summarize: "

Loading model/tokenizer: t5-base


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and highlights for seq2seq training.

    Args:
        examples: batched mapping with "article" and "highlights" lists of
            strings.

    Returns:
        The tokenizer output for the prefixed articles, truncated to
        max_source_length tokens, with an added "labels" entry holding the
        token ids of the highlights, truncated to max_target_length tokens.

    Sequences are deliberately left unpadded. The DataCollatorForSeq2Seq used
    by the trainer pads each batch only to its own longest sequence and fills
    label padding with -100, so the loss ignores it. Padding everything to the
    maximum length here would make every batch pay for full-length sequences.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True,
    )
    # Target tokenization (text_target API)
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )
    # No pad tokens are produced here, so there is nothing to mask by hand.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
print("Tokenizing...")
# Tokenize every split in batches. The original columns are removed so that
# only the fields returned by preprocess_function remain.
tokenized = raw_subset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_subset["train"].column_names,
    num_proc=NUM_PROC,
    desc="Tokenizing dataset",
)

Tokenizing...


Tokenizing dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Batch collator handed to the trainer below.
# pad_to_multiple_of=8 asks for padded lengths that are divisible by 8.
# NOTE(review): passing the model presumably lets the collator derive
# decoder inputs from the labels; confirm against the installed version.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

In [None]:
import evaluate

# Metric objects used by compute_metrics below.
print("\nLoading ROUGE metric...")
rouge = evaluate.load("rouge")

print("Loading BLEU metric...")
bleu = evaluate.load("bleu")   # or "sacrebleu" if you prefer

def compute_metrics(eval_preds):
    """Score generated summaries against the references with ROUGE and BLEU.

    Args:
        eval_preds: a (predictions, labels) pair of token-id arrays. If the
            predictions arrive as a tuple, its first element is used.

    Returns:
        Dict of plain floats keyed "eval_<name>": one entry per ROUGE score
        plus "eval_bleu", each scaled to 0-100 and rounded to two decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a real token id: swap it for the pad id
    # so the tokenizer can decode both arrays.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        p.strip() for p in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # ROUGE-Lsum expects one sentence per line. The split text is kept in
    # separate variables so BLEU is scored on the plain decoded strings,
    # the same way test_compute does it.
    rouge_preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
    rouge_refs = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_labels]

    # ---- ROUGE ----
    rouge_scores = rouge.compute(
        predictions=rouge_preds,
        references=rouge_refs,
        use_stemmer=True,
    )
    # float() turns numpy scalars into plain Python floats for clean logging.
    result = {k: round(float(v) * 100, 2) for k, v in rouge_scores.items()}

    # ---- BLEU ----
    # evaluate's "bleu" expects references as List[List[str]] (list of refs per example)
    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )
    # The "bleu" value is a fraction in [0, 1]; scale it to match ROUGE.
    result["bleu"] = round(float(bleu_result["bleu"]) * 100, 2)

    # Keys carry an explicit "eval_" prefix.
    return {f"eval_{k}": v for k, v in result.items()}



Loading ROUGE metric...
Loading BLEU metric...


In [None]:
# All run artifacts go under a directory named after the checkpoint
# (any "/" in the model name is replaced with "-").
output_dir = "t5-cnn-dm-" + model_name.replace("/", "-")

# At most one of these is True, depending on the mixed_precision setting.
use_fp16 = mixed_precision == "fp16"
use_bf16 = mixed_precision == "bf16"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,

    # Batching and memory
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    gradient_checkpointing=True,

    # Optimization schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler,

    # Logging and checkpoints (no evaluation strategy is set here)
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,   # remove if the installed version rejects it
    report_to=["none"],

    # Precision
    fp16=use_fp16,
    bf16=use_bf16,

    # Generation settings used by evaluate() / predict()
    predict_with_generate=True,
    generation_max_length=val_max_target_length,
    generation_num_beams=gen_num_beams,
)

In [None]:
# Wire the model, arguments, tokenized splits, collator and metric function together.
# NOTE(review): training_args sets no evaluation strategy, so the validation
# split is presumably only scored by the explicit trainer.evaluate() call later.
# NOTE(review): this cell emits a warning pointing at the constructor call;
# check whether `tokenizer=` is deprecated in the installed transformers version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning; the returned object's .metrics are logged and saved in the next cell.
print("\nStarting training...")
train_result = trainer.train()


Starting training...


Step,Training Loss
200,1.7447
400,1.6237


In [None]:
# Print the training metrics, then persist them and the trainer state.
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state() # Saves log history in trainer_state.json

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  4142930GF
  train_loss               =     1.6797
  train_runtime            = 0:14:39.63
  train_samples_per_second =      8.305
  train_steps_per_second   =      0.522


In [None]:
# Score the fine-tuned model on the validation split attached to the trainer.
print("\nEvaluating final model...")
metrics = trainer.evaluate(
    max_length=val_max_target_length, # Use consistent generation settings
    num_beams=gen_num_beams
)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

print("Final eval metrics:", metrics)


Evaluating final model...


***** eval metrics *****
  epoch                   =        3.0
  eval_bleu               =       10.8
  eval_loss               =     1.8761
  eval_rouge1             =      34.16
  eval_rouge2             =      14.73
  eval_rougeL             =      25.01
  eval_rougeLsum          =      31.13
  eval_runtime            = 0:09:36.88
  eval_samples_per_second =      0.867
  eval_steps_per_second   =      0.433
Final eval metrics: {'eval_rouge1': 34.16, 'eval_rouge2': 14.73, 'eval_rougeL': 25.01, 'eval_rougeLsum': 31.13, 'eval_bleu': 10.8, 'eval_loss': 1.8760634660720825, 'eval_runtime': 576.8858, 'eval_samples_per_second': 0.867, 'eval_steps_per_second': 0.433, 'epoch': 3.0}


In [None]:
print("\nSaving final model...")
# NOTE(review): save_model() is called without a path, presumably writing to
# training_args.output_dir; the tokenizer is saved explicitly to output_dir.
trainer.save_model()
tokenizer.save_pretrained(output_dir)

# Repeats the validation metrics computed in the evaluation cell.
print("\nFinal eval metrics:", metrics)


Saving final model...

Final eval metrics: {'eval_rouge1': 34.16, 'eval_rouge2': 14.73, 'eval_rougeL': 25.01, 'eval_rougeLsum': 31.13, 'eval_bleu': 10.8, 'eval_loss': 1.8760634660720825, 'eval_runtime': 576.8858, 'eval_samples_per_second': 0.867, 'eval_steps_per_second': 0.433, 'epoch': 3.0}


In [None]:
# NOTE(review): rouge and bleu were already loaded earlier in the notebook;
# this cell rebinds the same two names to freshly loaded metric objects.
print("\nLoading ROUGE and BLEU metrics for TEST evaluation...")
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")


def test_compute(preds_ids, label_ids, tokenizer):
    """Decode predictions and references, then score them with ROUGE and BLEU.

    Args:
        preds_ids: array of generated token ids. If a tuple is passed, its
            first element is used.
        label_ids: array of reference token ids, where -100 marks masked
            positions.
        tokenizer: tokenizer used for decoding; its pad_token_id replaces -100.

    Returns:
        Dict with keys "rouge1", "rouge2", "rougeL", "rougeLsum" and "bleu",
        each a plain Python float scaled to 0-100 and rounded to two decimals.
    """
    if isinstance(preds_ids, tuple):
        preds_ids = preds_ids[0]

    # Replace -100 with pad token for decoding
    pad_id = tokenizer.pad_token_id
    preds_ids = np.where(preds_ids != -100, preds_ids, pad_id)
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    decoded_preds = [
        p.strip() for p in tokenizer.batch_decode(preds_ids, skip_special_tokens=True)
    ]
    decoded_labels = [
        l.strip() for l in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    ]

    # -------- ROUGE (aggregate) --------
    # One sentence per line for ROUGE-Lsum; empty strings are passed through.
    preds_for_rouge = [
        "\n".join(nltk.sent_tokenize(p)) if p else "" for p in decoded_preds
    ]
    refs_for_rouge = [
        "\n".join(nltk.sent_tokenize(r)) if r else "" for r in decoded_labels
    ]

    rouge_result = rouge.compute(
        predictions=preds_for_rouge,
        references=refs_for_rouge,
        use_stemmer=True,
    )

    # -------- BLEU (aggregate) --------
    # Scored on the plain decoded text, one reference per prediction.
    bleu_result = bleu.compute(
        predictions=decoded_preds,
        references=[[r] for r in decoded_labels],
    )

    def _as_percent(value):
        # float() turns numpy scalars into plain floats, so the printed dict
        # shows 33.33 instead of np.float64(33.33).
        return round(float(value) * 100, 2)

    metrics = {
        "rouge1": _as_percent(rouge_result["rouge1"]),
        "rouge2": _as_percent(rouge_result["rouge2"]),
        "rougeL": _as_percent(rouge_result["rougeL"]),
        "rougeLsum": _as_percent(rouge_result.get("rougeLsum", 0.0)),
        "bleu": _as_percent(bleu_result["bleu"]),
    }
    return metrics


Loading ROUGE and BLEU metrics for TEST evaluation...


In [None]:
print("\nGenerating predictions on TEST set...")

# Generate summaries for the test split with the same length and beam
# settings used for validation.
test_output = trainer.predict(
    test_dataset=tokenized["test"],
    max_length=val_max_target_length,
    num_beams=gen_num_beams,
)

# Decode and score the generated ids against the test labels.
test_metrics = test_compute(
    preds_ids=test_output.predictions,
    label_ids=test_output.label_ids,
    tokenizer=tokenizer,
)

print(test_metrics)


Generating predictions on TEST set...


{'rouge1': np.float64(33.33), 'rouge2': np.float64(13.63), 'rougeL': np.float64(24.37), 'rougeLsum': np.float64(30.42), 'bleu': 10.08}
