In [None]:
# (use your conda/venv; Python 3.9–3.11 recommended)
!pip install "transformers>=4.40.0" "datasets>=2.18.0" "evaluate>=0.4.1" "accelerate>=0.27.0" rouge_score sentencepiece




In [None]:
import os
import random
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Union

import evaluate
import datasets
from datasets import load_dataset

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [None]:
# --- Model / output ---------------------------------------------------------
MODEL_NAME = "t5-large"            # "t5-small" / "t5-base" are lighter alternatives if GPU memory is tight
OUTPUT_DIR = "./t5-cnn-dm"

# --- Sequence lengths (in tokens) -------------------------------------------
MAX_SOURCE_LEN = 512               # articles longer than this are truncated
MAX_TARGET_LEN = 128               # training-time summary length cap
VAL_MAX_TARGET_LEN = 128           # generation length cap used during evaluation

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 4                     # per-device batch size
GRAD_ACCUM_STEPS = 4               # effective batch size = BATCH_SIZE * GRAD_ACCUM_STEPS
NUM_EPOCHS = 3
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 500

# --- Logging / checkpoint cadence (in optimiser steps) ----------------------
LOGGING_STEPS = 50
EVAL_STEPS = 1000
SAVE_STEPS = 1000

# --- Generation / reproducibility -------------------------------------------
NUM_BEAMS = 4
SEED = 42

_cuda_available = torch.cuda.is_available()
# Mixed precision is switched on whenever a CUDA device is visible.
# NOTE(review): T5 checkpoints are reported to be numerically unstable in fp16 — confirm
# the training loss stays finite, or consider bf16 on hardware that supports it.
FP16 = _cuda_available

# Seed every RNG the notebook touches: Python, NumPy, torch (CPU) and, if present, all GPUs.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if _cuda_available:
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Load CNN/DailyMail (config "3.0.0"); later cells index the result by "train",
# "validation" and "test" and read the "article" / "highlights" columns.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
# T5 expects a task prefix: it is prepended to every article before tokenization.
TASK_PREFIX = "summarize: "

# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of CNN/DailyMail rows for seq2seq fine-tuning.

    Args:
        examples: a batch (dict of lists) with an "article" column holding the
            source documents and a "highlights" column holding the reference
            summaries.

    Returns:
        The tokenizer output for the prefixed articles (truncated to
        MAX_SOURCE_LEN) with an extra "labels" entry holding the token ids of
        the summaries (truncated to MAX_TARGET_LEN).

    Sequences are deliberately left unpadded: the notebook's
    DataCollatorForSeq2Seq pads each batch to its longest member and fills the
    label padding with -100, so padding everything to the maximum length here
    would only waste memory and compute.
    """
    inputs = [TASK_PREFIX + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LEN,
        truncation=True,
    )

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches; the raw text columns are dropped so that only
# the fields returned by preprocess_function remain.
raw_columns = dataset["train"].column_names
tokenized = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=raw_columns,
    desc="Tokenizing",
)


Tokenizing:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [None]:
# ROUGE scorer used by compute_metrics.
metric = evaluate.load("rouge")

# Batch collator for the trainer; label padding uses -100 (the library default,
# spelled out here), the same value compute_metrics maps back to the pad token.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
)

In [None]:
def postprocess_text(preds, labels):
    """Strip leading/trailing whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A `(preds, labels)` tuple of two new lists of stripped strings.
    """
    stripped_preds, stripped_labels = (
        [text.strip() for text in batch] for batch in (preds, labels)
    )
    return stripped_preds, stripped_labels
def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and mean generated length.

    Args:
        eval_pred: a `(predictions, label_ids)` pair as passed by the trainer.
            Both are arrays of token ids in which -100 marks padding.

    Returns:
        dict with every score returned by the ROUGE metric, multiplied by 100
        and rounded to two decimals, plus "gen_len": the mean number of
        non-padding tokens per generated summary.
    """
    preds, labels = eval_pred
    # Some models hand back a tuple (sequences, extras); only the token ids are needed.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id. Generated batches of different lengths are
    # padded with -100 when concatenated, and decoding such ids fails with
    # "OverflowError: out of range integral type conversion attempted".
    # Map -100 back to the pad token in BOTH predictions and labels before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    # R1/R2/RL as percentages
    result = {k: round(v * 100, 2) for k, v in result.items()}
    # Length of generated summaries (for info); computed after the -100 replacement
    # so that padding is never counted as generated tokens.
    prediction_lens = [np.count_nonzero(np.array(p) != pad_id) for p in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
# All tunables come from the configuration cell; keywords are grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # -- run control --
    output_dir=OUTPUT_DIR,
    do_train=True,
    do_eval=True,
    seed=SEED,
    fp16=FP16,
    report_to="none",  # set to "tensorboard" if you want TB logs
    # -- optimisation --
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    # -- logging / evaluation / checkpoint cadence --
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    # -- best-checkpoint selection (higher ROUGE-L wins) --
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    # -- evaluation runs generate() so ROUGE is scored on generated text --
    predict_with_generate=True,
    generation_max_length=VAL_MAX_TARGET_LEN,
    generation_num_beams=NUM_BEAMS,
)

In [None]:
# Assemble the trainer from the pieces built in the previous cells.
# NOTE(review): newer transformers releases appear to warn that `tokenizer=` is
# deprecated in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Fine-tune; evaluation and checkpointing follow the cadence set in training_args.
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


OverflowError: out of range integral type conversion attempted

In [None]:
# Qualitative check: summarise the first three test articles with beam search.
# Reuse the already-loaded dataset instead of resolving it a second time.
sample = dataset["test"].select(range(3))
references = sample["highlights"]
inputs = [TASK_PREFIX + article for article in sample["article"]]
enc = tokenizer(
    inputs,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_SOURCE_LEN,
).to(model.device)

# generate() does not switch the module to inference mode by itself; without
# eval() dropout stays active if the model is still in training mode.
model.eval()
with torch.no_grad():
    generated = model.generate(
        **enc,
        max_length=MAX_TARGET_LEN,
        num_beams=NUM_BEAMS,
        length_penalty=1.0,
        early_stopping=True,
    )
summaries = tokenizer.batch_decode(generated, skip_special_tokens=True)
for i, (reference, summary) in enumerate(zip(references, summaries)):
    print(f"\n--- EXAMPLE {i+1} ---")
    # Only show an ellipsis when the reference was actually cut at 400 characters.
    suffix = " ..." if len(reference) > 400 else ""
    print("Reference:", reference[:400] + suffix)
    print("Pred     :", summary)


--- EXAMPLE 1 ---
Reference: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis . ...
Pred     : The Palestinian Authority officially becomes the 123rd member of the International Criminal Court. The court is based in The Hague, in the Netherlands, where the court is based. Palestinians signed the ICC's founding Rome Statute in January.

--- EXAMPLE 2 ---
Reference: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia . ...
Pred     : A stray pooch in Washington has used up at least three of her own after being hit by a car. Theia is a friendly white-and-black bully breed mix now named Theia.

--- EXAMPLE 3 ---
Reference: Mohammad Javad Zarif has spe

In [None]:
# Interactive Hugging Face Hub login: prompts for an access token (input is hidden).
# Run this before the push_to_hub cell; never paste the token into the notebook itself.
!hf auth login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `Hftoken` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authentica

In [None]:
# Publish the model weights and the tokenizer files to the same Hub repository.
hub_repo_id = "Turvash/t5-cnn-dm-summarizer"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpjbu_i4s4/model.safetensors    :   0%|          |  552kB /  242MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpo4q1m8b3/spiece.model         : 100%|##########|  792kB /  792kB            

CommitInfo(commit_url='https://huggingface.co/Turvash/t5-cnn-dm-summarizer/commit/94aa68b3c47770c9f80da6bd5a1e81e253cbcd59', commit_message='Upload tokenizer', commit_description='', oid='94aa68b3c47770c9f80da6bd5a1e81e253cbcd59', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Turvash/t5-cnn-dm-summarizer', endpoint='https://huggingface.co', repo_type='model', repo_id='Turvash/t5-cnn-dm-summarizer'), pr_revision=None, pr_num=None)

In [None]:
# Final validation pass using the same generation settings as in-training evaluation.
metrics = trainer.evaluate(
    max_length=VAL_MAX_TARGET_LEN,
    num_beams=NUM_BEAMS,
)
print("Validation metrics:", metrics)

# Persist the fine-tuned model (no adapters are used in this notebook) and the
# tokenizer side by side in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Saved to {OUTPUT_DIR}")


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
