Install and import the right libraries

In [None]:
# Install necessary libraries (run this in a cell)
!pip install -q transformers datasets sentencepiece rouge_score evaluate accelerate torch

# Import modules
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,    # tokenizer specilized for working with T5 transformer
    T5ForConditionalGeneration, # conditional generation acts as a language modelling head because T5 outputs vectors/numbers and this converts it back to words
    Seq2SeqTrainingArguments, # we are not using normal training arguments because it only caluclates values(errors) but
    # seq2seqta has special feature i.e. predict_with_generate means whenever we evaluate we not just calculate error but
    # generate summary also. Because of this feature we are able to calculate rogue score
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
    # to achieve dynamic padding and to not use padding values during calcuations
)
import evaluate
from datasets import Dataset, DatasetDict
# DatasetDict is kind of wrapper which binds train, test and validate data, so that we can run operations on all of them together
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load and Prepare the Dataset
# 1. Load the dataset
df = pd.read_csv('/content/news_summary.csv', encoding='latin-1')

# 2. Select and rename columns (ctext=Article, text=Summary)
df = df[['ctext', 'text']]
df.columns = ['article', 'summary']

# 3. Clean data: Drop duplicates and missing values
df = df.drop_duplicates()
df = df.dropna()

# 4. Prefix the input with "summarize: " (T5 requirement otherwiese T5 won't be able to know whether to translate or summarize or anything else)
df['article'] = 'summarize: ' + df['article']

# 5. Split into Train and Test (90% train, 10% test)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# 6. Convert to Hugging Face Dataset format becuase it occupies lesser memory and also compatible with trainer
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['article', 'summary', '__index_level_0__'],
        num_rows: 3956
    })
    test: Dataset({
        features: ['article', 'summary', '__index_level_0__'],
        num_rows: 440
    })
})


In [None]:
# Tokenization
# 1. Load Tokenizer
model_checkpoint = "t5-base"  # Using t5-small for speed; use 't5-base' if you have good GPU/RAM
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# 2. Define Tokenization Function
def preprocess_function(examples):
    # Tokenize inputs (Articles)
    model_inputs = tokenizer(
        examples["article"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets (Summaries)
    labels = tokenizer(
        examples["summary"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]   # making a new column to store input_ids of labels
    return model_inputs

# 3. Apply Tokenization to dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)  # converting dataset in numbers/vectors

# 4. Remove text columns to format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(dataset["train"].column_names)  # removing text columns
tokenized_datasets.set_format("torch")

print("Tokenization complete. Keys available:", tokenized_datasets["train"].features.keys())

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Map:   0%|          | 0/3956 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Tokenization complete. Keys available: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
import numpy as np

# 1. Load Pre-trained Model
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# 2. Data Collator (Handles dynamic padding for batches)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 3. Load ROUGE Metric
rouge = evaluate.load("rouge")

# 4. Define Metric Computation Function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels (ignored indices) with pad token for decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Add mean generation length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# 1. Define Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",             # Evaluate validation set every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,     # Reduce to 4 if you run out of memory
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,                # Number of passes through the data
    predict_with_generate=True,        # Required for ROUGE score generation during eval
    fp16=torch.cuda.is_available(),    # Use mixed precision if GPU is available (faster)
)

# 2. Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 3. Start Training
trainer.train()

  trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.299255,0.0488,0.0298,0.042,0.0421,3.5409
2,1.976200,1.262988,0.226,0.1277,0.1913,0.1916,17.45
3,1.420900,1.256912,0.2343,0.1317,0.1983,0.199,18.2273


TrainOutput(global_step=1485, training_loss=1.5984568586253156, metrics={'train_runtime': 493.4187, 'train_samples_per_second': 24.053, 'train_steps_per_second': 3.01, 'total_flos': 1606236499869696.0, 'train_loss': 1.5984568586253156, 'epoch': 3.0})

In [None]:
# 1. Grab a sample article (replace this string with any text you want to summarize)
sample_text = dataset['test'][0]['article']
print(f"Original Article (Snippet): {sample_text[:200]}...")

# 2. Prepare the input
# We must prefix with "summarize: " if it wasn't already there, but our dataset has it.
inputs = tokenizer(sample_text, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)

# 3. Generate the summary
outputs = model.generate(
    inputs,
    max_length=128,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# 4. Decode and print result
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nGenerated Summary: {summary}")

# 5. Save the model
model.save_pretrained("./my_summarization_model")
tokenizer.save_pretrained("./my_summarization_model")
print("\nModel saved to ./my_summarization_model")

Original Article (Snippet): summarize: Washington, Mar 27 (PTI) A 38-year-old woman in the US, who was apprehended twice for allegedly trying to jump the White House fence last week, has been arrested for scaling a fence at the ...

Generated Summary: Marci Anderson Wahl of Everett, Washington, was arrested after an alarm sounded at about 2:15 am yesterday when she scaled a fence at the Treasury Building, next to the White House. Wahl was charged with unlawful entry and contempt of court.

Model saved to ./my_summarization_model
