In [1]:
# Installs Hugging Face Transformers with SentencePiece (for models like T5),
#  Datasets (for easy dataset access), sacrebleu & rouge_score (evaluation metrics),
# and py7zr (for handling .7z files), all in quiet mode
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [2]:
!pip install evaluate -q

In [3]:
!nvidia-smi


Sat Apr 26 06:21:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
!pip show datasets

Name: datasets
Version: 3.5.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: evaluate


In [5]:
# Import libraries
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from evaluate import load as load_metric  # ✅ Updated version of load_metric
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch

# Download required tokenizer data
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Checks if a CUDA-compatible GPU is available and sets the device accordingly.
# If GPU is available, set device to "cuda", else fallback to "cpu".

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [7]:
# Specify the pre-trained model checkpoint (Pegasus model fine-tuned on CNN/DailyMail dataset for summarization)
model_ckpt = "google/pegasus-cnn_dailymail"

# Load the tokenizer associated with the specified model checkpoint. This tokenizer will convert text into token IDs
# that the model can understand, and will also decode model outputs back into human-readable text.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Load the pre-trained model (Pegasus model for sequence-to-sequence tasks such as summarization)
# .to(device) sends the model to the selected device (GPU if available, otherwise CPU)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load data
dataset_samsung = load_dataset("samsum")
dataset_samsung

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [10]:
split_lengths = [len(dataset_samsung[split]) for split in dataset_samsung]
split_lengths

[14732, 819, 818]

In [11]:
print(f"Features: {dataset_samsung['train'].column_names}")

Features: ['id', 'dialogue', 'summary']


In [12]:
# Printing dialogue at index 1 from the dataset
print("\nDialogue:")
print(dataset_samsung["test"][1]["dialogue"])

print("\nSummary:")
print(dataset_samsung["test"][1]["summary"])



Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [13]:
# Evaluating PEGASUS in samsum
dataset_samsung['test'][0]

{'id': '13862856',
 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye",
 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}

In [14]:
# I checked the model performance, if it is doing good, no need to fine tuning as tunign can be costly
# If the model is not doing good, then only do fine tuning

In [15]:
dialogue = dataset_samsung['test'][0]['dialogue']
dialogue

"Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

In [16]:
# Create a summarization pipeline using the loaded Pegasus model.
# This pipeline handles tokenization, model inference, and decoding in one step,
# allowing you to easily generate summaries from input text.
pipe = pipeline("summarization", model=model_pegasus, tokenizer=tokenizer)

Device set to use cuda:0


In [17]:
pipe_out = pipe(dialogue)
pipe_out

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


[{'summary_text': "Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him ."}]

In [18]:
# For better displaying of summary
print(pipe_out[0]['summary_text'].replace(" .<n>", ".\n"))

Amanda: Ask Larry Amanda: He called her last time we were at the park together.
Hannah: I'd rather you texted him.
Amanda: Just text him .


In [19]:
# Since while training this model, they have not used samsum dataset, so i need to do fine tuning for better summary generation
# I fine tuned using samsum data

In [20]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """
    Splits a large list into smaller batches of a specified size.

    This is useful for processing long lists of text (e.g., articles or documents)
    in manageable chunks, especially when passing data to models that have input length
    or memory constraints (like Pegasus). It also improves performance by enabling
    batch processing instead of looping through items one by one.

    Yields:
        Slices (chunks) of the original list, each of length 'batch_size' (except the last one, which may be smaller).
    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


In [21]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                                batch_size=16, device=device,
                                column_text="article", column_summary="highlights"):
    """
    Calculates the evaluation metric (like ROUGE) on a test dataset.

    Args:
        dataset: The dataset split (e.g., dataset["test"])
        metric: The metric object (like ROUGE)
        model: The summarization model
        tokenizer: Tokenizer corresponding to the model
        batch_size: Number of samples to process at once
        column_text: The name of the column containing the input text
        column_summary: The name of the column containing the reference summaries

    Returns:
        The computed metric scores.
    """
    # model.eval()
    # for batch in generate_batch_sized_chunks(dataset, batch_size=batch_size):
    #     inputs = tokenizer([x[column_text] for x in batch],
    #                        max_length=1024,
    #                        truncation=True,
    #                        padding="max_length",
    #                        return_tensors="pt").to(device)

    #     with torch.no_grad():
    #         summaries = model.generate(input_ids=inputs["input_ids"],
    #                                    attention_mask=inputs["attention_mask"],
    #                                    length_penalty=0.8,
    #                                    num_beams=8,
    #                                    max_length=128)

    #     decoded_preds = tokenizer.batch_decode(summaries, skip_special_tokens=True)
    #     decoded_labels = [x[column_summary] for x in batch]

    #     metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # return metric.compute()
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                            attention_mask=inputs["attention_mask"].to(device),
                            length_penalty=0.8, num_beams=8, max_length=128)
        '''Parameter for length oenalty ensures that the model does not generate sequences that are too long.'''

        # Finally, we decode the generated texts
        # replace the token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the ROUGE scores
    score = metric.compute()
    return score


In [22]:
# Checking accuracy score for this pretrained model
rouge_metric = load_metric("rouge")
score = calculate_metric_on_test_ds(dataset_samsung["test"], rouge_metric, model_pegasus, tokenizer, batch_size=2, column_text="dialogue", column_summary="summary")

100%|██████████| 410/410 [19:44<00:00,  2.89s/it]


In [28]:
# List of ROUGE metric types we want to extract from the evaluation scores
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Create a dictionary of {metric_name: score} using only the needed metrics
rouge_dict = {rn: score[rn] for rn in rouge_names}

# Convert the dictionary into a one-row DataFrame for neat tabular display
# Useful for logging, plotting, or comparing different models
pd.DataFrame(rouge_dict, index=["pegasus"])


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.015421,0.0003,0.015408,0.015412


In [25]:
# Training with custom data

In [26]:
dataset_samsung

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [29]:
# Tokenizing the data for model input

def convert_examples_to_features(example_batch):
    """
    Converts a batch of examples into input features usable by the model.

    This includes:
    - Tokenizing the input text (dialogue) with a max length of 1024 tokens.
    - Tokenizing the target text (summary) separately with a shorter max length.
    - Creating attention masks and labels required for model training/inference.

    Returns a dictionary containing:
    - input_ids: Tokenized input text
    - attention_mask: Indicates which tokens are actual input vs. padding
    - labels: Tokenized target summary (used for training loss or evaluation)
    """

    # Tokenize the input text (dialogue)
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # Tokenize the target text (summary); this tells the tokenizer we’re preparing labels
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)

    # Return all features needed for the model
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

# Apply the tokenization function to the whole dataset using batching for speed
dataset_samsung_pt = dataset_samsung.map(convert_examples_to_features, batched=True)


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [30]:
dataset_samsung_pt["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'input_ids': [12195,
  151,
  125,
  7091,
  3659,
  107,
  842,
  119,
  245,
  181,
  152,
  10508,
  151,
  7435,
  147,
  12195,
  151,
  125,
  131,
  267,
  650,
  119,
  3469,
  29344,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]}

In [31]:
# Import the data collator used for preparing batches of data for sequence-to-sequence models
from transformers import DataCollatorForSeq2Seq

# Create a data collator for the Pegasus model that:
# - Pads inputs and labels to the longest sequence in each batch
# - Ensures label padding is handled correctly for loss computation
# - Works seamlessly with the tokenizer and model
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)


In [36]:
from transformers import TrainingArguments, Trainer

# Define the training configuration using Hugging Face's TrainingArguments
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',              # Directory to save model checkpoints and logs
    num_train_epochs=1,                       # Number of times the model sees the entire training dataset
    warmup_steps=500,                         # Number of steps for learning rate warm-up
    per_device_train_batch_size=1,            # Batch size per GPU/CPU for training
    per_device_eval_batch_size=1,             # Batch size per GPU/CPU for evaluation
    weight_decay=0.01,                        # Weight decay to reduce overfitting
    logging_steps=10,                         # Log training metrics every 10 steps
    gradient_accumulation_steps=16            # Accumulate gradients over 16 steps before performing an update
    # (This helps simulate a larger batch size without needing more memory)
)

# Initialize the Trainer using:
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsung_pt["train"],
    eval_dataset=dataset_samsung_pt["validation"]
)


  trainer = Trainer(


In [37]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mniteshofficial876[0m ([33mniteshofficial876-pulchowk-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.2171
20,2.9936
30,3.1082
40,3.0212
50,2.6753
60,2.6905
70,2.7259
80,2.5269
90,2.3754
100,2.508




Step,Training Loss
10,3.2171
20,2.9936
30,3.1082
40,3.0212
50,2.6753
60,2.6905
70,2.7259
80,2.5269
90,2.3754
100,2.508


TrainOutput(global_step=920, training_loss=1.8249299857927406, metrics={'train_runtime': 2941.1595, 'train_samples_per_second': 5.009, 'train_steps_per_second': 0.313, 'total_flos': 5528248038285312.0, 'train_loss': 1.8249299857927406, 'epoch': 0.9991854466467553})

In [38]:
print("Model trained successfully")

Model trained successfully


In [None]:
# ROUGE-1	Overlap of unigrams (individual words) between the generated and reference summary.
# ROUGE-2	Overlap of bigrams (two-word sequences).
# ROUGE-L	Longest Common Subsequence (LCS) — how long the longest matching word sequence is.
# ROUGE-Lsum	A version of ROUGE-L tailored for summarization across multiple sentences.
# Higher values = better summaries (closer to human-written ones).

In [41]:
score = calculate_metric_on_test_ds(
  dataset_samsung["test"], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = "dialogue", column_summary= "summary"
)

rouge_dict = {rn: score[rn] for rn in rouge_names}

pd.DataFrame(rouge_dict, index = ["pegasus"] )

100%|██████████| 410/410 [12:50<00:00,  1.88s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018536,0.00033,0.018457,0.018493


In [42]:
# Saving the model
model_pegasus.save_pretrained("pegasus-samsum-model")

In [43]:
# saving the tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [49]:
# Saving in drive
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# Create the folder inside your Drive if it doesn't exist
!mkdir -p /content/drive/MyDrive/TextSummarizer

# Move the model folder
!cp -r pegasus-samsum-model /content/drive/MyDrive/TextSummarizer/

# Move the tokenizer folder
!cp -r tokenizer /content/drive/MyDrive/TextSummarizer/


In [None]:
# Inferencing using our trained model


In [52]:
pwd

'/content'

In [53]:
ls

[0m[01;34mdrive[0m/           [01;34mpegasus-samsum-model[0m/  [01;34mtokenizer[0m/
[01;34mpegasus-samsum[0m/  [01;34msample_data[0m/           [01;34mwandb[0m/


[0m[01;34mdrive[0m/           [01;34mpegasus-samsum-model[0m/  [01;34mtokenizer[0m/
[01;34mpegasus-samsum[0m/  [01;34msample_data[0m/           [01;34mwandb[0m/


In [55]:
# Loading tokenizers
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/TextSummarizer/tokenizer")

In [56]:
dataset_samsung = load_dataset("samsum")

In [57]:
sample_text = dataset_samsung["test"][0]["dialogue"]

reference = dataset_samsung["test"][0]["summary"]

In [58]:
sample_text, reference

("Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye",
 "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.")

In [59]:
# Define the generation parameters for the summarization process.
gen_kwargs = {
    "length_penalty": 0.8,  # Controls the length of the generated summary. Lower values favor shorter summaries.
    "num_beams": 8,         # Defines the number of beams used for beam search (a search strategy during generation).
    "max_length": 128       # Maximum length of the generated summary. Limits the length of the output.
}

# Create a summarization pipeline using the fine-tuned Pegasus model.
# This pipeline will automatically handle tokenization, model inference, and decoding.
pipe = pipeline(
    "summarization",       # Task type is summarization.
    model="pegasus-samsum-model",  # The model that was fine-tuned and saved earlier.
    tokenizer=tokenizer    # The tokenizer to convert input text into tokens and decode the output.
)

Device set to use cuda:0


In [61]:
# Dialogue and summary from the dataset

print("Dialogue:")
print(sample_text)


print("\nReference Summary:")
print(reference)

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [62]:
# Model Prediction from our fine tuned model
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)



Model Summary:
Amanda can't find Betty's number. Larry called Betty last time they were at the park together. Hannah wants Amanda to text Larry. Amanda will text Larry.
