<a href="https://colab.research.google.com/github/Umerfarooq122/Text-Summarization-by-Fine-tuning-a-pre-trained-transformer-PEGASUS-from-hugging-face-library/blob/main/PEGASUS_on_Samsum%2C_PubMed_and_Arxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Text Summarization with PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive Summarization) from Hugging Face**

#### **Environment Setup:**

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_metric py7zr -q
!pip3 install -q -U bitsandbytes
!pip3 install -q -U peft
!pip3 install -q -U trl
!pip3 install -q -U accelerate

In [None]:
!pip install datasets --upgrade
!pip install evaluate --upgrade
!pip install rouge_score --upgrade
!pip install transformers --upgrade



In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, BigBirdPegasusForConditionalGeneration, PegasusForConditionalGeneration, PegasusTokenizer
from peft import PeftModel, LoraConfig
from datasets import load_dataset
from evaluate import load
from trl import SFTTrainer
import torch
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

### **PEGASUS large:**

In [None]:
# Hub checkpoint id used for both the tokenizer here and the model in a later cell.
model_large = "google/pegasus-large"
# Load the tokenizer from the same variable so the two ids cannot drift apart.
tokenizer_large = AutoTokenizer.from_pretrained(model_large)
# 4-bit NF4 quantization settings with bfloat16 as the compute dtype.
# NOTE(review): no from_pretrained call visible in this notebook passes this
# config, so it currently has no effect — confirm whether it is still needed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 is the CUDA switch for synchronous
# kernel launches, so a device-side error is reported at the call that caused
# it rather than at some later line. It does not make messages more verbose.
# NOTE(review): torch was imported and CUDA probed in earlier cells — confirm
# the variable is still picked up when set this late, and remove it after
# debugging because synchronous launches presumably slow GPU work down.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Load the seq2seq model for the checkpoint id held in `model_large`.
# NOTE(review): `bnb_config` from the earlier cell is not passed here, so this
# call applies no quantization config.
model_pega = AutoModelForSeq2SeqLM.from_pretrained(model_large)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **PubMed Dataset:**

In [None]:
# Load the "ccdv/pubmed-summarization" dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# own loading script run locally — confirm that is acceptable for this source.
dataset_science = load_dataset("ccdv/pubmed-summarization", trust_remote_code=True)
# Bare name as the last expression so the notebook displays the loaded object.
dataset_science

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

##### **Inspecting the Dataset Splits:**

In [None]:
# Row count of every split, in the order the dataset object stores them.
split_lengths = list(map(len, dataset_science.values()))
split_lengths

[14732, 819, 818]

In [None]:
print(f"Features: {dataset_science['train'].column_names}")

Features: ['id', 'dialogue', 'summary']


In [None]:
# Print the first training example: a heading line, then the matching field.
for heading, column in (("\nArticle:", "article"), ("\nAbstract:", "abstract")):
    print(heading)
    print(dataset_science["train"][0][column])



Article:
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)

Abstract:
Amanda baked cookies and will bring Jerry some tomorrow.


##### **Test-Running the Raw Model:**

In [None]:
# Second training example. The names `dialogue`/`summary` are reused by later
# cells even though the fields read here are the article and its abstract.
train_split = dataset_science["train"]
dialogue = train_split[1]["article"]
summary = train_split[1]["abstract"]


In [None]:
# Summarization pipeline built around the already-loaded model and tokenizer,
# with `device` ("cuda" or "cpu") passed as the execution target.
pipe = pipeline("summarization", model=model_pega, tokenizer=tokenizer_large, device=device)

In [None]:

# Set a suitable maximum length for your input. This budget is applied to the
# string below, so it counts characters rather than tokens.
max_length = 4000

# Slicing leaves a shorter string unchanged, so no explicit length check is needed.
dialogue = dialogue[:max_length]



In [None]:

dialogue


'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great'

In [None]:
summary

'Olivia and Olivier are voting for liberals in this election. '

In [None]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer_large, model = model_pega)

In [None]:
# Run the summarization pipeline on the sample text.
pipe_out = pipe(dialogue)
# The result is indexed as a list; its first item carries the text under "summary_text".
pipe_outp = pipe_out[0]["summary_text"]
pipe_outp


Your max_length is set to 128, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


"Olivia: Who's you voting for in this election? Oliver: Liberals as always .<n>Olivia: Me too!!<n>Oliver: Great ."

In [None]:
pipe_out

[{'summary_text': ''}]

In [None]:
#print(pipe_outp.replace(" . <n>", ".\n"))

In [None]:
#summary

##### **Evaluating The Performance of Raw Model:**

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut a sequence into consecutive slices of at most ``batch_size`` items.

    Args:
        list_of_elements: Any object supporting ``len()`` and slicing.
        batch_size: Step between slice starts and maximum slice length.

    Yields:
        ``list_of_elements[start : start + batch_size]`` for each start offset;
        the final slice may be shorter than ``batch_size``.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[lo : lo + batch_size] for lo in offsets)

def calculate_metric_on_TEST_ds(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="article", column_summary="abstract"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are sliced into batches of equal size.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``;
            must also provide ``decode``.
        batch_size: Number of examples tokenized and generated per step.
        device: Device the input tensors are moved to before generation. The
            model is not moved here, so it must already be on this device.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))
    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        # Every batch is truncated and padded to exactly 1024 tokens.
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]
        # PEGASUS writes line breaks as the literal token "<n>"; turn each one
        # into a space. Replacing the empty string instead would insert a space
        # between every character and wreck the ROUGE scores.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
    return metric.compute()

In [None]:

# Load the ROUGE metric; the same object is reused by the later evaluation cells.
rouge_metric = load("rouge")

# Baseline ROUGE on the first 10 *training* rows (batch size 2). When the
# notebook is run top to bottom this happens before any fine-tuning cell.
score = calculate_metric_on_TEST_ds(dataset_science["train"][0:10], rouge_metric, model_pega, tokenizer_large, batch_size=2, column_text="article", column_summary='abstract')


100%|██████████| 5/5 [00:09<00:00,  1.87s/it]


In [None]:

# ROUGE variants to report; later evaluation cells reuse this list.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Pick just those entries out of the metric result and print them as one row.
rouge_dict = dict((name, score[name]) for name in rouge_names)
baseline_table = pd.DataFrame(rouge_dict, index=["pegasus"])
print(baseline_table)


##### **Fine-Tuning PEGASUS:**

###### **Tokenizing The Text:**

In [None]:
# Tokenize the sample text; no max_length/truncation argument is passed here.
tokens = tokenizer_large(dialogue, return_tensors="pt").input_ids
# assumes input_ids is shaped (batch, sequence_length) — TODO confirm
num_tokens = tokens.shape[1]
print(num_tokens)

4845


In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize one batch of PubMed examples for seq2seq fine-tuning.

    Args:
        example_batch: Batch mapping with "article" (source texts) and
            "abstract" (reference summaries) columns.

    Returns:
        Dict with "input_ids" and "attention_mask" for the articles
        (truncated to 1024 tokens) and "labels" holding the abstract token
        ids (truncated to 256 tokens).
    """
    # The other PubMed cells in this notebook read this dataset as
    # "article"/"abstract"; "dialogue"/"summary" belong to the SAMSum section.
    input_encodings = tokenizer_large(example_batch["article"], max_length=1024, truncation=True)
    # text_target= is the supported replacement for the deprecated
    # as_target_tokenizer() context manager.
    target_encodings = tokenizer_large(text_target=example_batch["abstract"], max_length=256, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }
dataset_science_pt = dataset_science.map(convert_examples_to_features, batched=True)

In [None]:
#dataset_science_pt["train"][0]

In [None]:
from transformers import DataCollatorForSeq2Seq
# Seq2seq batch collator bound to `tokenizer_large` and `model_pega`.
# NOTE(review): an earlier cell already runs this same import and assignment;
# this cell rebinds the name to a new object built from the same arguments.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer_large, model = model_pega)

In [None]:
# LoRA settings for the encoder-decoder PEGASUS model.
# - task_type is SEQ_2_SEQ_LM because the model is loaded with
#   AutoModelForSeq2SeqLM; CAUSAL_LM is for decoder-only models.
# - target_modules use the PEGASUS layer names (attention q/k/v/out projections
#   and the fc1/fc2 feed-forward layers). "o_proj", "gate_proj", "up_proj" and
#   "down_proj" are LLaMA-style names — TODO confirm against
#   model_pega.named_modules().
# NOTE(review): no cell visible in this notebook passes lora_config to a
# trainer or PEFT wrapper, so it currently has no effect.
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
    task_type="SEQ_2_SEQ_LM",
)

###### **Setting The Training Arguments:**

In [None]:
from transformers import TrainingArguments, Trainer
# Training configuration for fine-tuning PEGASUS-large on PubMed.
trainer_args = TrainingArguments(
    output_dir="pegasus-science",
    num_train_epochs=5,
    warmup_steps=500,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.005,
    logging_steps=10,
    eval_strategy="steps",
    # NOTE(review): evaluating the whole validation split every 20 optimizer
    # steps is likely to dominate run time — confirm this cadence is intended.
    eval_steps=20,
    save_steps=1000,
    # Effective train batch per device = 2 * 8 = 16 examples per optimizer step.
    gradient_accumulation_steps=8,
    # The PEGASUS model documentation states FP16 is not supported, so use
    # bfloat16 mixed precision where the GPU offers it and full precision otherwise.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)


###### **Training The Model:**

In [None]:
# Use the generic Trainer. The dataset is already tokenized into
# input_ids / attention_mask / labels and the model is encoder-decoder, whereas
# SFTTrainer is built around causal-LM supervised fine-tuning.
trainer = Trainer(
    model=model_pega,
    args=trainer_args,
    processing_class=tokenizer_large,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_science_pt["train"],
    eval_dataset=dataset_science_pt["validation"],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets bound to `trainer`.
trainer.train()

###### **Evaluating The Performance:**

In [None]:
# ROUGE for the current `model_pega` on the PubMed test split.
score = calculate_metric_on_TEST_ds(
    dataset_science["test"],
    rouge_metric,
    model_pega,
    tokenizer_large,
    batch_size=2,
    column_text="article",
    column_summary="abstract",
)
# Keep only the variants named in `rouge_names` and show them as a one-row table.
rouge_dict = dict((name, score[name]) for name in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus-PubMed"])

###### **Saving The Model:**

In [None]:
# Save `model_pega` in its current state under the path "pegasus-science-model".
model_pega.save_pretrained("pegasus-science-model")

In [None]:
# Save the tokenizer under its own path, separate from the model path used in the previous cell.
tokenizer_large.save_pretrained("pegasus-science-tokenizer")

#### **Arxiv Dataset:**

##### **Loading Dataset:**

In [None]:
# Load the "ccdv/arxiv-summarization" dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# own loading script run locally — confirm that is acceptable for this source.
data_arxiv = load_dataset("ccdv/arxiv-summarization", trust_remote_code=True)
# Bare name as the last expression so the notebook displays the loaded object.
data_arxiv

##### **Tokenizing Data:**

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize one batch of arXiv examples for seq2seq fine-tuning.

    Args:
        example_batch: Batch mapping with "article" (source texts) and
            "abstract" (reference summaries) columns.

    Returns:
        Dict with "input_ids" and "attention_mask" for the articles
        (truncated to 1024 tokens) and "labels" holding the abstract token
        ids (truncated to 256 tokens).
    """
    input_encodings = tokenizer_large(example_batch["article"], max_length=1024, truncation=True)
    # text_target= is the supported replacement for the deprecated
    # as_target_tokenizer() context manager.
    target_encodings = tokenizer_large(text_target=example_batch["abstract"], max_length=256, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }
dataset_arxiv_pt = data_arxiv.map(convert_examples_to_features, batched=True)

##### **Setting the training parameters:**

In [None]:
# Training configuration for the arXiv run.
# NOTE(review): later trainer cells that reuse `trainer_args` without
# redefining it will also write into this output directory.
trainer_args = TrainingArguments(
    # Separate directory so these checkpoints do not overwrite the ones the
    # PubMed run wrote to "pegasus-science".
    output_dir="pegasus-arxiv",
    num_train_epochs=5,
    warmup_steps=500,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.005,
    logging_steps=10,
    eval_strategy="steps",
    # NOTE(review): evaluating the whole validation split every 20 optimizer
    # steps is likely to dominate run time — confirm this cadence is intended.
    eval_steps=20,
    save_steps=1000,
    # Effective train batch per device = 2 * 8 = 16 examples per optimizer step.
    gradient_accumulation_steps=8,
    # The PEGASUS model documentation states FP16 is not supported, so use
    # bfloat16 mixed precision where the GPU offers it and full precision otherwise.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

##### **Training The Model:**

In [None]:
# Use the generic Trainer. The dataset is already tokenized into
# input_ids / attention_mask / labels and the model is encoder-decoder, whereas
# SFTTrainer is built around causal-LM supervised fine-tuning.
# NOTE(review): `model_pega` is the same object the PubMed section trained, so
# when the notebook runs top to bottom this continues from those weights rather
# than from the base checkpoint — confirm that is intended.
Arxtrainer = Trainer(
    model=model_pega,
    args=trainer_args,
    processing_class=tokenizer_large,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_arxiv_pt["train"],
    eval_dataset=dataset_arxiv_pt["validation"],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets bound to `Arxtrainer`.
Arxtrainer.train()

##### **Evaluating The Performance:**

In [None]:
# ROUGE for the current `model_pega` on the arXiv test split.
score = calculate_metric_on_TEST_ds(
    data_arxiv["test"],
    rouge_metric,
    model_pega,
    tokenizer_large,
    batch_size=2,
    column_text="article",
    column_summary="abstract",
)
# Keep only the variants named in `rouge_names` and show them as a one-row table.
rouge_dict = dict((name, score[name]) for name in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus-Arxiv"])

#### **Samsum Dataset:**

##### **Loading Dataset:**

In [None]:
# Load the "Samsung/samsum" dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's
# own loading script run locally — confirm that is acceptable for this source.
data_samsum = load_dataset("Samsung/samsum", trust_remote_code=True)
# Bare name as the last expression so the notebook displays the loaded object.
data_samsum

##### **Tokenizing Data:**

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize one batch of SAMSum examples for seq2seq fine-tuning.

    Args:
        example_batch: Batch mapping with "dialogue" (source texts) and
            "summary" (reference summaries) columns.

    Returns:
        Dict with "input_ids" and "attention_mask" for the dialogues
        (truncated to 1024 tokens) and "labels" holding the summary token
        ids (truncated to 256 tokens).
    """
    input_encodings = tokenizer_large(example_batch["dialogue"], max_length=1024, truncation=True)
    # text_target= is the supported replacement for the deprecated
    # as_target_tokenizer() context manager.
    target_encodings = tokenizer_large(text_target=example_batch["summary"], max_length=256, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }
dataset_samsum_pt = data_samsum.map(convert_examples_to_features, batched=True)

##### **Training The Model:**

In [None]:
# Use the generic Trainer. The dataset is already tokenized into
# input_ids / attention_mask / labels and the model is encoder-decoder, whereas
# SFTTrainer is built around causal-LM supervised fine-tuning.
# NOTE(review): this section defines no TrainingArguments of its own, so
# `trainer_args` (and its output_dir) come from the most recently executed
# arguments cell. `model_pega` is also the object trained by the earlier
# sections — confirm both are intended.
samtrainer = Trainer(
    model=model_pega,
    args=trainer_args,
    processing_class=tokenizer_large,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets bound to `samtrainer`.
samtrainer.train()

##### **Evaluating The Performance:**

In [None]:
# ROUGE for the current `model_pega` on the SAMSum test split.
score = calculate_metric_on_TEST_ds(
    data_samsum["test"],
    rouge_metric,
    model_pega,
    tokenizer_large,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)
# Keep only the variants named in `rouge_names` and show them as a one-row table.
rouge_dict = dict((name, score[name]) for name in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus-Samsum"])

### **PEGASUS BigBird:**

#### **PubMed:**

##### **Loading The Model and Tokenizer:**

In [None]:
# Tokenizer for the "google/bigbird-pegasus-large-pubmed" checkpoint.
tokenizer_big = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")
# NOTE(review): same settings as the bnb_config in the PEGASUS-large section,
# and the from_pretrained call below does not receive it — confirm whether it
# is still needed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Model loaded from the same checkpoint id as the tokenizer above.
model_bigbird = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed")
#model_pega = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

##### **Tokenizing The Data:**

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize one batch of PubMed examples with the BigBird-PEGASUS tokenizer.

    Args:
        example_batch: Batch mapping with "article" (source texts) and
            "abstract" (reference summaries) columns.

    Returns:
        Dict with "input_ids" and "attention_mask" for the articles
        (truncated to 1024 tokens) and "labels" holding the abstract token
        ids (truncated to 256 tokens).
    """
    input_encodings = tokenizer_big(example_batch["article"], max_length=1024, truncation=True)
    # Encode targets with the same tokenizer as the inputs. text_target= replaces
    # the deprecated as_target_tokenizer() context manager, so no second
    # tokenizer object is involved in target mode.
    target_encodings = tokenizer_big(text_target=example_batch["abstract"], max_length=256, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }
dataset_science_pt_big = dataset_science.map(convert_examples_to_features, batched=True)

##### **Training Arguments:**

In [None]:
# Training configuration for the BigBird-PEGASUS runs.
# NOTE(review): later trainer cells that reuse `trainer_args` without
# redefining it will also write into this output directory.
trainer_args = TrainingArguments(
    # Separate directory so these checkpoints do not overwrite the ones the
    # PEGASUS-large runs wrote to "pegasus-science".
    output_dir="bigbird-pegasus-science",
    num_train_epochs=5,
    warmup_steps=500,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.005,
    logging_steps=10,
    eval_strategy="steps",
    # NOTE(review): evaluating the whole validation split every 20 optimizer
    # steps is likely to dominate run time — confirm this cadence is intended.
    eval_steps=20,
    save_steps=1000,
    # Effective train batch per device = 2 * 8 = 16 examples per optimizer step.
    gradient_accumulation_steps=8,
    fp16=True  # Enable mixed precision if hardware supports it
)

##### **Training Model:**

In [None]:
# Use the generic Trainer. The dataset is already tokenized into
# input_ids / attention_mask / labels and the model is encoder-decoder, whereas
# SFTTrainer is built around causal-LM supervised fine-tuning.
# The collator is built from the BigBird tokenizer and model, because the shared
# `seq2seq_data_collator` was created from `tokenizer_large` and `model_pega`.
bigptrainer = Trainer(
    model=model_bigbird,
    args=trainer_args,
    processing_class=tokenizer_big,
    data_collator=DataCollatorForSeq2Seq(tokenizer_big, model=model_bigbird),
    train_dataset=dataset_science_pt_big["train"],
    eval_dataset=dataset_science_pt_big["validation"],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets bound to `bigptrainer`.
bigptrainer.train()

##### **Evaluating The Model's Performance:**

In [None]:
# ROUGE for the current `model_bigbird` on the PubMed test split.
score = calculate_metric_on_TEST_ds(
    dataset_science["test"],
    rouge_metric,
    model_bigbird,
    tokenizer_big,
    batch_size=2,
    column_text="article",
    column_summary="abstract",
)
# Keep only the variants named in `rouge_names` and show them as a one-row table.
rouge_dict = dict((name, score[name]) for name in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus-big-PubMed"])

#### **Arxiv:**

##### **Tokenizing The Data:**

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize one batch of arXiv examples with the BigBird-PEGASUS tokenizer.

    Args:
        example_batch: Batch mapping with "article" (source texts) and
            "abstract" (reference summaries) columns.

    Returns:
        Dict with "input_ids" and "attention_mask" for the articles
        (truncated to 1024 tokens) and "labels" holding the abstract token
        ids (truncated to 256 tokens).
    """
    input_encodings = tokenizer_big(example_batch["article"], max_length=1024, truncation=True)
    # Encode targets with the same tokenizer as the inputs. text_target= replaces
    # the deprecated as_target_tokenizer() context manager, so no second
    # tokenizer object is involved in target mode.
    target_encodings = tokenizer_big(text_target=example_batch["abstract"], max_length=256, truncation=True)
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }
dataset_arxiv_pt_big = data_arxiv.map(convert_examples_to_features, batched=True)

##### **Training Model:**

In [None]:
# Use the generic Trainer. The dataset is already tokenized into
# input_ids / attention_mask / labels and the model is encoder-decoder, whereas
# SFTTrainer is built around causal-LM supervised fine-tuning.
# The collator is built from the BigBird tokenizer and model, because the shared
# `seq2seq_data_collator` was created from `tokenizer_large` and `model_pega`.
# NOTE(review): `model_bigbird` is the same object the BigBird PubMed section
# trained, so a top-to-bottom run continues from those weights — confirm that
# is intended.
bigatrainer = Trainer(
    model=model_bigbird,
    args=trainer_args,
    processing_class=tokenizer_big,
    data_collator=DataCollatorForSeq2Seq(tokenizer_big, model=model_bigbird),
    train_dataset=dataset_arxiv_pt_big["train"],
    eval_dataset=dataset_arxiv_pt_big["validation"],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets bound to `bigatrainer`.
bigatrainer.train()

##### **Evaluating The Model's Performance:**

In [None]:
# ROUGE for the current `model_bigbird` on the arXiv test split.
score = calculate_metric_on_TEST_ds(
    data_arxiv["test"],
    rouge_metric,
    model_bigbird,
    tokenizer_big,
    batch_size=2,
    column_text="article",
    column_summary="abstract",
)
# Keep only the variants named in `rouge_names` and show them as a one-row table.
rouge_dict = dict((name, score[name]) for name in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus-big-arxiv"])