In [16]:
!pip install transformers[sentencepiece] protobuf==3.20.3 datasets rouge_score py7zr  -q

# !pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [17]:
# set seed to predict the same output everytime
from transformers import pipeline , set_seed

# for data manuplation and analysis
import pandas as pd

# library for NLP that provides tools for text processing, such as tokenization, stemming, and parsing
import nltk

# To load dataset from collection of pre-built datasets and evaluation metrics
from datasets import load_dataset, load_metric

# imports the AutoModelForSeq2SeqLM and AutoTokenizer classes from the transformers module
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# imports the sent_tokenize function from the nltk.tokenize module. This function is used for tokenizing text into sentences.
from nltk.tokenize import sent_tokenize

# downloads the punkt package needed for sent_tokenize, which contains pre-trained models for tokenizing text into sentences and words.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from datasets import load_dataset

# loading the CNN/DailyMail dataset
dataset= load_dataset("cnn_dailymail", version="3.0.0")





  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# The resulting dataset object is a dictionary-like object that contains several splits of the dataset, such as "train", "validation", and "test".
print(f"features are- {dataset['train'].column_names}")

features are- ['article', 'highlights', 'id']


In [20]:
# creating a summarization pipeline object using the Hugging Face Transformers library(importing and loading pretrained model)
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail")


In [21]:
#  takes the first 1000 characters of that 1st article in training set
sample_text= dataset['train'][0]['article'][:1000]

# testing model on sample
pipe_out= pipe(sample_text)

# printing predicted summary
print(pipe_out)

[{'summary_text': 'Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune .<n>The young actor says he has no plans to fritter his cash away .<n>Radcliffe: "I don\'t think I\'ll be particularly extravagant"'}]


In [22]:
# retrieves the summary and makes it more readable
# string method to replace special separator tokens used by the Pegasus model to separate sentences in the summary text with newline characters. 
print(pipe_out[0]["summary_text"].replace(".<n>", ".\n"))

Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune .
The young actor says he has no plans to fritter his cash away .
Radcliffe: "I don't think I'll be particularly extravagant"


In [23]:
print(dataset['train'][0]['article'].replace(".", ".\n"))

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.
1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.
 Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.
 "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.
 "I don't think I'll be particularly extravagant.
 "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs.
" At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart.
 Details

In [24]:
# original summary for evaluating the quality of the summary generated by a the model
reference= dataset['train'][0]['highlights']

print(reference)


Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [25]:
# loading the ROUGE metric from the datasets library
rouge_metric= load_metric('rouge')

#  adding prediction-reference pair to the rouge_metric object, which will be used to compute the ROUGE scores
rouge_metric.add(prediction=pipe_out[0]["summary_text"].replace(".<n>", ".\n"),reference=reference )

# computing the ROUGE scores for the prediction-reference pair
score= rouge_metric.compute()



In [26]:
rouge_names= ["rouge1","rouge2","rougeL","rougeLsum"]

# creating a dictionary mapping each ROUGE metric name to its computed rouge
# calculating the mid-level F1 score averaged across all examples in the dataset
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
# print('rouge_dict ', rouge_dict )
rec=[]
rec.append(rouge_dict)

print("Testing on 1st article only\n")

# display rouge accuracy
pd.DataFrame.from_records(rec,index=['Pegasus'])


Testing on 1st article only



Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Pegasus,0.506667,0.438356,0.506667,0.506667


# Evaluating accuracy on first 1000 articles

In [27]:
# selecting a random subset of 1000 examples from the training datase
test_subset = dataset['train'].shuffle(seed = 42).select(range(1000))



In [28]:
# Python library that provides a progress bar that can be used to track the progress
from tqdm import tqdm

# importing the PyTorch library
import torch

# using NVIDIAs parallel computing GPU platform if avalable
device = "cuda" if torch.cuda.is_available() else "cpu"

# spliting the dataset into smaller batches that we can process simultaneously
def generate_batchs(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        # creating a generator object that produces sub-lists of batch sized elements
        yield list_of_elements[i : i + batch_size]

# function to calculate metrics on test dataset
def calculate_metrics(dataset, metric, model, tokenizer, batch_size=16, device=device):

    # splitting the articles and their summaries into batches
    article_batches = list(generate_batchs(dataset["article"], batch_size))
    summary_batches = list(generate_batchs(dataset["highlights"], batch_size))


    # for keeping track of progress on iteration
    for article_batch, summary_batch in tqdm(
        
        # creating an iterator of tuples where each tuple contains one batch of articles and one batch of target summaries.  
        zip(article_batches, summary_batches), total=len(article_batches)):
        
        # creating  PyTorch tensors by tokenizing articles into sequences of same length
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        


        # function to generate a tensor of token ids representing the summary.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=0.8, num_beams=8, max_length=128)
        # specifying which tokens in the input sequence should be attended-considered(1) by the using attention_mask 
        # initializing length_penalty to penalize longer output sequences.(longer or shorter summaries)
        # initializing number of beams to use during beam search decoding.
        # truncating the generated sequence if the model generates a sequence that is longer than max_length
        
        # decoding the generated summary tokens using the tokenizer's decode method. 
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]      
        # skipping any special tokens, such as padding tokens by setting skip_special_tokens argument 
        # removeing unnecessary whitespaces added during tokenization by setting clean_up_tokenization_spaces argument


        # replacing <n> tokens to ensure that the generated summaries are comparable to the reference summaries
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
        # adding the decoded summaries to the metric, along with the corresponding reference summaries
        metric.add_batch(predictions=decoded_summaries, references=summary_batch)
        
    #  computing and returning the ROUGE scores
    score = metric.compute()
    return score

In [14]:
# Loading checkpoint of the pre-trained Pegasus model
model_ckpt = "google/pegasus-cnn_dailymail"

# initializing a tokenizer object specific to the Pegasus model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Loading and moving the model to the device
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# calling metrics function on the test subset
score = calculate_metrics(test_subset, rouge_metric , model_pegasus, tokenizer, batch_size=8)

100%|██████████| 125/125 [23:07<00:00, 11.10s/it]


In [15]:
rouge_names= ["rouge1","rouge2","rougeL","rougeLsum"]

# creating a dictionary mapping each ROUGE metric name to its computed rouge
# calculating the mid-level F1 score averaged across all examples in the dataset
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# display rouge accuracy
pd.DataFrame(rouge_dict, index=["pegasus"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.475187,0.278272,0.374555,0.429171
