# Loading in Data

In [1]:
import os
import pandas as pd

base_folder = '/kaggle/input/bbc-news-summary/BBC News Summary'

article_texts = []
summaries = []

In [None]:
for folder_name in os.listdir(os.path.join(base_folder, 'News Articles')):

    article_folder = os.path.join(base_folder, 'News Articles', folder_name)
    summary_folder = os.path.join(base_folder, 'Summaries', folder_name)

    for file_name in os.listdir(article_folder):
        article_file = os.path.join(article_folder, file_name)
        summary_file = os.path.join(summary_folder, file_name)
        
        if os.path.exists(article_file) and os.path.exists(summary_file):
            try:
                article_df = open(article_file, 'r', encoding='utf-8').read()
                summary_df = open(summary_file, 'r', encoding='utf-8').read()
                
                article_texts.append(article_df)
                summaries.append(summary_df)
            except UnicodeDecodeError:
                print(f"File with incorrect encoding: {article_file}")
                continue

data = {'Article Text': article_texts, 'Summary': summaries}
df = pd.DataFrame(data)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = df
data.columns = ['Article Text', 'Summary']

train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_count = train_data.shape[0]
val_count = val_data.shape[0]
test_count = test_data.shape[0]

print("Train data count:", train_count)
print("Validation data count:", val_count)
print("Test data count:", test_count)



Train data count: 1556
Validation data count: 334
Test data count: 334


# Preparing Data for Training

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import torch

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

def convert_to_features(index, row):
    input_encodings = tokenizer(row['Article Text'], truncation=True, padding='max_length', max_length=1024, return_tensors="pt")
    target_encodings = tokenizer(row['Summary'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    
    return {
        'input_ids': input_encodings['input_ids'],
        'labels': target_encodings['input_ids']
    }

train_features = [convert_to_features(index, row) for index, row in tqdm(train_data.iterrows(), total=len(train_data), desc='Processing Training Data')]
val_features = [convert_to_features(index, row) for index, row in tqdm(val_data.iterrows(), total=len(val_data), desc='Processing Validation Data')]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Processing Training Data: 100%|██████████| 1556/1556 [00:09<00:00, 161.31it/s]
Processing Validation Data: 100%|██████████| 334/334 [00:02<00:00, 161.95it/s]


In [5]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

# Train the Model

## 1. Create a Dataset Class:

In [6]:
from torch.utils.data import Dataset, DataLoader

class TextSummarizationDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

train_dataset = TextSummarizationDataset(train_features)
val_dataset = TextSummarizationDataset(val_features)

## 2. Create Data Loaders:

In [7]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [8]:
for idx, batch in enumerate(train_dataloader):
    print(f"Batch {idx}:")
    print(batch)
    if idx > 3:
        break

Batch 0:
{'input_ids': tensor([[[12528,   467, 13288,  ...,     0,     0,     0]]]), 'labels': tensor([[[17716, 12528,  1031,   114,  5190,  6608,    11,  7462,   955,  9428,
            130,   182, 11766,    11,   410,     3,     9,  1399,   613,    13,
              3, 18147,     8,  1254,    13, 27330,     6,  8936,    11, 27939,
             13,     8,   814,  7884,     5,  3713,  2233,  2136,     7,    24,
           9437,     6,    11,   116,     8, 12528,  5189,  9567,     7,   190,
              6,    34,  4227,   114,     3,     9,  5722,  3125,     5,   634,
           9347,    19,    13,   503,    12,   617,     3,     9,  3714,  2164,
           3023,    12,     3,     9,     3,  1206,   940,   467,     6,    68,
              8,     3, 17398,   914,   405,    59,   129,     8,  5143,     7,
          15401,   114,    34,   523,    12,     5,  5680,     8,   467,    19,
            207,    44,    19,     3,    15, 17430,    53,    25,    16, 26847,
              6,  4896,  

## 3. Set Up Training Loop:


In [9]:
import torch.optim as optim
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=5e-4)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}'):
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(dim=1).to(device)
        labels = batch['labels'].squeeze(dim=1).to(device)
        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Validation Epoch {epoch + 1}'):
            inputs = batch['input_ids'].squeeze(dim=1).to(device)
            labels = batch['labels'].squeeze(dim=1).to(device)
            outputs = model(input_ids=inputs, labels=labels)
            total_val_loss += outputs.loss.item()

    print(f'Epoch {epoch + 1}, Training Loss: {total_train_loss / len(train_dataloader)}, Validation Loss: {total_val_loss / len(val_dataloader)}')

Training Epoch 1: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 1: 100%|██████████| 334/334 [00:30<00:00, 10.93it/s]


Epoch 1, Training Loss: 0.18104116259032627, Validation Loss: 0.10223920538207282


Training Epoch 2: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 2: 100%|██████████| 334/334 [00:30<00:00, 10.92it/s]


Epoch 2, Training Loss: 0.11550749267587931, Validation Loss: 0.09881793623957448


Training Epoch 3: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 3: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 3, Training Loss: 0.10000800559713785, Validation Loss: 0.10124160113750015


Training Epoch 4: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 4: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 4, Training Loss: 0.09043314843154261, Validation Loss: 0.100906733377463


Training Epoch 5: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 5: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 5, Training Loss: 0.07575574906253804, Validation Loss: 0.10444623536937786


Training Epoch 6: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 6: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 6, Training Loss: 0.06697620540841753, Validation Loss: 0.10830523139061016


Training Epoch 7: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 7: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 7, Training Loss: 0.05907870677855239, Validation Loss: 0.11134369678594573


Training Epoch 8: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 8: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 8, Training Loss: 0.0553179509372296, Validation Loss: 0.10860263129618175


Training Epoch 9: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 9: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 9, Training Loss: 0.047281200996334344, Validation Loss: 0.11689257234109086


Training Epoch 10: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 10: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 10, Training Loss: 0.04072276950792283, Validation Loss: 0.11560804796339609


Training Epoch 11: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 11: 100%|██████████| 334/334 [00:30<00:00, 10.91it/s]


Epoch 11, Training Loss: 0.03893251708250973, Validation Loss: 0.12559958647805528


Training Epoch 12: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 12: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 12, Training Loss: 0.03588726652285161, Validation Loss: 0.12705536797561845


Training Epoch 13: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 13: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 13, Training Loss: 0.030174349281747746, Validation Loss: 0.1264774601193407


Training Epoch 14: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 14: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 14, Training Loss: 0.02714253457974136, Validation Loss: 0.12480318584565832


Training Epoch 15: 100%|██████████| 1556/1556 [07:58<00:00,  3.25it/s]
Validation Epoch 15: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 15, Training Loss: 0.02531093106631663, Validation Loss: 0.13243456644774276


Training Epoch 16: 100%|██████████| 1556/1556 [07:58<00:00,  3.25it/s]
Validation Epoch 16: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 16, Training Loss: 0.02807855675715066, Validation Loss: 0.14028386377043184


Training Epoch 17: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 17: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 17, Training Loss: 0.01943401976729142, Validation Loss: 0.1431833399279084


Training Epoch 18: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 18: 100%|██████████| 334/334 [00:30<00:00, 10.90it/s]


Epoch 18, Training Loss: 0.01993153448622537, Validation Loss: 0.1431729905032375


Training Epoch 19: 100%|██████████| 1556/1556 [07:58<00:00,  3.25it/s]
Validation Epoch 19: 100%|██████████| 334/334 [00:30<00:00, 10.87it/s]


Epoch 19, Training Loss: 0.017543688958521157, Validation Loss: 0.142804924091197


Training Epoch 20: 100%|██████████| 1556/1556 [07:57<00:00,  3.26it/s]
Validation Epoch 20: 100%|██████████| 334/334 [00:30<00:00, 10.89it/s]

Epoch 20, Training Loss: 0.015772049136231897, Validation Loss: 0.1494130449852769





In [10]:
model.save_pretrained('Model-Files')
tokenizer.save_pretrained('Model-Files')

('Model-Files/tokenizer_config.json',
 'Model-Files/special_tokens_map.json',
 'Model-Files/spiece.model',
 'Model-Files/added_tokens.json')

# Test on a new summary here:

In [12]:
def summarize(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(
    inputs['input_ids'], 
    max_length=1024, 
    num_beams=4, 
    length_penalty=2.0, 
    early_stopping=True, 
    no_repeat_ngram_size=2
).to(device)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

article= '''Abstract. The amount of text data available online is increasing at a very fast pace hence text summarization has become essential. Most of the modern 
recommender and text classification systems require going through a huge 
amount of data. Manually generating precise and fluent summaries of lengthy 
articles is a very tiresome and time-consuming task. Hence generating automated 
summaries for the data and using it to train machine learning models will make 
these models space and time-efficient. Extractive summarization and abstractive 
summarization are two separate methods of generating summaries. The extractive 
technique identifies the relevant sentences from the original document and 
extracts only those from the text. Whereas in abstractive summarization 
techniques, the summary is generated after interpreting the original text, hence 
making it more complicated. In this paper, we will be presenting a comprehensive 
comparison of a few transformer architecture based pre-trained models for text 
summarization. For analysis and comparison, we have used the BBC news dataset 
that contains text data that can be used for summarization and human generated 
summaries for evaluating and comparing the summaries generated by machine 
learning models. 
Keywords: Natural Language Processing, Deep Learning, Summarization, 
Transformers  
1   Introduction 
The aim of news summarization is to create a concise summary from a long document 
or news articles such that no information is lost. In recent times, computing text 
summaries using Deep Learning has gained popularity 
1.1   Need for Text Summarization 
Automating summarization [1] would eliminate manual efforts. Shorter texts, which 
are summaries of longer texts, would reduce reading time. With the ever-growing 
amount of data, text summarization would reduce the size of files and hence solve the 
problem of storage. A shorter text or summary would provide more significant insights. 
Moreover, accurate summaries are very useful when it comes to text mining and data 
analysis. 
1.2   Summarization Techniques 
Text summarization can be broadly classified into two approaches [2] -  
Extractive Summarization - In extractive summarization [3[], a summary from the 
given text is created by selecting a subset of the total sentence base. Most important 
phrases or sentences from the text are identified and selected based on a score that is 
computed depending on the words in that sentence. 
Abstractive Summarization - In the method of abstractive summarization [4], an 
interpretation is first created by analysing the text document. Based on this 
interpretation, the machine predicts a summary. It transforms the text by paraphrasing 
sections of the original document. 
This work will focus on abstractive summarization to create an accurate and fluent 
summary as this task is more challenging and simulates human perception for 
developing summaries. For this task, we have used some machine learning models pre
trained on a large dataset. 
2   Related Work 
The task of summarization using NLP first came into the picture in 1958. Initially, 
statistical approaches were used to compute a score for every sentence and then select 
the sentences with the highest scores. Several techniques were employed to calculate 
this score, such as TF-IDF [5], Bayesian models [6], etc. While these techniques were 
able to compute a sound summary by key phrase extraction, all of them were extractive 
approaches and were simply trimming the original text. Then the focus came onto 
utilizing Machine learning algorithms [1] for summarization, such as Bayesian 
Learning Models as was done in the paper [6]. These machine learning techniques 
proved to be successful for pattern recognition in texts and establishing a correlation 
between different words. In this section, we will be examining why and how machine 
learning techniques were employed for the task of summarization. Every text or 
sentence can be thought of as sequential data as the order of words is essential for the 
natural language interpretation and formation. In order to process sequential data, which 
is the case for most NLP problems, the architecture needs to retain information with the 
help of some memory.  
One of the variants of RNN [7, 8] the LSTM network [9], retains sequential 
information with the help of connected nodes by keeping relevant information and 
forgetting insignificant information that helped in generating summaries. This 
methodology of LSTM network [9] was utilized to develop the encoder-decoder model 
[9]. Seq2seq models implemented with the help of the encoder-decoder framework for 
solving NLP tasks gave wonderful results, but there was still the issue of parallelization. 
Even though the sequential information is retained in the case of the encoder-decoder 
model [10] , the processing, in this case, is done by taking one input at a time as LSTM 
[8] takes only a single input at a time. This is a problematic situation as even though 
this model gives improved results, it proves to be unsuccessful for every possible case 
and defeats the purpose of creating machine perception.  
This led to the addition of the Attention layer [8]. As depicted in Figure 1 [10], an 
attention layer in the encoder-decoder model [10] analyses the input sequence at every 
step, and based on the previous sequences, assigns a weight to it. The attention layer 
[11] creates vector matrices by considering every word in the sentence for one input. 
Hence, the attention layer forces the machine to look over the entire text as one input 
rather than separate sequences as separate inputs. This mechanism was extremely 
effective [11] for the abstractive approach and became popular. For this work, we have 
utilized the transformer architecture [11], developed by Google as a baseline model. 
Fig. 1. Attention mechanism in Encoder-Decoder architecture [10] 
The introduction of several pre-trained language models, for example, BERT [12], 
PEGASUS [13], UNiLM [14], GPT [15], etc., has transformed the field of NLP and 
their great results encouraged us to employ these technologies for the task of 
summarization. Most of the pre-trained models used in this project are based on 
Google’s powerful Transformer architecture [11] that was developed in 2017. It is 
similar to RNN and is inspired from the encoder-decoder framework.  Transformer 
model [11] was invented to solve natural language processing tasks that involved 
transforming an input sequence into an output sequence. These pre-trained language 
models from Hugging face library [16] can be used to solve multiple NLP problems. 
3   Methodology 
3.1 Dataset 
From this section onwards, we will outline our basic experimental setup, discuss the 
evaluation metrics, and then describe various models that we used for our study. Then 
we will combine insights from our study and show the comparative performance of the 
models. The dataset we used was generated from a dataset used for text classification. 
It consists of 2225 BBC news website documents relating to stories used in the paper 
[17] in five topical areas from 2004-2005, all of whose rights, including copyright, are 
held by the BBC in the content of the original papers.           
3.2   Preprocessing 
This dataset consists of long news articles along with short summaries for comparison. 
The raw dataset was then cleaned using various pre-processing techniques such as: 
Lower casing - To convert the input text into the same casing format so that all capital, 
lower case and mixed case are treated similarly. 
Eliminate Punctuation - HTML tags and links- Removal of punctuations, links and 
tags that do not add meaning to the text such as “!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`” 
to standardise the text. 
Eliminate Stopwords and frequently occurring words - Removal of common words 
such as ‘the’, ‘a’, etc that are frequently used in a text but do not provide valuable 
information for downstream analysis. 
Stemming - Reducing the inflected words to their root form. 
Lemmatization - Reducing derived words to their base or root form while making sure 
that root words belong to the language. 
Contraction mapping - Expanding the shortened version of words or syllables. 
2.3   Model Explanation 
Basic Understanding of Transformer model -The transformer network [11] is solely 
based upon multiple attention layers. It does not make use of RNN and is reliant on 
attention layers and positional encoding for remembering the sequence of words in the 
input sequence. The global dependencies created with the help of multiple attention 
layers help in creating parallelization in processing the input.  
The transformer model [11] contains encoder and decoder layers, where each is 
connected to a multi-head attention layer and feed forward network layers. The model 
remembers the position and sequence of words with the help of cosine and sine 
functions that creates positional encoding. The multi-head attention layer [11] in the 
encoder and decoder layer applies a mechanism called self-attention. The input is fed 
into three connected layers to create query (Q), key (K), and value (V) vectors [11]. 
It contains an 
encoder and decoder layer and the various normalization and multi-head 
attention layers are also depicted in the figure. 
Fig. 2. Transformer Model Architecture [11] 
Pretrained Models based on Transformers - Hugging face [16] works as an open
source for providing many useful NLP libraries and datasets. Its most famous library is 
the Transformer library. The transformer library consists of various pre-trained models 
to predict summaries of texts that can be fine-tuned for any dataset. Here we will be 
discussing some pre-trained models that were tuned and implemented for the BBC news 
dataset to give fairly good summaries. The models we used are as follows: 
Pipeline – The pipelines are a great and quick way to use different pre-trained models 
for inference. These pipelines are objects that abstract most of the library's complicated 
code, offering a simple API dedicated to several tasks, including text summarization. 
Pipelines enclose the overall steps of every NLP process such as Tokenization, 
Inference, which maps every token into a more meaningful representation, and 
Decoding. The Hugging Face transformers summarization pipeline has made the task 
easier, faster and more efficient to execute in English language. We used the machine 
learning model that has been trained on the CNN news corpus by using a fine-tuned 
BART algorithm [18] and is loaded from pipeline() using the task identifier: 
"summarization".  
BART – BART stands for Bidirectional and Auto Regressive Transformers [18]. It is 
built with a seq2seq model trained with denoising as a pre-training purpose. It uses a 
standard seq2seq model architecture combining an encoder similar to BERT [12] and a 
GPT-like decoder [15]. The pre-training task involves changing the order of the original 
phrases randomly and a new scheme where text ranges are switched with a single mask 
token. The large model of BART [18] consists of twice as many layers as are present 
in the base model. It is quite similar to the BERT [12] model but BART contains about 
10% more features than the BERT model of comparable size. BART's decoder is 
autoregressive, and it is regulated for generating sequential NLP tasks such as text 
summarization. The data is taken from the input but changed, which is closely related 
to the denoising pre-training objective. Hence, the input sequence embedding is the 
input of the encoder, and the decoder autoregressively produces output. We have used 
the "facebook/bart-large-cnn" pre-trained model and then the Bart tokenizer, which is 
constructed from the GPT-2 tokenizer. Hence words are encoded differently depending 
on their position in the sentence. 
T5 - T5 is the abbreviation for "Text-to-Text Transfer Transformer" [19]. The idea 
behind the T5 model is transfer learning [20]. The model was initially trained on a task 
containing large text in Transfer Learning before it was finely tuned on a downstream 
task so that the model learns general-purpose skills and information to be applied to 
tasks such as summarization T5 [19] uses a sequence-to-sequence generation method 
that feeds the encoded input via cross-attention layers to the decoder and generates the 
decoder output autoregressive. We have fine-tuned a T5 model [19], where the encoder 
takes an input a series of tokens which are mapped to a sequence of embeddings. A 
block containing two subcomponents are present in the encoder block namely, a self
attention layer and feed forward network. The decoder and encoder are similar in 
structure, except that there's a generalized attention mechanism after every self
attention layer. This allows the model to operate only on the previous outputs. The final 
decoder block produces an output which is fed into another layer. This final layer is a 
dense layer where the activation function is softmax. The weights from the output of 
this layer are fed into the input embedding matrix. 
PEGASUS - PEGASUS stands for Pre-training with Extracted Gap-sentences for 
Abstractive Summarization Sequence-to-sequence models [13]. In this model, 
significant lines are eliminated from the input text and are compiled as separate outputs. 
Also, choosing only relevant sentences outperforms the randomly selected sentences. 
This methodology is preferred for abstractive summarization as it is similar to the task 
of interpreting the entire document and generating a summary. It is used to train a 
Transformer model on a text data resulting in the PEGASUS model. The model is pre
trained on CNN/DailyMail summarization datasets. 
Fig. 3. The PEGASUS Model Architecture 
As shown in Figure 3 [13], it was found that it works as a prior-training aim for text 
summarization to mask complete sentences from the text and create gap-sentences from 
the remaining document.     
4   Results 
4.1 Qualitative Analysis 
For generating summaries, we fine-tuned the following transformer-based pre-trained 
language models from the Hugging face library [16]. The BBC News Dataset was used 
for generating summaries that consisted of text and human generated summaries which 
are summaries that have been written by humans. 
We used the human generated summaries to perform a comprehensive analysis of 
the summaries generated by different models. Table 1 gives the results obtained from 
different models which have been compared with the given reference summary - “An 
incident of robbery that occurred at the shopping complex last night was reported at 
the local police station this morning. A lot of valuables were stolen and multiple such 
robberies have been reported in that area. The people have been asked to stay alert and 
notice any suspicious activity. A CCTV camera from a nearby house captured the 
incident and there were a total four robbers who can be seen carrying bags. The 
shopkeeper suffered a loss and hopes the police catch the culprits.”.'''
new_summary = summarize(article)
print(new_summary)

While these techniques were able to compute a sound summary by key phrase extraction, all of them were extractive approaches and were simply trimming the original text.1.2 Summarization Techniques Text summarisation can be broadly classified into two approaches [2] - Extractive SumMarization [3[], if summary from the given text is created by selecting o subset of the total sentence base.It transforms the text by paraphrasing sections of this original document.Writer Dudley Nichols refused his Academy Award in 1935, creating the encoder-decoder model [9].A shorter text or summary would provide more significant insights.Analyst comparison, we have used the BBC news dataset that contains text data that can been used for summariization and human generated summaries for evaluating and comparing the sumesian Learning Models as was done in the paper [6].]This work will focus on abstractive sumarization to create an accurate and fluent summary as this task is more challenging and simulates hum