## Text Summarization on BBC News Corpus using Abstractive and Extractive Approach

In [65]:
# System Library
import os

# Data Wrangling Libraries
import pandas as pd
import numpy as np
import json
import gc
import textwrap
from termcolor import colored

# Machine Learning and NLP Libraries
import torch 
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer
from tqdm.auto import tqdm
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import gensim
from rouge import rouge
from gensim.models import Word2Vec

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Graph Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize']=16,10

[nltk_data] Downloading package wordnet to /Users/nidarsh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nidarsh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nidarsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [77]:
DATASET_PATH = '../Dataset/'
DATA_PATH = '../Dataset/bbc-news-data.csv'
DOCUMENTS_PATH = '../Documents/'
CHECKPOINT_PATH = '../Checkpoints/'
WORD2VEC_PATH = '../Checkpoints/bbc_word2vec.wordvectors_train.txt'
MODEL_CHECKPOINT = './Checkpoints/model.ckpt'
MODEL_NAME = 't5-base'
EXTRACTIVE_EPOCHS = 3
ABSTRACTIVE_EPOCHS = 3
BATCH_SIZE = 8

In [67]:
with open(DATA_PATH, 'r') as f:
    df_header = f.readline().split()
df = pd.read_csv(DATA_PATH, names=df_header, sep='\t', skiprows=1)
df['content'] = df['content'].str.encode('ascii', 'ignore').str.decode('ascii')
train_df, test_df = train_test_split(df, test_size=0.2)
print(df)
print(f"Training Data Size: {train_df.shape}")
print(f"Testing Data Size: {test_df.shape}")


      category filename                              title  \
0     business  001.txt  Ad sales boost Time Warner profit   
1     business  002.txt   Dollar gains on Greenspan speech   
2     business  003.txt  Yukos unit buyer faces loan claim   
3     business  004.txt  High fuel prices hit BA's profits   
4     business  005.txt  Pernod takeover talk lifts Domecq   
...        ...      ...                                ...   
2220      tech  397.txt   BT program to beat dialler scams   
2221      tech  398.txt    Spam e-mails tempt net shoppers   
2222      tech  399.txt            Be careful how you code   
2223      tech  400.txt    US cyber security chief resigns   
2224      tech  401.txt   Losing yourself in online gaming   

                                                content  
0      Quarterly profits at US media giant TimeWarne...  
1      The dollar has hit its highest level against ...  
2      The owners of embattled Russian oil giant Yuk...  
3      British Airways 

In [68]:
class Extractive:
    """
    Extractive: Class used to perform Extractive Text Summarization.
    """
    def __init__(
        self,
        train_df: pd.DataFrame = None,
        test_df: pd.DataFrame = None,
        model: Word2Vec = None,
        word_lemmatizer: WordNetLemmatizer = None,
        word2vec_path: str = WORD2VEC_PATH,
        epochs: int = 3
    ):
        self.train_df = train_df
        self.test_df = test_df
        self.model = model
        self.word_lemmatizer = word_lemmatizer
        self.word2vec_path = word2vec_path
        self.epochs = epochs
        
        
    def word_vector_tokenization(self):
        self.sentences = []
        for i in tqdm(range(self.train_df.shape[0]), desc="Loading..."):
            content = self.train_df.loc[self.train_df.index[i]]['content']
            tokens = nltk.sent_tokenize(content)
            for token in tokens:
                self.sentences.append(token.split(' '))
        print('Storing Word Vectors!')
        self.model = Word2Vec(self.sentences, min_count=1)
        word_vectors = self.model.wv
        word_vectors.save_word2vec_format(self.word2vec_path, binary=False)
        print(f"Word Vectors Saved at {self.word2vec_path}!")

    def extract_word_vectors(self) -> dict:
        """
        Extracting word embeddings from wordvector file.
        """
        word_embeddings = {}
        f = open(self.word2vec_path, encoding='utf-8')
        for index, line in enumerate(f):
            if(index == 0):
                continue
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs

        f.close()
        return word_embeddings

    def text_preprocessing(self, sentences: list) -> list:
        """
        Pre processing text to remove unnecessary words.
        """
        stop_words = set(stopwords.words('english'))

        clean_words = None
        for sent in sentences:
            words = word_tokenize(sent)
            words = [self.word_lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum()]
            clean_words = [word for word in words if word not in stop_words]

        return clean_words

    def sentence_vector_representation(self, sentences: list, word_embeddings: dict) -> list:
        """
        Creating sentence vectors from word embeddings.
        """
        sentence_vectors = []
        for sent in sentences:
            clean_words = self.text_preprocessing([sent])
            # Averaging the sum of word embeddings of the sentence to get sentence embedding vector
            v = sum([word_embeddings.get(word, np.zeros(100, )) for word in clean_words]) / (len(clean_words) + 0.001)
            sentence_vectors.append(v)

        return sentence_vectors

    def create_similarity_matrix(self, sentences: list, sentence_vectors: list) -> np.ndarray:
        """
        Using cosine similarity, generate similarity matrix.
        """
        # Defining a zero matrix of dimension n * n
        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    # Replacing array value with similarity value.
                    # Not replacing the diagonal values because it represents similarity with its own sentence.
                    if(type(sentence_vectors[i]) != type(0.1) and type(sentence_vectors[j])!= type(0.1)):
                        sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0]

        return sim_mat

    def determine_sentence_rank(self, sentences: list, sim_mat: np.ndarray):
        """
        Determining sentence rank using Page Rank algorithm.
        """
        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)
        ranked_sentences = sorted([(scores[i], s[:15]) for i, s in enumerate(sentences)], reverse=True)
        return ranked_sentences

    def generate_summary(self, sentences: list, ranked_sentences: list):
        """
        Generate a sentence for sentence score greater than average.
        """
        # Get top 1/3 th ranked sentences
        top_ranked_sentences = ranked_sentences[:int(len(sentences) / 3)] if len(sentences) >= 3 else ranked_sentences
        sentence_count = 0
        summary = ''

        for i in sentences:
            for j in top_ranked_sentences:
                if i[:15] == j[1]:
                    summary += i + ' '
                    sentence_count += 1
                    break
        return summary

    def train(self):
        self.word_vector_tokenization()

    def rouge_evaluation(self, evaluator, reference, hypothesis):
        rouge_scores = evaluator.get_scores(hypothesis, reference)
        rouge_df = pd.DataFrame(rouge_scores)
        f1_val, precision_val, recall_val = rouge_df.mean(axis=1)
        return [f1_val, precision_val, recall_val]

    def bleu_evaluation(self, chencherry, reference, hypothesis):
        bleu_score = sentence_bleu(reference, word_tokenize(hypothesis), smoothing_function=chencherry.method4, weights=(0.2, 0.4, 0.3, 0.1))
        return bleu_score
        

    def test(self):
        summarized_df = pd.DataFrame(columns=['original', 'summarized'], index=[i for i in range(test_df.shape[0])])
        rouge_test_score = pd.DataFrame(columns=['f1_score', 'precision', 'recall'], index=[i for i in range(test_df.shape[0])])
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=1,
                        apply_best=0,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)
        chencherry = SmoothingFunction()
        bleu_scores_list = []
        
        for i in tqdm(range(test_df.shape[0]), desc="Loading..."):
            index = test_df.index[i]
            test_content = test_df.loc[index]['content']
            test_sentences = sent_tokenize(test_content)
            word_embeddings = self.extract_word_vectors()
            sentence_vectors = self.sentence_vector_representation(sentences=test_sentences, word_embeddings=word_embeddings)
            similarity_mat = self.create_similarity_matrix(sentences=test_sentences, sentence_vectors=sentence_vectors)
            ranked_sentences = self.determine_sentence_rank(sentences=test_sentences, sim_mat=similarity_mat)
            summary = self.generate_summary(sentences=test_sentences, ranked_sentences=ranked_sentences)
            summarized_df.iloc[i] = [test_content, summary]
            rouge_test_score.iloc[i] = self.rouge_evaluation(evaluator=evaluator, reference=test_content, hypothesis=summary)
            bleu_scores = self.bleu_evaluation(chencherry=chencherry, reference=test_sentences, hypothesis=summary)
            bleu_scores_list.append(bleu_scores)
        
        print()
        print('-' * 25 + 'BLEU SCORE' + '-' * 25)
        print(f"Score: {bleu_scores}")
        print('-' * 30 + '*' + '-' * 30) 
        print()
        print('-' * 25 + 'ROUGE SCORE' + '-' * 25)
        print(rouge_test_score.mean())
        print('-' * 30 + '*' + '-' * 30)

        print()
        print('-' * 25 + 'Saving original text and its generated summary' + '-' * 25)
        summarized_df.to_csv(DOCUMENTS_PATH + 'Extractive_Summarization_Result.csv')

    def test_custom(self, custom_text: str = 'Dummy Text'):
        test_content = custom_text
        test_sentences = sent_tokenize(test_content)
        word_embeddings = self.extract_word_vectors()
        sentence_vectors = self.sentence_vector_representation(sentences=test_sentences, word_embeddings=word_embeddings)
        similarity_mat = self.create_similarity_matrix(sentences=test_sentences, sentence_vectors=sentence_vectors)
        ranked_sentences = self.determine_sentence_rank(sentences=test_sentences, sim_mat=similarity_mat)
        summary_custom = self.generate_summary(sentences=test_sentences, ranked_sentences=ranked_sentences)
        print()
        print('-' * 25 + 'YOUR TEXT' + '-' * 25)
        print(test_content)
        print('-' * 30 + '*' + '-' * 30)
        print('-' * 25 + 'EXTRACTIVE SUMMARY' + '-' * 25)
        print(summary_custom)
        print('-' * 30 + '*' + '-' * 30) 





In [69]:
extract_obj = Extractive(
                        train_df=train_df,
                        test_df=test_df,
                        word_lemmatizer=WordNetLemmatizer(),
                        word2vec_path=WORD2VEC_PATH,
                        epochs=EXTRACTIVE_EPOCHS
                )

In [70]:
extract_obj.train()

Loading...:   0%|          | 0/1780 [00:00<?, ?it/s]

Storing Word Vectors!
Word Vectors Saved at ../Checkpoints/bbc_word2vec.wordvectors_train.txt!


In [72]:
extract_obj.test()

Loading...:   0%|          | 0/445 [00:00<?, ?it/s]


-------------------------BLEU SCORE-------------------------
Score: 0.011549270572159
------------------------------*------------------------------

-------------------------ROUGE SCORE-------------------------
f1_score     0.378655
precision    0.455189
recall       0.344460
dtype: float64
------------------------------*------------------------------

-------------------------Saving original text and its generated summary-------------------------


In [73]:
extract_obj.test_custom(custom_text="""
The wave roared towards them with speed and violence they had not anticipated. They both turned to run but by that time it was too late. The wave crashed into their legs sweeping both of them off of their feet. They now found themselves in a washing machine of saltwater, getting tumbled and not know what was up or down. Both were scared, not knowing how this was going to end, but it was by far the best time of the trip thus far. Welcome to my world. You will be greeted by the unexpected here and your mind will be challenged and expanded in ways that you never thought possible. That is if you are able to survive... It really shouldn't have mattered to Betty. That's what she kept trying to convince herself even if she knew it mattered to Betty more than practically anything else. Why was she trying to convince herself otherwise? As she stepped forward to knock on Betty's door, she still didn't have a convincing answer to this question that she'd been asking herself for more than two years now. The headphones were on. They had been utilized on purpose. She could hear her mom yelling in the background, but couldn't make out exactly what the yelling was about. That was exactly why she had put them on. She knew her mom would enter her room at any minute, and she could pretend that she hadn't heard any of the previous yelling. It was their first date and she had been looking forward to it the entire week. She had her eyes on him for months, and it had taken a convoluted scheme with several friends to make it happen, but he'd finally taken the hint and asked her out. After all the time and effort she'd invested into it, she never thought that it would be anything but wonderful. It goes without saying that things didn't work out quite as she expected.
""")


-------------------------YOUR TEXT-------------------------

The wave roared towards them with speed and violence they had not anticipated. They both turned to run but by that time it was too late. The wave crashed into their legs sweeping both of them off of their feet. They now found themselves in a washing machine of saltwater, getting tumbled and not know what was up or down. Both were scared, not knowing how this was going to end, but it was by far the best time of the trip thus far. Welcome to my world. You will be greeted by the unexpected here and your mind will be challenged and expanded in ways that you never thought possible. That is if you are able to survive... It really shouldn't have mattered to Betty. That's what she kept trying to convince herself even if she knew it mattered to Betty more than practically anything else. Why was she trying to convince herself otherwise? As she stepped forward to knock on Betty's door, she still didn't have a convincing answer to this 

In [84]:
from summary_moduler import SummaryDataset, SummaryModel, SummaryDataModule

class Abstractive:

    def __init__(
        self,
        train_df: pd.DataFrame = None,
        test_df: pd.DataFrame = None,
        model: SummaryModel = None,
        tokenizer: T5Tokenizer = None,
        trainer: pl.Trainer = None,
        datamodule: SummaryDataModule = None

    ):
        self.train_df = train_df
        self.test_df = test_df
        self.model = model
        self.tokenizer = tokenizer
        self.trainer = trainer
        self.datamodule = datamodule

    def train(self):
        
        title_token_counts, content_token_counts = [], []
        for _, row in self.train_df.iterrows():
            title_token_count = len(self.tokenizer.encode(row['title']))
            title_token_counts.append(title_token_count)

            content_token_count = len(self.tokenizer.encode(row['content']))
            content_token_counts.append(content_token_count)
        
        self.data_module = SummaryDataModule(train_df=self.train_df, test_df=self.test_df, tokenizer=self.tokenizer)
        
        checkpoint_callback = ModelCheckpoint(
            dirpath='Checkpoints',
            filename='model',
            save_top_k=1,
            verbose=True,
            monitor='validation_loss',
            mode='min'
        )
        
        logger = TensorBoardLogger("lightning_logs", name='summary')
        
        trainer = pl.Trainer(
                logger=logger,
                callbacks=[checkpoint_callback, TQDMProgressBar(refresh_rate=10)],
                max_epochs=ABSTRACTIVE_EPOCHS,
                devices=2,
                accelerator="auto",
                log_every_n_steps=1,
                enable_model_summary=True,
                enable_progress_bar=True
            )
        trainer.fit(model=self.model, train_dataloaders=self.data_module.train_dataloader, val_dataloaders=self.data_module.validation_dataloader)
        self.trainer = trainer
        self.trainer.save_checkpoint(MODEL_CHECKPOINT)

    def test(self):
        self.trainer.test(dataloaders=self.data_module.test_dataloader)


    def summarizeText(self, eval_flag='false', text='Dummy Data'):
        trained_model = SummaryModel.load_from_checkpoint(MODEL_CHECKPOINT)
        trained_model.freeze()
        text_encoding = self.tokenizer(
            text,
            max_length = 512,
            padding = 'max_length',
            truncation = True,
            add_special_tokens = True,
            return_tensors = 'pt'
        )
        generated_ids = trained_model.model.generate(
            input_ids = text_encoding['input_ids'],
            attention_mask = text_encoding['attention_mask'],
            max_length = 150,
            num_beams = 2,
            repetition_penalty = 2.5,
            length_penalty = 2.0,
            early_stopping = True
            
        )

        preds = [
            self.tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for generated_id in generated_ids
        ]
        abstractive_summary = "".join(preds)
        if(eval_flag):
            return abstractive_summary
        
        print()
        print('-' * 25 + 'YOUR TEXT' + '-' * 25)
        print(text)
        print('-' * 30 + '*' + '-' * 30)
        print('-' * 25 + 'ABSTRACTIVE SUMMARY' + '-' * 25)
        print(abstractive_summary)
        print('-' * 30 + '*' + '-' * 30) 

    def rouge_evaluation(self):
        abstractive_summarized_df = pd.DataFrame(columns=['original', 'summarized'], index=[i for i in range(self.test_df.shape[0])])
        abstractive_rouge_test_score = pd.DataFrame(columns=['f1_score', 'precision', 'recall'], index=[i for i in range(self.test_df.shape[0])])
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=1,
                        apply_best=0,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)

        for i in tqdm(range(self.test_df.shape[0]), desc="Loading..."):
            index = self.test_df.index[i]
            test_content = test_df.loc[index]['content']
            summary = self.summarizeText(eval_flag=True, text=test_content)
            abstractive_summarized_df.iloc[i] = [test_content, summary]
            rouge_score = evaluator.get_scores(summary, test_content)
            rouge_df = pd.DataFrame(rouge_score)
            f1_val, precision_val, recall_val = rouge_df.mean(axis=1)
            abstractive_rouge_test_score.iloc[i] = [f1_val, precision_val, recall_val]
        
        print()
        print('-' * 25 + 'ROUGE SCORE' + '-' * 25)
        print(abstractive_rouge_test_score.mean())
        print('-' * 30 + '*' + '-' * 30)

        print()
        print('-' * 25 + 'Saving original text and its generated summary' + '-' * 25)
        abstractive_summarized_df.to_csv(DOCUMENTS_PATH + 'Abstractive_Summarization_Result.csv')



In [86]:
abstractive = Abstractive(
                train_df=train_df,
                test_df=test_df,
                model= SummaryModel(),
                tokenizer= T5Tokenizer.from_pretrained(MODEL_NAME)
                )

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
abstractive.train()

In [87]:
abstractive.rouge_evaluation()

Loading...:   0%|          | 0/445 [00:00<?, ?it/s]


-------------------------ROUGE SCORE-------------------------
f1_score     0.134473
precision    0.146104
recall       0.128985
dtype: float64
------------------------------*------------------------------

-------------------------Saving original text and its generated summary-------------------------


In [None]:
abstractive.test()

In [79]:
abstractive.summarizeText(text="""
The wave roared towards them with speed and violence they had not anticipated. They both turned to run but by that time it was too late. The wave crashed into their legs sweeping both of them off of their feet. They now found themselves in a washing machine of saltwater, getting tumbled and not know what was up or down. Both were scared, not knowing how this was going to end, but it was by far the best time of the trip thus far. Welcome to my world. You will be greeted by the unexpected here and your mind will be challenged and expanded in ways that you never thought possible. That is if you are able to survive... It really shouldn't have mattered to Betty. That's what she kept trying to convince herself even if she knew it mattered to Betty more than practically anything else. Why was she trying to convince herself otherwise? As she stepped forward to knock on Betty's door, she still didn't have a convincing answer to this question that she'd been asking herself for more than two years now. The headphones were on. They had been utilized on purpose. She could hear her mom yelling in the background, but couldn't make out exactly what the yelling was about. That was exactly why she had put them on. She knew her mom would enter her room at any minute, and she could pretend that she hadn't heard any of the previous yelling. It was their first date and she had been looking forward to it the entire week. She had her eyes on him for months, and it had taken a convoluted scheme with several friends to make it happen, but he'd finally taken the hint and asked her out. After all the time and effort she'd invested into it, she never thought that it would be anything but wonderful. It goes without saying that things didn't work out quite as she expected.
""")


-------------------------YOUR TEXT-------------------------

The wave roared towards them with speed and violence they had not anticipated. They both turned to run but by that time it was too late. The wave crashed into their legs sweeping both of them off of their feet. They now found themselves in a washing machine of saltwater, getting tumbled and not know what was up or down. Both were scared, not knowing how this was going to end, but it was by far the best time of the trip thus far. Welcome to my world. You will be greeted by the unexpected here and your mind will be challenged and expanded in ways that you never thought possible. That is if you are able to survive... It really shouldn't have mattered to Betty. That's what she kept trying to convince herself even if she knew it mattered to Betty more than practically anything else. Why was she trying to convince herself otherwise? As she stepped forward to knock on Betty's door, she still didn't have a convincing answer to this 