<a href="https://colab.research.google.com/github/Sonu-P/BDS/blob/main/NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers wikipedia pytextrank -q
# !pip install wikipedia -q
# !pip install pytextrank -q

In [2]:
import os
import pandas as pd
import numpy as np
import math
import random
import re
import warnings
import glob
import nltk
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import models, InputExample, losses
import wikipedia
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import spacy
import pytextrank
from IPython.display import HTML, display
warnings.filterwarnings("ignore")

In [3]:
def set_css(info=None):
  """Emit a <style> block that makes <pre> output wrap long lines.

  Intended to be used as an IPython ``pre_run_cell`` callback (see the
  registration right below), so the style is displayed before a cell runs.

  Args:
    info: Optional, unused. IPython documents ``pre_run_cell`` callbacks as
      receiving an execution-info object; accepting it keeps this callback
      valid whether or not IPython adapts zero-argument callbacks. It can
      still be called without arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Run set_css before every interactively executed cell.
get_ipython().events.register('pre_run_cell', set_css)

In [4]:
# Build a sentence-embedding model out of two sentence-transformers modules:
# a transformer that produces token embeddings, followed by a pooling layer.
model_path = 'google/bigbird-roberta-base' # Hugging Face model identifier handed to models.Transformer
max_seq_length = 128  # forwarded to models.Transformer; NOTE(review): presumably inputs longer than this many tokens are truncated - confirm

word_embedding_model = models.Transformer(model_path, max_seq_length = max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens = True) # mean of the token embeddings is used as the sentence embedding
# `model` is the module-level encoder; processing() calls model.encode() on it.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [5]:
# The processing function has four main sections:
# 1. pre-processing
# 2. word cloud
# 3. key phrase extraction
# 4. summarization

def _clean_text(text):
    """Remove '='-delimited sub-heading markup from ``text``."""
    # Drop everything between the first and the last "=" on a line
    # ("." does not match newlines, so this works line by line).
    text = re.sub(r'=(.*)=', '', text)
    # Replace any remaining "=" with a space.
    return re.sub(r'=', ' ', text)


def _show_wordcloud(text):
    """Draw a word cloud of ``text`` with matplotlib."""
    # max_font_size and max_words were chosen after a few iterations as the
    # values that best fit the data.
    wordcloud = WordCloud(max_font_size=100, max_words=50, background_color="black").generate(text)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def _print_key_phrases(text, top_phrases):
    """Print the first ``top_phrases`` TextRank phrases found in ``text``."""
    # TextRank is a PageRank-like algorithm that gives more weight to words or
    # phrases that are more strongly linked to the surrounding words.
    nlp = spacy.load("en_core_web_sm")  # small English spaCy pipeline
    nlp.add_pipe("textrank")  # component made available by `import pytextrank`
    doc = nlp(text)
    for phrase in doc._.phrases[:top_phrases]:
        print('* {}\n'.format(phrase.text))


def _split_sentences(text):
    """Split ``text`` into sentences with a hand-written regex.

    A match starts at a non-whitespace character and keeps consuming:
    a capital letter followed by up to three lowercase letters and a period
    (abbreviation-like tokens such as "Dr."), any character that is not one of
    ". ? !", or a period that is not followed by whitespace and a capital
    letter. One final character (normally the terminator) is then included.
    """
    punct_regex = r"(?=\S)(?:[A-Z][a-z]{0,3}\.|[^.?!]|\.(?!\s+[A-Z]))*.?"
    return re.findall(punct_regex, text)


def _select_summary_indices(sentences, summary_sentences, min_similarity):
    """Return the sorted indices of the sentences chosen for the summary.

    Every sentence is compared with every sentence (itself included) by cosine
    similarity of its embedding, rounded to two decimals. Pairs whose rounded
    score is greater than ``min_similarity`` and different from 1 are kept, the
    kept scores are summed per sentence, and the ``summary_sentences``
    sentences with the largest sums are selected.
    """
    if not sentences:
        return []

    # Encode each sentence exactly once and build the full similarity matrix
    # in one call, instead of re-encoding every sentence for every pair.
    embeddings = model.encode(sentences)
    similarity = util.cos_sim(embeddings, embeddings).cpu().numpy().astype(float)

    count = len(sentences)
    pairs = pd.DataFrame({
        'element': np.repeat(np.arange(count), count),  # row i -> sentence i
        'sim': np.round(similarity.ravel(), 2),
    })
    # "!= 1" removes the comparison of a sentence with itself (and any pair
    # whose score rounds to exactly 1).
    kept = pairs[(pairs['sim'] > min_similarity) & (pairs['sim'] != 1)]
    best = (
        kept.groupby('element')['sim'].sum()
        .sort_values(ascending=False)
        .head(summary_sentences)
    )
    # Back to document order so the summary reads naturally.
    return sorted(best.index.tolist())


def processing(text, top_phrases=3, summary_sentences=4, min_similarity=0.7):
    """Print an overview, key phrases and an extractive summary of ``text``.

    Steps, in order:
      1. Clean the text (remove '='-delimited sub-headings).
      2. Show a word cloud of the cleaned text.
      3. Print the top TextRank key phrases (spaCy + pytextrank).
      4. Split the text into sentences, score them by their embedding
         similarity to the other sentences using the module-level ``model``,
         and print the best-scoring sentences in their original order.

    Args:
        text: Raw document text.
        top_phrases: Number of key phrases to print (default 3).
        summary_sentences: Maximum number of sentences in the summary
            (default 4).
        min_similarity: A sentence pair contributes to the score only if its
            cosine similarity, rounded to two decimals, is greater than this
            value (default 0.7) and not equal to 1.

    Returns:
        None. All results are printed or plotted.
    """
    text = _clean_text(text)
    if not text.strip():
        # WordCloud cannot be generated from empty text; stop with a clear
        # message instead of failing inside the plotting code.
        print("Nothing to process: the text is empty after cleaning.")
        return

    print("\n************************")
    print("Overview of the document:")
    print("************************\n")
    _show_wordcloud(text)

    print("\n************************************")
    print("Important phrases in the document:")
    print("************************************\n")
    _print_key_phrases(text, top_phrases)

    sentences = _split_sentences(text)
    chosen = _select_summary_indices(sentences, summary_sentences, min_similarity)
    summary = ' '.join(sentences[i] for i in chosen)

    print("\n*********")
    print("Summary:")
    print("*********\n")
    print(summary)

In [7]:
# Only the first MAX_ARTICLE_CHARS characters of the article are analysed
# (presumably to keep the run time of processing() manageable).
MAX_ARTICLE_CHARS = 4000

sample = input("Please select the topic\n")
try:
    w = wikipedia.page(sample)
except wikipedia.exceptions.DisambiguationError as err:
    # The title matches several articles; show some candidates to pick from.
    print("'{}' is ambiguous. Try one of: {}".format(sample, ", ".join(err.options[:10])))
except wikipedia.exceptions.PageError:
    print("No Wikipedia page found for '{}'.".format(sample))
else:
    processing(w.content[:MAX_ARTICLE_CHARS])

KeyboardInterrupt: ignored