* Performing Text summarization using Extractive Summarization technique.

In [1]:
import re
import numpy as np
import pandas as pd
import wordninja
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import traceback

In [2]:
class NewsParsing(object):
    def __init__(self):
        self.regex_pattern = r'[^A-Za-z0-9.]'
        self.stopwords_list = stopwords.words('english')
        self.words_list = list(words.words())
        self.vectorizer = TfidfVectorizer()
        self.file_path = "./Articles/"
    
    # function to read data...
    def read_data(self, file_name):
        try:
            # get the filename...
            file_name = self.file_path + file_name
            file = open(file_name,'r',encoding='cp1252')
            filedata = file.readlines()

            # create a complete article...
            complete_article = ' '.join(filedata)
            complete_article = complete_article.replace('\n','').strip().lower()
            
            return complete_article
        
        except Exception as e:
            print(traceback.format_exc(e))
        
    # function to tokenize the data...
    def tokenize_data(self, news_data):
        # Tokenize the articles...
        tokenized_article = sent_tokenize(news_data)
        return tokenized_article
    
    # function to build similarity matrix...
    def generate_similarity_score(self, sentence1, sentence2):
        try:
            # create the vectors....
            sparse_matrix = self.vectorizer.fit_transform([sentence1, sentence2])

            # get the feature names...
            feature_names = self.vectorizer.get_feature_names()

            dense = sparse_matrix.todense()

            denseList = dense.tolist()

            # create a dataframe
            df = pd.DataFrame(denseList, columns=feature_names)

            vector1 = list(df.iloc[0])
            vector2 = list(df.iloc[0])

            # build cosine similarity score...
            cos_distance = cosine_distance(vector1, vector2)

            cosine_similarity = (1 - cos_distance)

            return cosine_similarity
        
        except Exception as e:
            print(traceback.format_exc(e))

    # function to create a similarity matrix...
    def generate_similarity_matrix(self, tokenized_article):
        try:
            # Create a similarity matrix...
            similarity_matrix = np.zeros((len(tokenized_article), len(tokenized_article)))

            # Iterate over the sentences...
            for index1 in range(0, len(tokenized_article)):
                for index2 in range(0, len(tokenized_article)):
                    if index1 == index2:
                        similarity_matrix[index1][index2] = 1.0
                    else:
                        # Build a similarity matrix here....
                        similarity_matrix[index1][index2] = self.generate_similarity_score(tokenized_article[index1], tokenized_article[index2])

            return similarity_matrix
        
        except Exception as e:
            print(traceback.format_exc(e))
            
    # function to rank the sentences...
    def rank_sentences(self, similarity_matrix, tokenized_article, top_n=5):
        summarized_text = []
        # Create a sentence similarity graph...
        sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(sentence_similarity_graph)
        ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(tokenized_article)), reverse=True)
        
        for i in range(top_n):
            summarized_text.append("".join(ranked_sentence[i][1]))
        
        # Step 6 - output the summarize text
        summarized_text = " ".join(summarized_text)
        summarized_text = " ".join(summarized_text.split())
        summarized_text = summarized_text.strip()
        
        return summarized_text 
        
    # function to summarize the text...
    def summarize_text(self, file_name):
        # Step 1: Read the Articles...
        complete_article = self.read_data(file_name)
        
        # Step 2: Tokenize the data...
        tokenized_article = self.tokenize_data(complete_article)
        
        # Step 3: Generate Similarity Matrix...
        similarity_matrix = self.generate_similarity_matrix(tokenized_article)
        
        # Step 4: Rank the Sentences...
        summarized_text = self.rank_sentences(similarity_matrix, tokenized_article)
        
        return summarized_text
        
        

In [11]:
file_name = "News_2.txt"

In [8]:
obj_parse = NewsParsing()

In [12]:
summarized_text = obj_parse.summarize_text(file_name)

In [13]:
summarized_text

'"while we don\'t know which of these apps have implemented the new sdk, we can confirm that agora has released the sdk and has followed up with its developers to urge them to implement the update," povolny told zdnet. mcafee said it discovered this issue last year, in april, during a security audit for temi, a personal robot used in retail stores, which also supports audio and video calling.a subsequent investigation also found clues that this behavior also impacted other apps using the sdk.steve povolny, head of advanced threat research at mcafee, told zdnet in an email last week that they notified agora of their findings and that the company responded by releasing a new sdk in december 2020 that was not vulnerable to cve-2020-25605. any attacker sitting on the same network as a targeted user could intercept the traffic in the initial phases of a call, extract call identifiers, and then join the call without being detected. a small library that provides audio and video calling capabi