<a href="https://colab.research.google.com/github/RonnieCOOL/Text-Summarizer/blob/main/Text_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
%matplotlib

Using matplotlib backend: agg


In [2]:
sentences = brown.sents('ca03')

In [3]:
len(sentences)

112

In [4]:
[' '.join(sent) for sent in sentences]

['Several defendants in the Summerdale police burglary trial made statements indicating their guilt at the time of their arrest , Judge James B. Parsons was told in Criminal court yesterday .',
 'The disclosure by Charles Bellows , chief defense counsel , startled observers and was viewed as the prelude to a quarrel between the six attorneys representing the eight former policemen now on trial .',
 'Bellows made the disclosure when he asked Judge Parsons to grant his client , Alan Clements , 30 , a separate trial .',
 'Bellows made the request while the all-woman jury was out of the courtroom .',
 'Fears prejudicial aspects',
 "`` The statements may be highly prejudicial to my client '' , Bellows told the court .",
 '`` Some of the defendants strongly indicated they knew they were receiving stolen property .',
 "It is impossible to get a fair trial when some of the defendants made statements involving themselves and others '' .",
 "Judge Parsons leaned over the bench and inquired , `` 

In [5]:
class TextCleaner():
    
    def __init__(self):
        self.stop_words = set(stopwords.words("english"))
        self.punctuations = set(string.punctuation)
        self.pos_tags = {
                NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
                VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
                ADJ: ['JJ', 'JJR', 'JJS'],
                ADV: ['RB', 'RBR', 'RBS', 'WRB']
        }


    def _remove_stop_words(self, words):
        return [w for w in words if w not in self.stop_words]
     
    
    def _remove_regex(self):
        self.input_sent = " ".join([w.lower() for w in self.input_sent])
        self.input_sent = re.sub(r"i'm", "i am", self.input_sent)
        self.input_sent = re.sub(r"he's", "he is", self.input_sent)
        self.input_sent = re.sub(r"she's", "she is", self.input_sent)
        self.input_sent = re.sub(r"that's", "that is", self.input_sent)
        self.input_sent = re.sub(r"what's", "what is", self.input_sent)
        self.input_sent = re.sub(r"where's", "where is", self.input_sent)
        self.input_sent = re.sub(r"\'ll", " will", self.input_sent)
        self.input_sent = re.sub(r"\'ve", " have", self.input_sent)
        self.input_sent = re.sub(r"\'re", " are", self.input_sent)
        self.input_sent = re.sub(r"\'d", " would", self.input_sent)
        self.input_sent = re.sub(r"won't", "will not", self.input_sent)
        self.input_sent = re.sub(r"can't", "cannot", self.input_sent)
        self.input_sent = re.sub(r"don't", "do not", self.input_sent)
        patterns = re.finditer("#[\w]*", self.input_sent)
        for pattern in patterns:
            self.input_sent = re.sub(pattern.group().strip(), "", self.input_sent)
        self.input_sent = "".join(ch for ch in self.input_sent if ch not in self.punctuations)
    
    
    def _tokenize(self):
        return word_tokenize(self.input_sent)
    
    
    def _process_content_for_pos(self, words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in self.pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words
       
                 
    def _remove_noise(self):
        self._remove_regex()
        words = self._tokenize()
        noise_free_words = self._remove_stop_words(words)
        return noise_free_words
    
    
    def _normalize_text(self, words):
        lem = WordNetLemmatizer()
        pos_words = self._process_content_for_pos(words)
        normalized_words = [lem.lemmatize(w, pos=p) for w, p in pos_words]
        return normalized_words
    
    
    def clean_up(self, input_sent):
        self.input_sent = input_sent
        cleaned_words = self._remove_noise()
        cleaned_words = self._normalize_text(cleaned_words)
        return cleaned_words

In [6]:
# PageRank Algorithm
def pagerank(M, eps=1.0e-8, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * M) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [7]:
# Calculating cosine similarity among sentences
def sentence_similarity(sent1, sent2):
    text_cleaner = TextCleaner()
    
    sent1 = text_cleaner.clean_up(sent1)
    sent2 = text_cleaner.clean_up(sent2)
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for w in sent1:
        vector1[all_words.index(w)] += 1
    
    for w in sent2:
        vector2[all_words.index(w)] += 1
    
    return 1 - cosine_distance(vector1, vector2)

In [8]:
# Similarity Adjacency Matrix for PageRank
def build_similarity_matrix(sentences):
    S = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                S[i][j] = sentence_similarity(sentences[i], sentences[j])
    
    for i in range(len(S)):
        S[i] /= S[i].sum()
    return S

In [9]:
S = build_similarity_matrix(sentences)

  # This is added back by InteractiveShellApp.init_path()


In [10]:
S

array([[0.        , 0.01687557, 0.07942969, ..., 0.        , 0.        ,
        0.        ],
       [0.04705707, 0.        , 0.17510134, ..., 0.04705707, 0.        ,
        0.        ],
       [0.06799961, 0.05375841, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.01895401, 0.        , ..., 0.        , 0.        ,
        0.03790803],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.71010205],
       [0.        , 0.        , 0.        , ..., 0.039031  , 0.12342687,
        0.        ]])

In [11]:
sentence_ranks = pagerank(S)

In [12]:
sentence_ranks

array([[1.88548821e-02],
       [6.55282597e-03],
       [8.29578568e-03],
       [7.44373886e-03],
       [5.58660146e-03],
       [4.52984224e-03],
       [4.99660200e-03],
       [1.57881208e-02],
       [1.05258395e-02],
       [6.49803749e-03],
       [5.50145201e-03],
       [1.89734986e-02],
       [1.48051601e-02],
       [1.25541667e-02],
       [1.89610879e-02],
       [1.96704673e-02],
       [1.80403302e-02],
       [1.15302902e-02],
       [1.41160233e-03],
       [1.05722134e-02],
       [1.09833692e-02],
       [1.11123861e-02],
       [3.19474573e-04],
       [1.45642746e-02],
       [8.31259585e-03],
       [1.40540356e-02],
       [2.10450333e-03],
       [4.93080637e-03],
       [6.38187192e-03],
       [1.21344513e-02],
       [8.62344179e-03],
       [1.39783984e-02],
       [2.59184152e-03],
       [9.10677397e-03],
       [6.19607998e-03],
       [2.19223157e-03],
       [8.36613055e-03],
       [1.74079361e-02],
       [6.74489224e-03],
       [1.15109389e-02],


In [13]:
ranked_sentence_indexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]

In [14]:
ranked_sentence_indexes

[15,
 74,
 108,
 11,
 14,
 0,
 16,
 89,
 51,
 37,
 105,
 60,
 98,
 7,
 12,
 101,
 66,
 23,
 73,
 59,
 25,
 31,
 41,
 67,
 109,
 42,
 56,
 90,
 94,
 13,
 102,
 43,
 29,
 49,
 82,
 107,
 95,
 88,
 75,
 17,
 65,
 39,
 83,
 21,
 58,
 20,
 44,
 19,
 8,
 61,
 71,
 33,
 52,
 45,
 30,
 36,
 24,
 2,
 91,
 92,
 3,
 68,
 104,
 81,
 54,
 38,
 76,
 69,
 103,
 99,
 1,
 9,
 28,
 77,
 34,
 93,
 70,
 46,
 4,
 10,
 100,
 6,
 97,
 27,
 5,
 110,
 72,
 84,
 86,
 63,
 64,
 55,
 57,
 96,
 106,
 32,
 48,
 35,
 26,
 47,
 40,
 18,
 50,
 79,
 85,
 62,
 78,
 111,
 80,
 22,
 53,
 87]

In [15]:
SUMMARY_SIZE = 5

In [16]:
selected_sentences = sorted(ranked_sentence_indexes[:SUMMARY_SIZE])

In [17]:
selected_sentences

[11, 14, 15, 74, 108]

In [18]:
summary = itemgetter(*selected_sentences)(sentences)

In [19]:
# Generated Summary
for sent in summary:
    print(' '.join(sent))

There is a conflict among the defendants '' .
The President spent much of the week-end at his summer home on Cape Cod writing the first drafts of portions of the address with the help of White House aids in Washington with whom he talked by telephone .
Shortly after the Chief Executive returned to Washington in midmorning from Hyannis Port , Mass. , a White House spokesman said the address text still had `` quite a way to go '' toward completion .
The plan does not cover doctor bills .
Customary Senate rules were ignored in order to speed approval of the Negro leader as administrator of the housing and home finance agency .
