# Extractive Summarizer

Extractive summarization picks up sentences directly from the document based on a scoring function to form a coherent summary. This method work by identifying important sections of the text cropping out and stitch together portions of the content to produce a condensed version.

In [32]:
import nltk
import math
import numpy
import pandas as pd
from warnings import warn
from operator import attrgetter
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy.linalg import svd as singular_value_decomposition
from nltk.corpus import stopwords
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

True

In [9]:
text="""The Indian independence movement was a series of historic events with the ultimate aim of ending British rule in India. It lasted from 1857 to 1947.

The first nationalistic revolutionary movement for Indian independence emerged from Bengal. It later took root in the newly formed Indian National Congress with prominent moderate leaders seeking the right to appear for Indian Civil Service examinations in British India, as well as more economic rights for natives. The first half of the 20th century saw a more radical approach towards self-rule by the Lal Bal Pal triumvirate, Aurobindo Ghosh and V. O. Chidambaram Pillai.

The final stages of the independence struggle from the 1920s was characterized by Congress' adoption of Mahatma Gandhi's policy of non-violence and civil disobedience. Intellectuals such as Rabindranath Tagore, Subramania Bharati, and Bankim Chandra Chattopadhyay spread patriotic awareness. Female leaders like Sarojini Naidu, Pritilata Waddedar, and Kasturba Gandhi promoted the emancipation of Indian women and their participation in the freedom struggle.

Few leaders followed a more violent approach. This became especially popular after the Rowlatt Act, which permitted indefinite detention. The Act sparked protests across India, especially in the Punjab province, where they were violently suppressed in the Jallianwala Bagh massacre."""

In [8]:
stopwords = stopwords.words('english')

## STEPS:
    
    1. Sent tokenize
    2. Create TFITDF matrix
    3. Create Scoring Matrix 
    4. Produces a summary based on the top k most important sentences.

In [6]:
## Total 10 sentences
pd.DataFrame(sent_tokenize(sent))

Unnamed: 0,0
0,The Indian independence movement was a series ...
1,It lasted from 1857 to 1947.
2,The first nationalistic revolutionary movement...
3,It later took root in the newly formed Indian ...
4,The first half of the 20th century saw a more ...
5,The final stages of the independence struggle ...
6,"Intellectuals such as Rabindranath Tagore, Sub..."
7,"Female leaders like Sarojini Naidu, Pritilata ..."
8,Few leaders followed a more violent approach.
9,This became especially popular after the Rowla...


In [59]:
#Create a Dictionary 

#Creates mapping key = word, value = row index

In [38]:
def normalize_word(word):
        return word.lower()
def create_dictionary(document):

    words = word_tokenize(document)
    words = tuple(words)

    words = map(normalize_word, words)

    unique_words = frozenset(w for w in words if w not in stopwords.words())

    return dict((w, i) for i, w in enumerate(unique_words))
dictionary=create_dictionary(text)

In [56]:
dictionary

{'india': 0,
 'root': 1,
 'prominent': 2,
 'century': 3,
 "'s": 4,
 'aim': 5,
 'participation': 6,
 'triumvirate': 7,
 'especially': 8,
 'final': 9,
 'newly': 10,
 'radical': 11,
 'leaders': 12,
 'ghosh': 13,
 'series': 14,
 'permitted': 15,
 'formed': 16,
 '.': 17,
 'movement': 18,
 'protests': 19,
 'aurobindo': 20,
 'act': 21,
 'non-violence': 22,
 'bagh': 23,
 'ultimate': 24,
 'british': 25,
 'patriotic': 26,
 'examinations': 27,
 'approach': 28,
 'struggle': 29,
 "'": 30,
 'adoption': 31,
 'sarojini': 32,
 'characterized': 33,
 'half': 34,
 'chandra': 35,
 'promoted': 36,
 '1857': 37,
 'rowlatt': 38,
 'lal': 39,
 'v.': 40,
 'punjab': 41,
 'mahatma': 42,
 'rights': 43,
 'nationalistic': 44,
 'stages': 45,
 'chidambaram': 46,
 'tagore': 47,
 'spread': 48,
 'female': 49,
 'pritilata': 50,
 'emerged': 51,
 'lasted': 52,
 'historic': 53,
 'o.': 54,
 'suppressed': 55,
 'massacre': 56,
 ',': 57,
 'disobedience': 58,
 'natives': 59,
 '1920s': 60,
 'bal': 61,
 'violently': 62,
 'events': 63

In [None]:
### Creates matrix of shape where cells contains number of occurences of words (rows) in senteces (cols).

In [60]:
def create_matrix(document, dictionary):
    sentences = sent_tokenize(document)
    words_count = len(dictionary)
    sentences_count = len(sentences)
    if words_count < sentences_count:
        message = (
            "Number of words (%d) is lower than number of sentences (%d). "
        )
        warn(message % (words_count, sentences_count))

    matrix = numpy.zeros((words_count, sentences_count))
    for col, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        for word in words:
            if word in dictionary:
                row = dictionary[word]
                matrix[row, col] += 1

    return matrix
matrix = create_matrix(text, dictionary)

In [61]:
pd.DataFrame(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
101,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
#
#    Computes TF metrics for each sentence (column) in the given matrix and  normalize 
#    the tf weights of all terms occurring in a document by the maximum tf in that document 
#    according to ntf_{t,d} = a + (1-a)\frac{tf_{t,d}}{tf_{max}(d)^{'}}.

#    The smoothing term $a$ damps the contribution of the second term - which may be viewed 
#    as a scaling down of tf by the largest tf value in $d$
#

In [41]:
def compute_term_frequency(matrix, smooth=0.4):
    assert 0.0 <= smooth < 1.0

    max_word_frequencies = numpy.max(matrix, axis=0)
    rows, cols = matrix.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix[row, col]/max_word_frequency
                matrix[row, col] = smooth + (1.0 - smooth)*frequency

    return matrix

In [63]:
matrix = compute_term_frequency(matrix)

In [64]:
pd.DataFrame(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4
1,0.4,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4,0.4,0.4
2,0.4,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4,0.4,0.4
3,0.4,0.4,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4,0.4
4,0.4,0.4,0.4,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4
...,...,...,...,...,...,...,...,...,...,...,...
99,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.7
100,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.7,0.4,0.4,0.4
101,1.0,0.4,1.0,0.4,0.4,1.0,0.4,0.4,0.4,0.4,0.4
102,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4


The **Singular Value Decomposition (SVD)** of a matrix is a factorization of that matrix into three matrices. It has some interesting algebraic properties and conveys important geometrical and theoretical insights about linear transformations. 

In [44]:
u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

In [45]:
MIN_DIMENSIONS = 3
REDUCTION_RATIO = 1/1
def compute_ranks(sigma, v_matrix):
    assert len(sigma) == v_matrix.shape[0]

    dimensions = max(MIN_DIMENSIONS,int(len(sigma)*REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0 for i, s in enumerate(sigma))

    ranks = []

    for column_vector in v_matrix.T:
        rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))

    return ranks

def get_best_sentences(sentences, count, rating, *args, **kwargs):
    rate = rating
    if isinstance(rating, dict):
        assert not args and not kwargs
        rate = lambda s: rating[s]

    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs))
        for o, s in enumerate(sentences))

    # sort sentences by rating in descending order
    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
    # get `count` first best rated sentences
    if not isinstance(count, ItemsCount):
        count = ItemsCount(count)
    infos = count(infos)
    # sort sentences by their order in document
    infos = sorted(infos, key=attrgetter("order"))

    return tuple(i.sentence for i in infos)

In [51]:
class ItemsCount(object):
    def __init__(self, value):
        self._value = value

    def __call__(self, sequence):
        if isinstance(self._value, (bytes, str,)):
            if self._value.endswith("%"):
                total_count = len(sequence)
                percentage = int(self._value[:-1])
                # at least one sentence should be chosen
                count = max(1, total_count*percentage // 100)
                return sequence[:count]
            else:
                return sequence[:int(self._value)]
        elif isinstance(self._value, (int, float)):
            return sequence[:int(self._value)]
        else:
            ValueError("Unsuported value of items count '%s'." % self._value)

    def __repr__(self):
        return to_string("<ItemsCount: %r>" % self._value)

In [54]:
from operator import attrgetter
from collections import namedtuple

SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))

def extractive_summary(text,sentences_count):
    sentences = sent_tokenize(text)

    dictionary=create_dictionary(text)
    matrix = create_matrix(text, dictionary)
    matrix = compute_term_frequency(matrix)
    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
    ranks = iter(compute_ranks(sigma, v))
    return "".join(get_best_sentences(sentences, sentences_count,lambda s: next(ranks)))




In [55]:
extractive_summary(text,3)

"The Indian independence movement was a series of historic events with the ultimate aim of ending British rule in India.It later took root in the newly formed Indian National Congress with prominent moderate leaders seeking the right to appear for Indian Civil Service examinations in British India, as well as more economic rights for natives.The final stages of the independence struggle from the 1920s was characterized by Congress' adoption of Mahatma Gandhi's policy of non-violence and civil disobedience."