
## Importing modules

In [3]:
from nltk.corpus import stopwords

from nltk.cluster.util import cosine_distance

import numpy as np

import networkx as nx

## Open file and split into sentences

In [6]:
file = open("RRR.txt", "r", encoding="utf-8")

#This file contains one paragraph of multiple sentences

filedata = file.readlines()

article = filedata[0].split(". ") #Just do the first paragraph

sentences = []

for sentence in article:

         print(sentence)

         sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

The Telugu language Indian action epic “RRR” (short for “Rise Roar Revolt”) has returned to US theaters for an exceptional one-night-only engagement on June 1st following its initial theatrical release
Some hindsight has made it easy to guess why writer/director S.S
Rajamouli has only now broken through to Western audiences with “RRR” despite his consistent box office success
Rajamouli’s latest is an anti-colonial fable and buddy drama about the imaginary combo of two real-life freedom fighters, Komaram Bheem (N.T
Rama Rao Jr.) and Alluri Sitarama Raju (Ram Charan)
“RRR” is also a fine showcase for Rajamouli’s characteristic focus on maximalist action choreography, overwhelming stuntwork and pyrotechnics, and sophisticated computer graphics
 



## Our data: a list of sentences

In [7]:
print("Sentences are ", sentences)

Sentences are  [['The', 'Telugu', 'language', 'Indian', 'action', 'epic', '“RRR”', '(short', 'for', '“Rise', 'Roar', 'Revolt”)', 'has', 'returned', 'to', 'US', 'theaters', 'for', 'an', 'exceptional', 'one-night-only', 'engagement', 'on', 'June', '1st', 'following', 'its', 'initial', 'theatrical', 'release'], ['Some', 'hindsight', 'has', 'made', 'it', 'easy', 'to', 'guess', 'why', 'writer/director', 'S.S'], ['Rajamouli', 'has', 'only', 'now', 'broken', 'through', 'to', 'Western', 'audiences', 'with', '“RRR”', 'despite', 'his', 'consistent', 'box', 'office', 'success'], ['Rajamouli’s', 'latest', 'is', 'an', 'anti-colonial', 'fable', 'and', 'buddy', 'drama', 'about', 'the', 'imaginary', 'combo', 'of', 'two', 'real-life', 'freedom', 'fighters,', 'Komaram', 'Bheem', '(N.T'], ['Rama', 'Rao', 'Jr.)', 'and', 'Alluri', 'Sitarama', 'Raju', '(Ram', 'Charan)'], ['“RRR”', 'is', 'also', 'a', 'fine', 'showcase', 'for', 'Rajamouli’s', 'characteristic', 'focus', 'on', 'maximalist', 'action', 'choreogra

## Function to calculate similarity 
## (count the number of common words)

In [8]:
def sentence_similarity(sent1, sent2 ):

    sent1 = [w.lower() for w in sent1]

    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)

    vector2 = [0] * len(all_words)

        # build the vector for the first sentence

    for w in sent1:

         vector1[all_words.index(w)] += 1

       # build the vector for the second sentence

    for w in sent2:

         vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)

## Create the similarity matrix 

In [9]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2:  # ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2])

print("Similarity matrix:\n", similarity_matrix)

Similarity matrix:
 [[0.         0.10660036 0.12862394 0.07715167 0.         0.18042196
  0.        ]
 [0.10660036 0.         0.14625448 0.         0.         0.
  0.        ]
 [0.12862394 0.14625448 0.         0.         0.         0.04950738
  0.        ]
 [0.07715167 0.         0.         0.         0.0727393  0.17817416
  0.        ]
 [0.         0.         0.         0.0727393  0.         0.13608276
  0.        ]
 [0.18042196 0.         0.04950738 0.17817416 0.13608276 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]]


## Get the pagerank scores

In [10]:
# Step 3 - Rank sentences in similarity martix

sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)

scores = nx.pagerank(sentence_similarity_graph)

print("scores", scores)

scores {0: 0.21508528044532071, 1: 0.12147617274451285, 2: 0.15013684627817148, 3: 0.14934347854981908, 4: 0.1028513336497258, 5: 0.2367166444300088, 6: 0.024390243902441208}


## Sort sentences by pagerank

In [11]:
# Step 4 - Sort the rank and pick top sentences

ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

print("Indexes of top ranked_sentence order are \n\n",

ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.2367166444300088, ['“RRR”', 'is', 'also', 'a', 'fine', 'showcase', 'for', 'Rajamouli’s', 'characteristic', 'focus', 'on', 'maximalist', 'action', 'choreography,', 'overwhelming', 'stuntwork', 'and', 'pyrotechnics,', 'and', 'sophisticated', 'computer', 'graphics']), (0.21508528044532071, ['The', 'Telugu', 'language', 'Indian', 'action', 'epic', '“RRR”', '(short', 'for', '“Rise', 'Roar', 'Revolt”)', 'has', 'returned', 'to', 'US', 'theaters', 'for', 'an', 'exceptional', 'one-night-only', 'engagement', 'on', 'June', '1st', 'following', 'its', 'initial', 'theatrical', 'release']), (0.15013684627817148, ['Rajamouli', 'has', 'only', 'now', 'broken', 'through', 'to', 'Western', 'audiences', 'with', '“RRR”', 'despite', 'his', 'consistent', 'box', 'office', 'success']), (0.14934347854981908, ['Rajamouli’s', 'latest', 'is', 'an', 'anti-colonial', 'fable', 'and', 'buddy', 'drama', 'about', 'the', 'imaginary', 'combo', 'of', 'two', 'real-life', 'freed

## Pick the top “n” sentences

In [12]:
#Step 5 - How many sentences to pick

n = int(input("How many sentences do you want in the summary? "))

#n=2

summarize_text = []

for i in range(n):

      summarize_text.append(" ".join(ranked_sentence[i][1]))

## printing summary

In [14]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 “RRR” is also a fine showcase for Rajamouli’s characteristic focus on maximalist action choreography, overwhelming stuntwork and pyrotechnics, and sophisticated computer graphics. The Telugu language Indian action epic “RRR” (short for “Rise Roar Revolt”) has returned to US theaters for an exceptional one-night-only engagement on June 1st following its initial theatrical release. Rajamouli has only now broken through to Western audiences with “RRR” despite his consistent box office success. Rajamouli’s latest is an anti-colonial fable and buddy drama about the imaginary combo of two real-life freedom fighters, Komaram Bheem (N.T. Some hindsight has made it easy to guess why writer/director S.S
