# Import Required Modules
  
*   Download require nltk components



In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Finding Sentence Similarity


*   Cleaning Sentences
*   Making vectors from unique words.
*   Calculating the consine similarity.



In [8]:

def find_similarity(sentence1:list, sentence2:list, stopwords=None):
  # Removing all stop words
  filtered_sentence1 = [word.lower() for word in sentence1 if word not in stopwords]
  filtered_sentence2 = [word.lower() for word in sentence2 if word not in stopwords]


  # get unique words
  unique_words = list()
  unique_words.extend(filtered_sentence1)
  unique_words.extend(filtered_sentence2)

  unique_words = list(set(unique_words)) # use set to ensure only one case of each word

  # Get frequency information
  frequency_1 = dict()
  frequency_2 = dict()

  for word in filtered_sentence1:
    frequency_1[word]  = frequency_1.get(word, 0) +1;

  for word in filtered_sentence2:
    frequency_2[word]  = frequency_2.get(word, 0) +1;

  # Create vectors
  size_unique_words = len(unique_words)
  vector_sentence1 = [0 for i in unique_words]
  vector_sentence2 = [0 for i in unique_words]

  # Assigning frequencies to vectors
  for word,frequency in frequency_1.items():
    vector_sentence1[unique_words.index(word)] = frequency

  for word,frequency in frequency_2.items():
    vector_sentence2[unique_words.index(word)] =frequency
  # Calculate similarity between vectors
  return 1-cosine_distance(vector_sentence1, vector_sentence2)

#Make Similarity Matrix
* Iterate over all sentences.
* Generate  a similarity score for all “sentence pairs” that aren’t the same.


In [9]:
def construct_similarity_matrix(all_sentences, stopwords):
  # Initialize matrix
  matrix = [

            [0 for i in range(len(all_sentences))]
            for j in range(len(all_sentences))
  ]
  # Iterate over all sentences
  for i in range(len(all_sentences)):
    for j in range(len(all_sentences)):
      # Calculate similarity score for each sentence pair
      if(i != j):
        matrix[i][j] = find_similarity(all_sentences[i], all_sentences[j], stopwords)
      else:
        # Skip same sentence pairs.
        continue
  return matrix

# Get Text Rank score
* Iterate over similarity matrix.
* Calculate news score.
* Assign score.
* Add Stop iteration mechanic.


In [10]:
def text_rank(similarity_matrix):
    # Initialize scores
    sentence_count = len(similarity_matrix)
    scores = [ 1 for i in range(sentence_count)]
    # Set damping factors
    damping_factor = 0.85
    epsilon = 1e-5

    for x in range(100):
        # Initialize prior scores
        prior_scores = scores.copy()
        for i in range(sentence_count):
            # Calculate new score
            new_score = (1 - damping_factor)

            sum_of_products = 0
            for j in range(sentence_count):
                product = similarity_matrix[j][i] * prior_scores[j]
                sum_of_products += product


            new_score += (damping_factor * sum_of_products)

            # Assign new score
            scores[i] = new_score
        # Check if the change a minute change.
        if sum(abs(scores[i] - prior_scores[i]) for i in range(sentence_count)) < epsilon:
            break

    return scores

# Construct summary
* Tokenize text to sentences.
* Apply the above methods.
* Get the desired amount of sentences.
* Display summary.


In [11]:
def extract_summary(text, top_n=5):

    stop_words = set(stopwords.words('english'))

    # Get the sentences from the original text
    sentences = sent_tokenize(text)

    # Get the similarity matrix from all sentences
    sentence_similarity_matrix = construct_similarity_matrix(sentences, stop_words)

    # Get the scores for each sentence
    scores = text_rank(sentence_similarity_matrix)

    # Put the sentences in decsending order based on the score
    ranked_sentences = []
    for i, s in enumerate(sentences):
        ranked_sentences.append((scores[i], s))

    ranked_sentences = sorted(ranked_sentences, reverse=True)

    # Get the summary sentences and make a combine string.
    summary = [sentence for score, sentence in ranked_sentences[:top_n]]
    return " ".join(summary)

# Test Text Rank Algorithm

In [12]:
text = "The Western Chalukya Empire ruled most of the western Deccan, South India, between the 10th and 12th centuries. This Kannadiga dynasty is sometimes called the Kalyani Chalukya after its regal capital at Kalyani, today's Basavakalyan in the modern Bidar District of Karnataka state, and alternatively the Later Chalukya from its theoretical relationship to the sixth-century Chalukya dynasty of Badami. Prior to the rise of the Western and Eastern Chalukyas, the Rashtrakuta Empire of Manyakheta controlled most of the Deccan and Central India for over two centuries. In 973, seeing confusion in the Rashtrakuta Empire after a successful invasion of their capital by the ruler of the Paramara dynasty of Malwa, Tailapa II, a feudatory of the Rashtrakuta dynasty ruling from Bijapur region, defeated his overlords and made Manyakheta his capital. The dynasty quickly rose to power and grew into an empire under Someshvara I who moved the capital to Kalyani."
summary = extract_summary(text, top_n=2)
print(summary)

Prior to the rise of the Western and Eastern Chalukyas, the Rashtrakuta Empire of Manyakheta controlled most of the Deccan and Central India for over two centuries. The dynasty quickly rose to power and grew into an empire under Someshvara I who moved the capital to Kalyani.
