In [None]:
import nltk
nltk.download("stopwords")      # stopwords corpus
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx           # for creating social graphs

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Colab-only cell: upload the text file to summarise through Colab's file
# helper. In the recorded run the uploaded file was saved as sun.txt.
from google.colab import files
uploaded = files.upload()

Saving sun.txt to sun.txt


In [None]:
def read_file(filename):
  """Read a text file and split it into tokenised sentences.

  The whole file is read (not just its first line), runs of whitespace and
  line breaks are collapsed to single spaces, and the text is split into
  sentences on ". ". Each sentence is then split into words on whitespace.
  Punctuation inside a sentence (e.g. "plasma,") is left attached to its
  word, so the words can be re-joined into readable text.

  Args:
    filename: Path of the text file to read.

  Returns:
    A list of sentences, each sentence being a list of word strings.
    An empty or whitespace-only file yields an empty list.
  """
  # The context manager guarantees the handle is closed even on error.
  with open(filename, "r") as handle:
    raw_text = handle.read()

  # Collapse newlines / repeated spaces so that ". " is a reliable separator.
  text = " ".join(raw_text.split())

  # Drop the final full stop so the last sentence is kept (without its
  # period) instead of being discarded or carrying a trailing ".".
  if text.endswith("."):
    text = text[:-1]

  sentences = []
  for fragment in text.split(". "):
    words = fragment.split()
    if words:                     # skip empty fragments (e.g. empty file)
      sentences.append(words)
  return sentences

In [None]:
# Sanity check: display the tokenised sentences parsed from sun.txt.
read_file("sun.txt")

[['The',
  'Sun',
  'is',
  'the',
  'star',
  'at',
  'the',
  'center',
  'of',
  'the',
  'Solar',
  'System'],
 ['It',
  'is',
  'a',
  'nearly',
  'perfect',
  'ball',
  'of',
  'hot',
  'plasma,',
  'heated',
  'to',
  'incandescence',
  'by',
  'nuclear',
  'fusion',
  'reactions',
  'in',
  'its',
  'core,',
  'radiating',
  'the',
  'energy',
  'mainly',
  'as',
  'visible',
  'light,',
  'ultraviolet',
  'light,',
  'and',
  'infrared',
  'radiation'],
 ['It',
  'is',
  'by',
  'far',
  'the',
  'most',
  'important',
  'source',
  'of',
  'energy',
  'for',
  'life',
  'on',
  'Earth'],
 ['Its',
  'diameter',
  'is',
  'about',
  '1.39',
  'million',
  'kilometers,',
  'or',
  '109',
  'times',
  'that',
  'of',
  'Earth'],
 ['Its',
  'mass',
  'is',
  'about',
  '330,000',
  'times',
  'that',
  'of',
  'Earth,',
  'and',
  'it',
  'accounts',
  'for',
  'about',
  '99.86%',
  'of',
  'the',
  'total',
  'mass',
  'of',
  'the',
  'Solar',
  'System']]

In [None]:
def sentence_similarity(sent1, sent2, stopwords=None, verbose=True):
  """Return the cosine similarity of two tokenised sentences.

  Both sentences are lower-cased, turned into word-count vectors over their
  combined vocabulary (stopwords are not counted), and compared with
  ``1 - cosine_distance``.

  Args:
    sent1: First sentence as a list of word strings.
    sent2: Second sentence as a list of word strings.
    stopwords: Optional iterable of lower-case words to ignore. Note that
      this parameter name shadows the imported NLTK ``stopwords`` corpus
      inside this function.
    verbose: When true (the default), print the intermediate token lists,
      count vectors and the resulting similarity.

  Returns:
    The similarity as a float. Returns 0.0 when either sentence has no
    countable (non-stopword) words, since the cosine of a zero vector is
    undefined and would otherwise come back as NaN.
  """
  # A set gives constant-time membership tests.
  ignored = set(stopwords) if stopwords is not None else set()

  sent1 = [w.lower() for w in sent1]
  sent2 = [w.lower() for w in sent2]
  all_words = list(set(sent1 + sent2))
  # Word -> vector position, replacing a linear list.index() per token.
  position = {word: idx for idx, word in enumerate(all_words)}

  if verbose:
    print("Sent1 : \n", sent1, "\n")
    print("Sent2 : \n", sent2, "\n")
    print("All Words : \n", all_words, "\n")

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)

  for w in sent1:
    if w not in ignored:
      vector1[position[w]] += 1

  for w in sent2:
    if w not in ignored:
      vector2[position[w]] += 1

  if verbose:
    print(vector1, "\n")
    print(vector2, "\n")

  if not any(vector1) or not any(vector2):
    # A zero vector has no direction; cosine_distance would divide by zero.
    similarity = 0.0
  else:
    similarity = 1 - cosine_distance(vector1, vector2)

  if verbose:
    print(similarity)

  return similarity

In [None]:
def gen_sim_matrix(sentences, stopwords):
  """Build the pairwise sentence-similarity matrix.

  Entry [i][j] holds sentence_similarity(sentences[i], sentences[j],
  stopwords) for every ordered pair with i != j; the diagonal stays 0.

  Args:
    sentences: List of tokenised sentences (lists of words).
    stopwords: Words to ignore, forwarded to sentence_similarity.

  Returns:
    A square NumPy array with one row and one column per sentence.
  """
  count = len(sentences)
  matrix = np.zeros((count, count))
  for row, first in enumerate(sentences):
    for col, second in enumerate(sentences):
      if row != col:              # a sentence is not compared with itself
        matrix[row, col] = sentence_similarity(first, second, stopwords)
  return matrix

In [None]:
def summarizer(filename, topn=5):
  """Print and return an extractive summary of a text file.

  The file is split into sentences with read_file, a pairwise similarity
  matrix is built with gen_sim_matrix using the English NLTK stopwords,
  the matrix is turned into a graph, and the sentences are ranked by their
  PageRank score. The highest-ranked sentences form the summary, listed
  in rank order. Intermediate results are printed along the way.

  Args:
    filename: Path of the text file to summarise.
    topn: Maximum number of sentences in the summary. If the text has
      fewer sentences, all of them are used; values <= 0 give an empty
      summary.

  Returns:
    The summary string (the selected sentences joined with ". ").
  """
  stop_words = stopwords.words('english')
  sentences = read_file(filename)
  print(sentences, "\n")
  sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
  print(sentence_similarity_matrix, "\n")
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
  print(sentence_similarity_graph, "\n")
  scores = nx.pagerank(sentence_similarity_graph)
  print(scores, "\n")
  ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
  # Slicing never runs past the end of the list, so asking for more
  # sentences than exist no longer raises IndexError; max() keeps a
  # negative topn from selecting sentences from the end.
  top_ranked = ranked_sentences[:max(topn, 0)]
  summarize = [" ".join(words) for _, words in top_ranked]
  summary = ". ".join(summarize)
  print("Summary \n", summary)
  return summary

In [None]:
# Summarise sun.txt using the 3 highest-ranked sentences.
summarizer ("sun.txt", 3)

[['The', 'Sun', 'is', 'the', 'star', 'at', 'the', 'center', 'of', 'the', 'Solar', 'System'], ['It', 'is', 'a', 'nearly', 'perfect', 'ball', 'of', 'hot', 'plasma,', 'heated', 'to', 'incandescence', 'by', 'nuclear', 'fusion', 'reactions', 'in', 'its', 'core,', 'radiating', 'the', 'energy', 'mainly', 'as', 'visible', 'light,', 'ultraviolet', 'light,', 'and', 'infrared', 'radiation'], ['It', 'is', 'by', 'far', 'the', 'most', 'important', 'source', 'of', 'energy', 'for', 'life', 'on', 'Earth'], ['Its', 'diameter', 'is', 'about', '1.39', 'million', 'kilometers,', 'or', '109', 'times', 'that', 'of', 'Earth'], ['Its', 'mass', 'is', 'about', '330,000', 'times', 'that', 'of', 'Earth,', 'and', 'it', 'accounts', 'for', 'about', '99.86%', 'of', 'the', 'total', 'mass', 'of', 'the', 'Solar', 'System']] 

Sent1 : 
 ['the', 'sun', 'is', 'the', 'star', 'at', 'the', 'center', 'of', 'the', 'solar', 'system'] 

Sent2 : 
 ['it', 'is', 'a', 'nearly', 'perfect', 'ball', 'of', 'hot', 'plasma,', 'heated', 'to',