In [3]:
# IMPORTING LIBRARIES

import nltk

# Corpora is a group presenting multiple collections of text documents. A single collection is called corpus
# Stopwords are the words that are most common words and we do not want to use them to describe our content
from nltk.corpus import stopwords

# cosine distance is used to measure document similarity in text analysis
from nltk.cluster.util import cosine_distance
import numpy as np

# graph representation
import networkx as nx

# Genrate Clean Sentences

In [6]:
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
        # [^a-zA-Z] means any character that IS NOT a-z or A-Z
    sentences.pop() 
    
    return sentence

# Similarity matrix
### Here we will be using cosine similarity to find similarity between sentences

In [7]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [8]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

# Generate Summary

In [11]:
def generate_summary(file_name, top_n=5):
    nltk.download("stopwords")
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank_numpy(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize text
    print("Summarize Text: \n", ". ".join(summarize_text))

# let's begin
generate_summary( "text.txt", 2)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


Research is "creative and systematic work undertaken to increase the stock of knowledge".[1] It involves the unbiased collection, organization and analysis of information to increase understanding of a topic or issue
A research project may be an expansion on past work in the field
To test the validity of instruments, procedures, or experiments, research may replicate elements of prior projects or the project as a whole.[2]

Indexes of top ranked_sentence order are  [(0.010638297872340463, 'u'), (0.01063829787234046, 'u'), (0.010638297872340458, ','), (0.010638297872340455, ','), (0.010638297872340455, ','), (0.010638297872340448, 'p'), (0.010638297872340448, 'p'), (0.010638297872340448, 'p'), (0.010638297872340448, 'n'), (0.010638297872340448, 'n'), (0.010638297872340446, 'p'), (0.010638297872340446, 'p'), (0.010638297872340446, 'n'), (0.010638297872340446, 'n'), (0.010638297872340444, 'p'), (0.010638297872340444, 'f'), (0.010638297872340443, 'f'), (0.010638297872340441, 'c'), (0.01063

  
