In [1]:
# POS tagging of a simple passage
import nltk
import re
import numpy as np
import networkx as nx
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

In [2]:
# Sample passage used as input for the POS-tagging demo.
# NOTE: the text contains the misspellings "firts" and "againts"; they are part
# of the input data, so they also appear as tokens in the tagger output.
star_wars = """It is a period of civil war. Rebel spaceships, striking from a hidden base, have won their firts victory againts
the evil Galactic Empire. During the battle, Rebel spies managed to steal secret plans to the
Empire's ultimate weapon, the DEATH STAR, an armored space station with enough power to destroy an entire
planet. Pursued by the Empire's sinister agents, Princess Leia races home aboard her starship, custodian of 
the stolen plans that can save her people and restore freedom to the galaxy..."""

In [3]:
# Replace every character that is not an ASCII letter with a space, then
# collapse the resulting runs of whitespace into single spaces.
clean_words = " ".join(re.sub("[^a-zA-Z]", " ", star_wars).split())

# Tokenize the cleaned text and attach a part-of-speech tag to each token.
tokens = word_tokenize(clean_words)
pos_tokens = pos_tag(tokens)
print(pos_tokens)

[('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('period', 'NN'), ('of', 'IN'), ('civil', 'JJ'), ('war', 'NN'), ('Rebel', 'NNP'), ('spaceships', 'VBZ'), ('striking', 'VBG'), ('from', 'IN'), ('a', 'DT'), ('hidden', 'JJ'), ('base', 'NN'), ('have', 'VBP'), ('won', 'VBN'), ('their', 'PRP$'), ('firts', 'NNS'), ('victory', 'NN'), ('againts', 'VBZ'), ('the', 'DT'), ('evil', 'JJ'), ('Galactic', 'NNP'), ('Empire', 'NNP'), ('During', 'IN'), ('the', 'DT'), ('battle', 'NN'), ('Rebel', 'NNP'), ('spies', 'NNS'), ('managed', 'VBD'), ('to', 'TO'), ('steal', 'VB'), ('secret', 'JJ'), ('plans', 'NNS'), ('to', 'TO'), ('the', 'DT'), ('Empire', 'NNP'), ('s', 'NN'), ('ultimate', 'JJ'), ('weapon', 'IN'), ('the', 'DT'), ('DEATH', 'NNP'), ('STAR', 'NNP'), ('an', 'DT'), ('armored', 'JJ'), ('space', 'NN'), ('station', 'NN'), ('with', 'IN'), ('enough', 'JJ'), ('power', 'NN'), ('to', 'TO'), ('destroy', 'VB'), ('an', 'DT'), ('entire', 'JJ'), ('planet', 'NN'), ('Pursued', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('Empire', '

In [4]:
def read_article(filename):
    """Read the first line of a text file and split it into tokenized sentences.

    Only the first line of the file is used. It is split into sentences on
    ". ", every sentence is echoed to stdout, and each one is split into
    words on single spaces. The last fragment produced by the sentence split
    is discarded before returning.

    Args:
        filename: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        A list of sentences, each represented as a list of word strings.
        An empty list is returned when the file has no content.
    """
    # The context manager guarantees the handle is closed. The encoding is
    # explicit so the result does not depend on the platform's default code
    # page (presumably the cause of mojibake such as "â€™" in earlier runs).
    with open(filename, "r", encoding="utf-8") as file:
        filedata = file.readlines()

    # An empty file has no first line to index.
    if not filedata:
        return []

    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        print(sentence)
        # NOTE(review): str.replace matches its first argument literally, not
        # as a regular expression, so this call leaves ordinary text unchanged.
        # It is kept as-is because generate_summary rebuilds the printed
        # summary from these tokens; stripping punctuation here would also
        # strip it from the summary.
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

    # Discard the final fragment of the split (the text after the last ". ").
    sentences.pop()
    return sentences

In [5]:
def sentence_sim(sentence1, sentence2, stopwords = None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words listed in ``stopwords`` are not counted.

    Args:
        sentence1: First sentence as a list of word strings.
        sentence2: Second sentence as a list of word strings.
        stopwords: Optional collection of lower-case words to ignore.
            (This parameter name shadows the ``nltk.corpus.stopwords`` import
            inside the function; it is kept for caller compatibility.)

    Returns:
        ``1 - cosine_distance(vector1, vector2)``, or ``0.0`` when either
        sentence has no countable words (empty, or stopwords only).
    """
    # A set gives constant-time membership checks for the stopword filter.
    ignored = set(stopwords) if stopwords is not None else set()

    words1 = [w.lower() for w in sentence1]
    words2 = [w.lower() for w in sentence2]

    # Map each distinct word to one vector slot (first-seen order) instead of
    # scanning a list with .index() for every word.
    slot_of = {}
    for w in words1 + words2:
        slot_of.setdefault(w, len(slot_of))

    vector1 = [0] * len(slot_of)
    vector2 = [0] * len(slot_of)

    #build the vector for 1st sentence
    for w in words1:
        if w not in ignored:
            vector1[slot_of[w]] += 1
    #build the vector for 2nd sentence
    for w in words2:
        if w not in ignored:
            vector2[slot_of[w]] += 1

    # An all-zero vector has zero norm, so the cosine computation would divide
    # by zero and yield NaN. Treat such sentences as sharing nothing.
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1-cosine_distance(vector1,vector2)

In [6]:
def build_similarity_mat(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: List of tokenized sentences (each a list of words).
        stop_words: Words passed through to ``sentence_sim`` to be ignored.

    Returns:
        A square NumPy array of size ``len(sentences)`` in which entry
        ``[i][j]`` holds ``sentence_sim(sentences[i], sentences[j], stop_words)``
        for ``i != j``; the diagonal is left at zero.
    """
    count = len(sentences)
    sim_matrix = np.zeros((count, count))

    # Visit every ordered pair of distinct sentences in row-major order.
    off_diagonal = (
        (row, col)
        for row in range(count)
        for col in range(count)
        if row != col
    )
    for row, col in off_diagonal:
        sim_matrix[row, col] = sentence_sim(sentences[row], sentences[col], stop_words)

    return sim_matrix

In [7]:
def generate_summary(filename, top_n=5):
    """Print an extractive summary of the article stored in ``filename``.

    Sentences are read with ``read_article``, compared pairwise with
    ``build_similarity_mat``, and ranked by running PageRank on the graph
    built from the similarity matrix. The highest-ranked sentences are joined
    and printed.

    Args:
        filename: Path of the text file to summarize.
        top_n: Maximum number of sentences to include. If the article has
            fewer sentences, all of them are used.

    Returns:
        None. The ranking and the summary are written to stdout.
    """
    stop_words = stopwords.words('english')
    summarize_text = []

    #Step 1 : Read text and tokenize
    sentences = read_article(filename)

    #Step 2 : Generate similarity matrix across sentences
    sentence_sim_matrix = build_similarity_mat(sentences,stop_words)

    #Step 3 : Rank sentences in similarity matrix
    sentences_sim_graph = nx.from_numpy_array(sentence_sim_matrix)
    scores = nx.pagerank(sentences_sim_graph)

    #Step 4 : Sort the rank and pick the top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse = True)
    print("Indexes of top ranked sentence order are" , ranked_sentence)
    # Slicing clamps the selection to the sentences that exist, so a short
    # article no longer raises IndexError; max() keeps a negative top_n
    # selecting nothing, as range(top_n) did.
    for _, words in ranked_sentence[:max(top_n, 0)]:
        summarize_text.append(" ".join(words))

    #Step 5 : Print the output
    print("Summarize text = \n", ". ".join(summarize_text))

In [8]:
# Example run: summarize example4.txt using at most 5 top-ranked sentences.
generate_summary("example4.txt",5)

Different types of restorative imaging systems such as Computed Tomography (CT), Single- Photon Emission Computed Tomography (SPECT), and Magnetic Resonance Imaging (MRI) are utilized to give important data about nature, dimension, vicinity and metabolism of cere- brum tumor aiding determination
These modalities are utilized in combination to give the most elevated data about the cerebrum tumor
MRI is a non-intrusive system that uses radio recurrence signals to create the internal images under the influence of an extremely amazing attractive field
Distinctive MRI modalities produce various kinds of tissue contrast images, subsequently giving important auxiliary data and empowering analysis and segmentation of tumor along with their subregions
Cerebrum tumor segmentation from MRI is a criti- cal errand which includes different covering pathology, medical resonance materal science, radiologistâ€™s discernment, and image analysis dependent on intensity and shape
There are numerous difficu