# Extractive text summarization 
    Personalised Text rank algorithm 

In [33]:
from nltk.tokenize import sent_tokenize, word_tokenize 
def get_sentences(article):
    extracts=sent_tokenize(article)
    sentences=[]
    for extract in extracts:
        #print(extract)
        clean_sentence=extract.replace("[^a-zA-Z0-9]"," ")   ## Removing special characters
        #print(clean_sentence)
        obtained=word_tokenize(clean_sentence) 
        #print(obtained)
        sentences.append(obtained)

    return sentences

Get Similarity

In [18]:
from nltk.cluster.util import cosine_distance
def get_similarity(sent_1,sent_2,stop_words):
  
    sent_1=[w.lower() for w in sent_1]
    sent_2=[w.lower() for w in sent_2]

    total=list(set(sent_1+sent_2)) ## Removing duplicate words in total set

    vec_1= [0] * len(total)
    vec_2= [0] * len(total)


  ## Count Vectorization of two sentences
    for w in sent_1:
        if w not in stop_words:
            vec_1[total.index(w)]+=1

    for w in sent_2:
        if w not in stop_words:
            vec_2[total.index(w)]+=1


    return 1-cosine_distance(vec_1,vec_2)

Create Matrix

In [19]:
from nltk.corpus import stopwords
import numpy as np
def build_matrix(sentences):
    stop_words = stopwords.words('english')

    sim_matrix=np.zeros((len(sentences),len(sentences)))
    ## Adjacency matrix

    for id1 in range(len(sentences)):
        for id2 in range(len(sentences)):
            if id1==id2:  #escaping diagonal elements
                continue
            else:
                sim_matrix[id1][id2]=get_similarity(sentences[id1],sentences[id2],stop_words)

    return sim_matrix

Page Rank 

In [20]:
def pagerank(text, eps=0.000001, d=0.85):
    score_mat = np.ones(len(text)) / len(text)
    delta=1
    ### iterative approach
    while delta>eps:
        score_mat_new = np.ones(len(text)) * (1 - d) / len(text) + d * text.T.dot(score_mat)
        delta = abs(score_mat_new - score_mat).sum()
        score_mat = score_mat_new

    return score_mat_new

Summarizer

In [21]:
def summarizer(article,req=3):
    summarized=[]

    sentence=get_sentences(article)

    sim_matrix=build_matrix(sentence)

    score=pagerank(sim_matrix)

    ranked_sentence = sorted(((score[i],s) for i,s in enumerate(sentence)), reverse=True)
    #print(ranked_sentence[2])
  
    for i in range(req):
        #print(ranked_sentence[i][1])
        summarized.append(" ".join(ranked_sentence[i][1]))

    return summarized

In [23]:
article='A black hole is a region of spacetime where gravity is so strong that nothing—no particles or even electromagnetic radiation such as light—can escape from it. The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole. The boundary of the region from which no escape is possible is called the event horizon. Although the event horizon has an enormous effect on the fate and circumstances of an object crossing it, according to general relativity it has no locally detectable features. In many ways, a black hole acts like an ideal black body, as it reflects no light. Moreover, quantum field theory in curved spacetime predicts that event horizons emit Hawking radiation, with the same spectrum as a black body of a temperature inversely proportional to its mass. This temperature is on the order of billionths of a kelvin for black holes of stellar mass, making it essentially impossible to observe. Objects whose gravitational fields are too strong for light to escape were first considered in the 18th century by John Michell and Pierre-Simon Laplace. The first modern solution of general relativity that would characterize a black hole was found by Karl Schwarzschild in 1916, although its interpretation as a region of space from which nothing can escape was first published by David Finkelstein in 1958. Black holes were long considered a mathematical curiosity; it was not until the 1960s that theoretical work showed they were a generic prediction of general relativity. The discovery of neutron stars by Jocelyn Bell Burnell in 1967 sparked interest in gravitationally collapsed compact objects as a possible astrophysical reality. Black holes of stellar mass are expected to form when very massive stars collapse at the end of their life cycle. After a black hole has formed, it can continue to grow by absorbing mass from its surroundings.'
len(article)

1908

In [24]:
Summary=summarizer(article)

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [25]:
print(Summary)

['The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole .', 'In many ways , a black hole acts like an ideal black body , as it reflects no light .', 'Moreover , quantum field theory in curved spacetime predicts that event horizons emit Hawking radiation , with the same spectrum as a black body of a temperature inversely proportional to its mass .']


**Summarization Using Spacy and PytextRank**

In [32]:
import spacy
import pytextrank
import en_core_web_sm
nlp = en_core_web_sm.load()


# add PyTextRank to the spaCy pipeline
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

doc = nlp(article)

# examine the top-ranked phrases in the document
for p in doc._.phrases[0:10]:
    
    print(p.chunks)



[black holes, Black holes, Black holes, A black hole, a black hole, a black hole, a black hole, a black hole]
[stellar mass, stellar mass]
[general relativity, general relativity, general relativity, general relativity]
[event horizons, the event horizon, the event horizon]
[mass, its mass]
[Jocelyn Bell Burnell, Jocelyn Bell Burnell]
[neutron stars]
[Hawking radiation]
[theoretical work]
[spacetime]
