In [66]:
#Importing the required modules

import nltk
import numpy as np
import bs4 as bs  
import urllib.request  
import re
import heapq  
from operator import itemgetter
from nltk.cluster.util import cosine_distance

In [67]:
#Incase the data is from a wikipedia page use the following code to get the data from the site.
#It return a string containing the information.
def scrape_data(url_link):
    scraped_data = urllib.request.urlopen(url_link)  
    data = scraped_data.read()

    parsed_data = bs.BeautifulSoup(data,'lxml')

    paragraphs = parsed_data.find_all('p')

    data_text = ""

    for p in paragraphs:  
        data_text += p.text
    return data_text

In [68]:
#Remove the brackets and extra spaces present in the text.
def remove_brackets_extraSpaces(raw_data):
    raw_data = re.sub(r'\[[0-9]*\]', ' ', raw_data)  
    raw_data = re.sub(r'\s+', ' ', raw_data)  
    return raw_data

In [69]:
#Remove special Character from the text. This was not included in the above function because "." is a special character and if we remove this then sentence tokenizing would return just one sentence due to absnce of "." in the raw data.
def remove_specialChar(raw_data):
    raw_data = re.sub('[^a-zA-Z]', ' ', raw_data )  
    raw_data = re.sub(r'\s+', ' ', raw_data)  
    return raw_data

In [70]:
#Sentanize tokenize the processed text.
def sentenceTokenize(paragraph):
    return nltk.sent_tokenize(paragraph)

In [71]:
#Sentanize tokenize the processed text.
def wordTokenize(sent):
    return nltk.word_tokenize(sent)

In [72]:
def sentence_similarity(sent1, sent2):
    
    sent1 = remove_specialChar(sent1)
    sent2 = remove_specialChar(sent2)
    
    vocab = list(set(sent1 + sent2))
    
    vector1 = [0] * len(vocab)
    vector2 = [0] * len(vocab)
    
    for w in sent1:
        vector1[vocab.index(w)] += 1
    
    for w in sent2:
        vector2[vocab.index(w)] += 1
    
    return 1 - cosine_distance(vector1, vector2)

In [73]:
def similarity_matrix_data(sentenceList):
    len_val = len(sentenceList)
    sim_matrix = np.zeros([len_val, len_val])
    for i in range(len_val):
        for j in range(len_val):
            if i != j:
                sim_matrix[i][j] = sentence_similarity(sentenceList[i], sentenceList[j])
            else:
                continue
    
    for i in range(len(sim_matrix)):
        sim_matrix[i] /= sim_matrix[i].sum()
    return sim_matrix
    

In [74]:
def PageRank(matrix, eps,d):
    N = matrix.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * matrix) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [75]:
def SummayFromURL(urlLink):
    scrapped_data = scrape_data(urlLink)
    brac_extra_space = remove_brackets_extraSpaces(scrapped_data)
    sentences = sentenceTokenize(brac_extra_space)
    similarityMatrix = similarity_matrix_data(sentences)
    sentence_ranks = PageRank(similarityMatrix,1.0e-8,0.85)
    rankedIndexes = [item[0] for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]
    Size_of_Summary = 10
    selected_sentences = sorted(rankedIndexes[:Size_of_Summary])
    return itemgetter(*selected_sentences)(sentences)

In [76]:
SummayFromURL('https://en.wikipedia.org/wiki/Artificial_intelligence')

('The AI field draws upon computer science, information engineering, mathematics, psychology, linguistics, philosophy, and many others.',
 'AI often revolves around the use of algorithms.',
 'A representation of "what exists" is an ontology: the set of objects, relations, concepts, and properties formally described so that software agents can interpret them.',
 'Moderate successes related to affective computing include textual sentiment analysis and, more recently, multimodal affect analysis (see multimodal sentiment analysis), wherein AI classifies the affects displayed by a videotaped subject.',
 'Many learning algorithms use search algorithms based on optimization.',
 'Among the most popular feedforward networks are perceptrons, multi-layer perceptrons and radial basis networks.',
 'Some self-driving cars are not equipped with steering wheels or brake pedals, so there has also been research focused on creating an algorithm that is capable of maintaining a safe environment for the pa