In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bs4 as bs
import urllib.request
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx
import re 
import heapq
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to C:\Users\My
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\My
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def userinput():
    inp_url=input("Enter a url: ")
    return inp_url

"""function - checking validity of input (This program is dealing with wikipedia pages only. 
Links to other websites are considered invalid )"""
def validity_input(inp_url):
    if inp_url[0:30]!='https://en.wikipedia.org/wiki/':
        return False
    else:
         return True

def get_data(inp_url):
    #storing data scraped from wikipedia page 
    scraped_data = urllib.request.urlopen(inp_url)
    
    #reading the scraped data - type(wikiarticle) = bytes
    wikiarticle = scraped_data.read() 
    
    #converting the data into a BeautifulSoup object. type(parsed_article)=bs4.BeautifulSoup
    parsed_article = bs.BeautifulSoup(wikiarticle,'lxml')
    
    """storing all the text in the webpage, which is enclosed within <p> and </p> tags. type(paragraphs)=bs4.element.ResultSet
    ResultSet is iterable, supports indexing. Basically functions as a list in this program"""
    paragraphs = parsed_article.find_all('p') 

    wikiarticle_text = ""
    
    #storing all text from the webpage in a string
    for p in paragraphs:  
        wikiarticle_text += p.text
    
    return wikiarticle_text
        
def format_data(wikiarticle_text):
    #replacing references - numbers enclosed in square brackets - with spaces
    wikiarticle_text = re.sub(r'\[[0-9]*\]', ' ', wikiarticle_text)
    
    #replacing multiple spaces with single space
    wikiarticle_text = re.sub(r'\s+', ' ', wikiarticle_text)
    
    return wikiarticle_text
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences))) #Creates an empty 2x2 matrix filled with zeros
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix
def get_summary(full_text,n):
    sentence_list = nltk.sent_tokenize(full_text)
    new_sentence_list=[]
    for sentence in sentence_list:
        new_sentence_list.append(re.sub("\?", '', sentence))
    #Usually the first sentece is very important in a wikipedia article. So making sure that it is included.
    mandate = new_sentence_list.pop(0)
    sentence_similarity_martix = build_similarity_matrix(new_sentence_list, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix) #Creates graph
    scores = nx.pagerank(sentence_similarity_graph) #Scores the graph using PageRank
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(new_sentence_list)), reverse=True)    
    summarize_text = []
    for i in range(n):
        summarize_text.append("".join(ranked_sentence[i][1]))
    summary= (" ".join(summarize_text))
    summary = mandate + " " + summary
    return summary

#calling all the above functions
def call():
    print("Enter URL for a Wikipedia (strictly) Article:")
    inp_url=userinput()
    print("Enter the number of sentences you want your summary to be:")
    n = int(input())
    print()
    while validity_input(inp_url)!=True:
        print("Invalid url. Please enter the url of any wikipedia page.")
        inp_url=userinput()
    article = get_data(inp_url)
    article = format_data(article)
    summary = get_summary(article,n)
    print(summary)

In [4]:
call()

Enter URL for a Wikipedia (strictly) Article:
Enter a url: https://en.wikipedia.org/wiki/Imputation_(statistics)
Enter the number of sentences you want your summary to be:
5

In statistics, imputation is the process of replacing missing data with substituted values. In order to deal with the problem of increased noise due to imputation, Rubin (1987) developed a method for averaging the outcomes across multiple imputed data sets to account for this. Because of the incomplete N values at some points in time, while still maintaining complete case comparison for other parameters, pairwise deletion can introduce impossible mathematical situations such as correlations that are over 100%. The problem is that the imputed data do not have an error term included in their estimation, thus the estimates fit perfectly along the regression line without any residual variance. After imputation, the data is treated as if they were the actual real values in single imputation. A once-common method of imp