In [1]:
#Importing the required modules

import nltk
import bs4 as bs  
import urllib.request  
import re
import heapq  

In [2]:
#Incase the data is from a wikipedia page use the following code to get the data from the site.
#It return a string containing the information.
def scrape_data(url_link):
    scraped_data = urllib.request.urlopen(url_link)  
    data = scraped_data.read()

    parsed_data = bs.BeautifulSoup(data,'lxml')

    paragraphs = parsed_data.find_all('p')

    data_text = ""

    for p in paragraphs:  
        data_text += p.text
    return data_text

In [3]:
#Remove the brackets and extra spaces present in the text.
def remove_brackets_extraSpaces(raw_data):
    raw_data = re.sub(r'\[[0-9]*\]', ' ', raw_data)  
    raw_data = re.sub(r'\s+', ' ', raw_data)  
    return raw_data

In [4]:
#Remove special Character from the text. This was not included in the above function because "." is a special character and if we remove this then sentence tokenizing would return just one sentence due to absnce of "." in the raw data.
def remove_specialChar(raw_data):
    raw_data = re.sub('[^a-zA-Z]', ' ', raw_data )  
    raw_data = re.sub(r'\s+', ' ', raw_data)  
    return raw_data

In [5]:
#Remove's the stop words like a,to,the etc. This function returns a dictionary containing the words and its frequency in a dictionary format.
def remove_stopWords_Frequency(paragraph):
    stop_words = nltk.corpus.stopwords.words('english') #NLTK's inbuilt list of stop words.
    word_frequency ={}
    for word in nltk.word_tokenize(paragraph):  
        if word not in stop_words:
            word_frequency[word] = word_frequency.get(word,0)+1
    return word_frequency

In [6]:
#Sentanize tokenize the processed text.
def sentenceTokenize(paragraph):
    return nltk.sent_tokenize(paragraph)

In [7]:
#Assigning weights to each word based on based on its frequency
def assignScores_words(word_frequency):
    max_frequency = max(word_frequency.values())
    for word in word_frequency.keys():
        word_frequency[word] = word_frequency[word]/(max_frequency*1.0)
    return word_frequency

In [8]:
#Assign scores to each sentence based on the words it contains from the scores assigned to words.
def assignSenteceScores(sent_list, word_scores):
    sentence_scores = {}
    for sent in sent_list:
        if(len(sent.split(' ')) <30):
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_scores.keys():
                    sentence_scores[sent] = sentence_scores.get(sent,0)+word_scores[word]
    return sentence_scores

In [17]:
def printTopsenteces(sent_scores):
    summary_sentences = heapq.nlargest(int(len(sent_scores)/50), sent_scores, key=sent_scores.get)
    summary = ' '.join(summary_sentences)  
    return summary  

In [18]:
def SummayFromURL(urlLink):
    scrapped_data = scrape_data(urlLink)
    brac_extra_space = remove_brackets_extraSpaces(scrapped_data)
    remove_special_char = remove_specialChar(brac_extra_space)
    words_frequency = remove_stopWords_Frequency(remove_special_char)
    sentenceList = sentenceTokenize(brac_extra_space)
    word_scores = assignScores_words(words_frequency)
    sentence_scores = assignSenteceScores(sentenceList,word_scores)
    return printTopsenteces(sentence_scores)

In [19]:
def SummaryFromText(paragraph):
    brac_extra_space = remove_brackets_extraSpaces(paragraph)
    remove_special_char = remove_specialChar(brac_extra_space)
    words_frequency = remove_stopWords_Frequency(remove_special_char)
    sentenceList = sentenceTokenize(brac_extra_space)
    word_scores = assignScores_words(words_frequency)
    sentence_scores = assignSenteceScores(sentenceList,word_scores)
    return printTopsenteces(sentence_scores)

In [20]:
SummayFromURL('https://en.wikipedia.org/wiki/Artificial_intelligence')

' Artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and other animals. Colloquially, the term "artificial intelligence" is applied when a machine mimics "cognitive" functions that humans associate with other human minds, such as "learning" and "problem solving". Neural networks can be applied to the problem of intelligent control (for robotics) or learning, using such techniques as Hebbian learning ("fire together, wire together"), GMDH or competitive learning. Deep learning has transformed many important subfields of artificial intelligence, including computer vision, speech recognition, natural language processing and others. Many of the problems in this article may also require general intelligence, if machines are to solve the problems as well as people do. IBM has created its own artificial intelligence computer, the IBM Watson, which has beaten human intelligen

In [13]:
paragraph = """ Artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and other animals. Colloquially, the term "artificial intelligence" is applied when a machine mimics "cognitive" functions that humans associate with other human minds, such as "learning" and "problem solving". Neural networks can be applied to the problem of intelligent control (for robotics) or learning, using such techniques as Hebbian learning ("fire together, wire together"), GMDH or competitive learning. Deep learning has transformed many important subfields of artificial intelligence, including computer vision, speech recognition, natural language processing and others. Many of the problems in this article may also require general intelligence, if machines are to solve the problems as well as people do. Musk also funds companies developing artificial intelligence such as Google DeepMind and Vicarious to "just keep an eye on what\'s going on with artificial intelligence. IBM has created its own artificial intelligence computer, the IBM Watson, which has beaten human intelligence (at some levels). "robotics" or "machine learning"), the use of particular tools ("logic" or artificial neural networks), or deep philosophical differences. A superintelligence, hyperintelligence, or superhuman intelligence is a hypothetical agent that would possess intelligence far surpassing that of the brightest and most gifted human mind. The traditional problems (or goals) of AI research include reasoning, knowledge representation, planning, learning, natural language processing, perception and the ability to move and manipulate objects.  """
SummaryFromText(paragraph)

' Artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and other animals.'