In [26]:
#import required modules
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
import bs4 as bs
import urllib.request
import requests
import string # for checking punctuations
import matplotlib.pyplot as plt

In [27]:
def scrape_webpage(url):
  connection = urllib.request.urlopen(url)


  data = connection.read()


  parsed = bs.BeautifulSoup(data, 'lxml')


  paragraphs = parsed.find_all('p')

  text = [p.text for p in paragraphs]


  text = '\n'.join(text)
  return text

url = 'https://en.wikipedia.org/wiki/Natural_language_processing'

text = scrape_webpage(url)

print(text[:5000], '...')

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.  The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.

Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.

Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articula

In [28]:
def get_sentences(text):
    sentence_list = sent_tokenize(text)
    return sentence_list
def generate_ngrams(text,n):
    sentence_list = get_sentences(text)
    ngrams_list = []
    for sentence in sentence_list:
        ngrams_list.extend(list(ngrams(word_tokenize(sentence.lower()), n)))
    return [' '.join(grams) for grams in ngrams_list]

n = 3
res = generate_ngrams(text, n)
print(res[0:50])

['natural language processing', 'language processing (', 'processing ( nlp', '( nlp )', 'nlp ) is', ') is a', 'is a subfield', 'a subfield of', 'subfield of linguistics', 'of linguistics ,', 'linguistics , computer', ', computer science', 'computer science ,', 'science , and', ', and artificial', 'and artificial intelligence', 'artificial intelligence concerned', 'intelligence concerned with', 'concerned with the', 'with the interactions', 'the interactions between', 'interactions between computers', 'between computers and', 'computers and human', 'and human language', 'human language ,', 'language , in', ', in particular', 'in particular how', 'particular how to', 'how to program', 'to program computers', 'program computers to', 'computers to process', 'to process and', 'process and analyze', 'and analyze large', 'analyze large amounts', 'large amounts of', 'amounts of natural', 'of natural language', 'natural language data', 'language data .', 'the goal is', 'goal is a', 'is a comput

In [29]:
import nltk
def get_ngram_frequency(text, n):
    ngrams_list = generate_ngrams(text, n)
    fdist = FreqDist(ngrams_list)
    freq_dict = {}
    for k in fdist.keys():
        freq_dict[k] = fdist[k]
    freq_dict = sorted(freq_dict.items(), key = lambda kv: kv[1], reverse = True)
    return freq_dict
ngram_freq_dict = get_ngram_frequency(text, n)
ngram_freq_dict[0:15]
import matplotlib.pyplot as plt

def get_ngram_weight_freq(text, n):

    ngrams_list = generate_ngrams(text, n)
    fdist = FreqDist(ngrams_list)
    dist = {}
    for k in fdist.keys():
        dist[k] = fdist[k]
    dist = sorted(dist.items(), key = lambda kv: kv[1], reverse = True)
    disttemp = list(dist)
    top_frequency = disttemp[0][1]
    dist = dict(dist)
    weighted_dist = {}
    for k in dist:
        weighted_dist[k] = dist[k]*1.0/top_frequency 
    weighted_dist = sorted(weighted_dist.items(), key = lambda kv: kv[1], reverse = True)
    return weighted_dist
weighted_ngram_dist = get_ngram_weight_freq(text, n)
weighted_ngram_dist[0:20]

[('natural language processing', 1.0),
 ('language processing .', 0.4166666666666667),
 ('in natural language', 0.3333333333333333),
 ('( e.g. ,', 0.3333333333333333),
 ('of natural language', 0.25),
 (', however ,', 0.25),
 ('] in the', 0.25),
 (', e.g. ,', 0.25),
 ('grammar , [', 0.25),
 ('language processing (', 0.16666666666666666),
 ('processing ( nlp', 0.16666666666666666),
 ('( nlp )', 0.16666666666666666),
 ('of linguistics ,', 0.16666666666666666),
 ('of documents ,', 0.16666666666666666),
 ('natural language .', 0.16666666666666666),
 ('of symbolic nlp', 0.16666666666666666),
 ('hand-written rules .', 0.16666666666666666),
 ('in the late', 0.16666666666666666),
 ('the late 1980s', 0.16666666666666666),
 ('of machine learning', 0.16666666666666666)]

In [30]:
def sentence_scores(sent_tokens, ngram_freqs, n_grams):
    score = 0
    for wpair in n_grams:
        if wpair in ngram_freqs: 
            score += ngram_freqs[wpair]
    score = score / len(n_grams)
    return score
sentence_list = get_sentences(text)
sent_tokens = word_tokenize(sentence_list[0].lower())
n_grams = generate_ngrams(sentence_list[0],n)
score_1 = sentence_scores(sent_tokens, weighted_ngram_dist,n_grams)
print(sentence_list,score_1)

['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.', 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.', 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.', 'Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.', 'Natural language processing has its roots in the 1950s.', 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that wa

In [31]:
def rank_sentences(sentence_list, ngram_freq_dict, n):
    rank_list = []
    for sentence in sentence_list:
        sent_tokens = word_tokenize(sentence)
        sentence_ngram = generate_ngrams(sentence, n)
        score = sentence_scores(sent_tokens, ngram_freq_dict, sentence_ngram)

        rank_list.append([sentence,score])
    rank_list = sorted(rank_list, key=lambda x: x[1], reverse = True)
    return rank_list

def webpage_summary_ngram(text, ngram_freq_dict, n, nr_sentence , nr_tokens):
    sentence_list = get_sentences(text)
    top_freq_dict = dict(list(ngram_freq_dict)[0:nr_tokens])
    ranked_sentence_list = rank_sentences(sentence_list,top_freq_dict, n)
    summary = ""
    for i in range(nr_sentence):
        summary = summary+" "+ranked_sentence_list[i][0]
    return summary
nr_sentence = 3
nr_tokens = len(weighted_ngram_dist) 
summary = webpage_summary_ngram(text, weighted_ngram_dist, n, nr_sentence, nr_tokens)
print(summary)


print("Summary using 20 most frequent words:")

nr_sentence = 3
nr_tokens = 20 # using top k tokens
summary = webpage_summary_ngram(text, weighted_ngram_dist, n, nr_sentence, nr_tokens)
print(summary)

 Natural language processing has its roots in the 1950s. The following is a list of some of the most commonly researched tasks in natural language processing. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing.
Summary using 20 most frequent words:
 Natural language processing has its roots in the 1950s. The following is a list of some of the most commonly researched tasks in natural language processing. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing.
