In [1]:
import requests
from bs4 import BeautifulSoup
import re

### Scrapping from web

In [2]:
URL = 'https://en.wikipedia.org/wiki/Artificial_intelligence'
html_page = requests.get(URL).text
soup = BeautifulSoup(html_page, 'lxml')
paraContent = soup.find_all('p')

In [3]:
paragraph = ""
for para in paraContent:
    paragraph += para.text

In [4]:
paragraph = re.sub(r'\[[0-9a-zA-Z]*\]', ' ', paragraph)
paragraph = re.sub(r'\s+', ' ', paragraph)

### NLP

In [5]:
import nltk

In [6]:
sentence_tokens = nltk.sent_tokenize(paragraph)

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

In [9]:
word_frequencies = {}
word_tokens = nltk.word_tokenize(paragraph)
for word in word_tokens:
    if word not in stop_words:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [10]:
# Weighted Frequencies
maximum_frquency_word = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frquency_word)

In [11]:
# Sentence Score
sentence_scores = {}
for sentence in sentence_tokens:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word_frequencies.keys():
            if(len(sentence.split(" ")) < 30):
                if sentence not in sentence_scores.keys():
                    sentence_scores[sentence] = word_frequencies[word]
                else:
                    sentence_scores[sentence] += word_frequencies[word]

### Combine top sentences 

In [12]:
import heapq

In [13]:
summary = heapq.nlargest(25, sentence_scores, key=sentence_scores.get)

In [14]:
sentences = []
for sentence in summary:
    sentences.append(nltk.word_tokenize(sentence))

In [15]:
" ".join(summary)

'Most EU member states had released national AI strategies, as had Canada, China, India, Japan, Mauritius, the Russian Federation, Saudi Arabia, United Arab Emirates, USA and Vietnam. When given a small, static, and visible environment, this is easy; however, dynamic environments, such as (in endoscopy) the interior of a patient\'s breathing body, pose a greater challenge. The traditional goals of AI research include reasoning, knowledge representation, planning, learning, natural language processing, perception, and the ability to move and manipulate objects. Soft computing is a set of techniques, including genetic algorithms, fuzzy logic and neural networks, that are tolerant of imprecision, uncertainty, partial truth and approximation. Formal knowledge representations are used in content-based indexing and retrieval, scene interpretation, clinical decision support, knowledge discovery (mining "interesting" and actionable inferences from large databases), and other areas. The rise of

### Using sentence similarity and picking the best from the above

In [16]:
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [29]:
def sentence_similarity(sent1, sent2, stop_words):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list((set(sent1+sent2)))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w not in stop_words:
            vector1[all_words.index(w)] += 1
    for w in sent2:
        if w not in stop_words:
            vector2[all_words.index(w)] += 1
    
    return 1-cosine_distance(vector1, vector2)

In [30]:
def gen_sim_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if(idx1 == idx2):
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    
    return similarity_matrix

In [36]:
def generate_summary(sentences, stop_words, top_n=5):
    summarized_text = []
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i,s in enumerate(sentences)), reverse=True)
    for i in range(top_n):
        summarized_text.append(" ".join(ranked_sentences[i][1]))
    print("Summary: \n", " ".join(summarized_text))

In [37]:
generate_summary(sentences, stop_words, top_n=10)

Summary: 
 Most EU member states had released national AI strategies , as had Canada , China , India , Japan , Mauritius , the Russian Federation , Saudi Arabia , United Arab Emirates , USA and Vietnam . The traditional goals of AI research include reasoning , knowledge representation , planning , learning , natural language processing , perception , and the ability to move and manipulate objects . When given a small , static , and visible environment , this is easy ; however , dynamic environments , such as ( in endoscopy ) the interior of a patient 's breathing body , pose a greater challenge . AI also draws upon computer science , psychology , linguistics , philosophy , and many other fields . Soft computing is a set of techniques , including genetic algorithms , fuzzy logic and neural networks , that are tolerant of imprecision , uncertainty , partial truth and approximation . A few examples are : energy storage , deepfakes , medical diagnosis , military logistics , or supply chain

## Combining all the function.

In [38]:
def scrape_data(URL):
    html_page = requests.get(URL).text
    soup = BeautifulSoup(html_page, 'lxml')
    paraContent = soup.find_all('p')
    paragraph = ""
    for para in paraContent:
        paragraph += para.text
    paragraph = re.sub(r'\[[0-9a-zA-Z]*\]', ' ', paragraph)
    paragraph = re.sub(r'\s+', ' ', paragraph)
    
    return paragraph

In [39]:
def get_important_sentences(data):
    sentence_tokens = nltk.sent_tokenize(data)
    stop_words = nltk.corpus.stopwords.words('english')
    word_frequencies = {}
    word_tokens = nltk.word_tokenize(data)
    for word in word_tokens:
        if word not in stop_words:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
                
    # Weighted Frequencies
    maximum_frquency_word = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frquency_word)
    
    # Sentence Score
    sentence_scores = {}
    for sentence in sentence_tokens:
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_frequencies.keys():
                if(len(sentence.split(" ")) < 30):
                    if sentence not in sentence_scores.keys():
                        sentence_scores[sentence] = word_frequencies[word]
                    else:
                        sentence_scores[sentence] += word_frequencies[word]
    
    top_sentences = heapq.nlargest(25, sentence_scores, key=sentence_scores.get)
    result = []
    for sentence in top_sentences:
        result.append(nltk.word_tokenize(sentence))
    
    return result

In [40]:
def sentence_similarity(sent1, sent2, stop_words):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list((set(sent1+sent2)))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w not in stop_words:
            vector1[all_words.index(w)] += 1
    for w in sent2:
        if w not in stop_words:
            vector2[all_words.index(w)] += 1
    
    return 1-cosine_distance(vector1, vector2)

In [41]:
def gen_sim_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if(idx1 == idx2):
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    
    return similarity_matrix

In [55]:
def generate_summary(data, stop_words, top_n=5):
    sentences = get_important_sentences(data)
    summarized_text = []
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i,s in enumerate(sentences)), reverse=True)
    for i in range(top_n):
        summarized_text.append(" ".join(ranked_sentences[i][1]))
    summary = " ".join(summarized_text)
    return summary

In [56]:
scraped_data = scrape_data('https://en.wikipedia.org/wiki/Artificial_intelligence')
summary = generate_summary(scraped_data, stop_words, top_n=10)
print(summary)

Most EU member states had released national AI strategies , as had Canada , China , India , Japan , Mauritius , the Russian Federation , Saudi Arabia , United Arab Emirates , USA and Vietnam . The traditional goals of AI research include reasoning , knowledge representation , planning , learning , natural language processing , perception , and the ability to move and manipulate objects . When given a small , static , and visible environment , this is easy ; however , dynamic environments , such as ( in endoscopy ) the interior of a patient 's breathing body , pose a greater challenge . AI also draws upon computer science , psychology , linguistics , philosophy , and many other fields . Soft computing is a set of techniques , including genetic algorithms , fuzzy logic and neural networks , that are tolerant of imprecision , uncertainty , partial truth and approximation . A few examples are : energy storage , deepfakes , medical diagnosis , military logistics , or supply chain management