# Beautiful Soup is a Python library for pulling data out of HTML and XML files"

In [2]:
!pip install beautifulsoup4



# lxml is a tool for working with HTML and XML documents, represented as an element tree.

In [3]:
!pip install lxml



In [4]:
import bs4 as bs
import urllib.request 

In [5]:
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/Air_pollution').read()

In [6]:
source



In [7]:
soup = bs.BeautifulSoup(source,'lxml')

In [8]:
soup

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Air pollution - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-fe

In [9]:
text = ""
for paragraph in soup.find_all('p'):
    text += paragraph.text

In [10]:
text

'\nAir pollution is the contamination of air due to the presence of substances called pollutants in the atmosphere that are harmful to the health of humans and other living beings, or cause damage to the climate or to materials.[1] It is also the contamination of the indoor or outdoor environment either by chemical, physical, or biological agents that alters the natural features of the atmosphere.[1] There are many different types of air pollutants, such as gases (including ammonia, carbon monoxide, sulfur dioxide, nitrous oxides, methane and chlorofluorocarbons), particulates (both organic and inorganic), and biological molecules. Air pollution can cause diseases, allergies, and even death to humans; it can also cause harm to other living organisms such as animals and crops, and may damage the natural environment (for example, climate change, ozone depletion or habitat degradation) or built environment (for example, acid rain).[2] Air pollution can be caused by both human activities[3

In [11]:
import re

In [12]:
text = re.sub(r'\[[0-9]*\]',' ',text)
text = text.lower()
text = re.sub(r'\s+',' ',text)
clean_text = re.sub(r'\W',' ',text)
clean_text = re.sub(r'\d',' ',clean_text)
clean_text = re.sub(r'\s+[a-z]\s+',' ',clean_text)
clean_text = re.sub(r'\s+',' ',clean_text)
clean_text = re.sub(r'\s+$','',clean_text)

In [13]:
clean_text

' air pollution is the contamination of air due to the presence of substances called pollutants in the atmosphere that are harmful to the health of humans and other living beings or cause damage to the climate or to materials it is also the contamination of the indoor or outdoor environment either by chemical physical or biological agents that alters the natural features of the atmosphere there are many different types of air pollutants such as gases including ammonia carbon monoxide sulfur dioxide nitrous oxides methane and chlorofluorocarbons particulates both organic and inorganic and biological molecules air pollution can cause diseases allergies and even death to humans it can also cause harm to other living organisms such as animals and crops and may damage the natural environment for example climate change ozone depletion or habitat degradation or built environment for example acid rain air pollution can be caused by both human activities and natural phenomena air quality is clo

In [14]:
import nltk

In [15]:
sentences = nltk.sent_tokenize(text)

In [16]:
sentences

[' air pollution is the contamination of air due to the presence of substances called pollutants in the atmosphere that are harmful to the health of humans and other living beings, or cause damage to the climate or to materials.',
 'it is also the contamination of the indoor or outdoor environment either by chemical, physical, or biological agents that alters the natural features of the atmosphere.',
 'there are many different types of air pollutants, such as gases (including ammonia, carbon monoxide, sulfur dioxide, nitrous oxides, methane and chlorofluorocarbons), particulates (both organic and inorganic), and biological molecules.',
 'air pollution can cause diseases, allergies, and even death to humans; it can also cause harm to other living organisms such as animals and crops, and may damage the natural environment (for example, climate change, ozone depletion or habitat degradation) or built environment (for example, acid rain).',
 'air pollution can be caused by both human activ

In [17]:
len(sentences)

369

In [18]:
stop_words = nltk.corpus.stopwords.words('english')

In [19]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
len(stop_words)

179

In [21]:
word2count = {}
for word in nltk.word_tokenize(clean_text):
    if word not in stop_words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [22]:
word2count

{'air': 204,
 'pollution': 173,
 'contamination': 2,
 'due': 8,
 'presence': 2,
 'substances': 1,
 'called': 1,
 'pollutants': 34,
 'atmosphere': 5,
 'harmful': 6,
 'health': 54,
 'humans': 5,
 'living': 7,
 'beings': 1,
 'cause': 14,
 'damage': 4,
 'climate': 9,
 'materials': 3,
 'also': 26,
 'indoor': 8,
 'outdoor': 12,
 'environment': 10,
 'either': 5,
 'chemical': 8,
 'physical': 1,
 'biological': 6,
 'agents': 2,
 'alters': 1,
 'natural': 7,
 'features': 1,
 'many': 8,
 'different': 13,
 'types': 4,
 'gases': 7,
 'including': 15,
 'ammonia': 2,
 'carbon': 10,
 'monoxide': 7,
 'sulfur': 10,
 'dioxide': 20,
 'nitrous': 3,
 'oxides': 4,
 'methane': 2,
 'chlorofluorocarbons': 1,
 'particulates': 9,
 'organic': 5,
 'inorganic': 1,
 'molecules': 1,
 'diseases': 11,
 'allergies': 1,
 'even': 13,
 'death': 12,
 'harm': 1,
 'organisms': 1,
 'animals': 1,
 'crops': 2,
 'may': 26,
 'example': 11,
 'change': 4,
 'ozone': 30,
 'depletion': 2,
 'habitat': 1,
 'degradation': 1,
 'built': 4,
 'ac

In [23]:
for key in word2count.keys():
    word2count[key] = word2count[key]/max(word2count.values())

In [24]:
word2count

{'air': 1.0,
 'pollution': 1.0,
 'contamination': 0.029850746268656716,
 'due': 0.11940298507462686,
 'presence': 0.029850746268656716,
 'substances': 0.014925373134328358,
 'called': 0.014925373134328358,
 'pollutants': 0.5074626865671642,
 'atmosphere': 0.07462686567164178,
 'harmful': 0.08955223880597014,
 'health': 0.8059701492537313,
 'humans': 0.07462686567164178,
 'living': 0.1044776119402985,
 'beings': 0.014925373134328358,
 'cause': 0.208955223880597,
 'damage': 0.05970149253731343,
 'climate': 0.13432835820895522,
 'materials': 0.04477611940298507,
 'also': 0.3880597014925373,
 'indoor': 0.11940298507462686,
 'outdoor': 0.1791044776119403,
 'environment': 0.14925373134328357,
 'either': 0.07462686567164178,
 'chemical': 0.11940298507462686,
 'physical': 0.014925373134328358,
 'biological': 0.08955223880597014,
 'agents': 0.029850746268656716,
 'alters': 0.014925373134328358,
 'natural': 0.1044776119402985,
 'features': 0.014925373134328358,
 'many': 0.11940298507462686,
 'di

In [26]:
sent2score = {}
for sentence in sentences:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word2count.keys():
            if len(sentence.split(' ')) <25:
                if sentence not in sent2score.keys():
                    sent2score[sentence] = word2count[word]
                else:
                    sent2score[sentence] += word2count[word]

In [27]:
sent2score

{'air pollution can be caused by both human activities and natural phenomena.': 2.597014925373134,
 "air quality is closely related to the earth's climate and ecosystems globally.": 2.074626865671642,
 'many of the contributors of air pollution are also sources of greenhouse emission i.e., burning of fossil fuel.': 3.1940298507462686,
 "the human health effects of poor air quality are far reaching, but principally affect the body's respiratory system and the cardiovascular system.": 8.332835820895523,
 'although the health consequences are extensive, the way the problem is handled is considered largely haphazard or neglected.': 1.4275917708753525,
 'various pollution control technologies and strategies are available to reduce air pollution.': 4.081081081081082,
 'several international and national legislation and regulation have been developed to limit the negative effects of air pollution.': 3.6682432432432437,
 'local rules, when properly executed, have resulted in significant advanc

In [28]:
import heapq 


In [29]:
best_sentences = heapq.nlargest(10,sent2score,key=sent2score.get)

In [30]:
best_sentences

['hazardous land uses (toxic storage and disposal facilities, manufacturing facilities, major roadways) tend to be located where property values and income levels are low.',
 'however, even populated areas in developed countries attain unhealthy levels of pollution, with los angeles and rome being two examples.',
 'air pollution hotspots are areas where air pollution emissions expose individuals to increased negative health effects.',
 'urbanization leads to a rapid rise in premature mortality due to anthropogenic air pollution in fast-growing tropical cities.',
 "the human health effects of poor air quality are far reaching, but principally affect the body's respiratory system and the cardiovascular system.",
 'racial discrepancies are particularly distinct in suburban areas of the southern united states and metropolitan areas of the midwestern and western united states.',
 'diesel soot is concentrated in densely populated areas, and one in six people in the u.s. live near a diesel po