In [1]:
#Importing the essential libraries
#Beautiful Soup is a Python library for pulling data out of HTML and XML files
#The Natural Language Toolkit

import requests
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from bs4 import BeautifulSoup
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random
from wordcloud import WordCloud
from html.parser import HTMLParser

import bs4 as bs
import urllib.request
import re

In [2]:
r=requests.get('http://gflenv.com/liquid-and-special-waste/hazardous-waste/')

In [3]:
#Setting the correct text encoding of the HTML page
r.encoding = 'utf-8'

In [4]:
#Extracting the HTML from the request object
html = r.text

In [5]:
# Printing the first 500 characters in html
print(html[:500])

<!DOCTYPE html>
<html lang="en-US" prefix="og: http://ogp.me/ns#" class="no-js">
<head>
	
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-5C34RXR');</script>
<!-- End Google Tag Man


In [6]:
# Creating a BeautifulSoup object from the HTML
soup = BeautifulSoup(html)

In [7]:
# Getting the text out of the soup
text = soup.get_text()

In [8]:
#total length
len(text)

14842

In [9]:
text=text[7938:12275]

In [10]:
text

'For over forty years, GFL Environmental and its predecessors have managed industrial, institutional and household hazardous waste with services such as onsite packaging, container supply, transportation, and licensed disposal or recycling. Our teams of qualified chemists and technicians work together with our customers to solve hazardous waste challenges ranging from routine to complex. We also coordinate onsite recycling initiatives, personnel training and waste reduction programs. All hazardous waste materials are handled by GFL with care and compliance at our transfer, storage and disposal facilities located in Canada and the United States.\nINDUSTRIAL WASTE MATERIALS\nGreen For Life is a trusted partner to many of North America’s thriving industries. We provide hazardous waste and recycling services for industrial sites such as mines, refineries, power stations and rail yards. We supply bins, roll-off containers and storage tanks for large volumes of waste materials. Our high-perf

In [11]:
# Removing Square Brackets and Extra Spaces
clean_text = re.sub(r'\[[0-9]*\]', ' ', text)
clean_text = re.sub(r'\s+', ' ', clean_text)

In [12]:
clean_text[488:892]

'All hazardous waste materials are handled by GFL with care and compliance at our transfer, storage and disposal facilities located in Canada and the United States. INDUSTRIAL WASTE MATERIALS Green For Life is a trusted partner to many of North America’s thriving industries. We provide hazardous waste and recycling services for industrial sites such as mines, refineries, power stations and rail yards. '

# Text Summarization

In [13]:
#We need to tokenize the article into sentences
#Sentence tokenization
nltk.download('punkt')
sentence_list = nltk.sent_tokenize(clean_text)

[nltk_data] Downloading package punkt to C:\Users\Sayanti
[nltk_data]     Dutta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
#Weighted Frequency of Occurrence

stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}
for word in nltk.word_tokenize(clean_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [15]:
maximum_frequncy = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [16]:
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [17]:
sentence_scores

{'Our teams of qualified chemists and technicians work together with our customers to solve hazardous waste challenges ranging from routine to complex.': 2.5,
 'We also coordinate onsite recycling initiatives, personnel training and waste reduction programs.': 3.3214285714285716,
 'All hazardous waste materials are handled by GFL with care and compliance at our transfer, storage and disposal facilities located in Canada and the United States.': 3.535714285714286,
 'INDUSTRIAL WASTE MATERIALS Green For Life is a trusted partner to many of North America’s thriving industries.': 2.107142857142857,
 'We provide hazardous waste and recycling services for industrial sites such as mines, refineries, power stations and rail yards.': 4.392857142857142,
 'We supply bins, roll-off containers and storage tanks for large volumes of waste materials.': 3.0714285714285716,
 'Our high-performance vacuum trucks and confined space entry teams can be deployed for plant shutdowns and to extract hazardous m

# 10 Key Ideas

In [18]:
import heapq
summary_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
print(summary)

HOUSEHOLD WASTE MATERIALS Materials such as chemical cleaners, paints, solvents, poisons and pesticides pose a danger to people and the environment if neglected or improperly handled. We provide hazardous waste and recycling services for industrial sites such as mines, refineries, power stations and rail yards. PROCESSING CAPABILITIES In addition to collection and transportation of liquid wastes, GFL facilities process and/or treat an impressive variety of wastes generated by industrial, commercial and institutional applications. All hazardous waste materials are handled by GFL with care and compliance at our transfer, storage and disposal facilities located in Canada and the United States. INSTITUTIONAL WASTE MATERIALS We safely remove and properly recycle or dispose of outdated chemicals and reagents from institutions such as high schools, hospitals and laboratories. Our high-performance vacuum trucks and confined space entry teams can be deployed for plant shutdowns and to extract h

# 15 Key Ideas

In [19]:
import heapq
summary_sentences_2 = heapq.nlargest(15, sentence_scores, key=sentence_scores.get)

summary_2 = ' '.join(summary_sentences_2)
print(summary_2)

HOUSEHOLD WASTE MATERIALS Materials such as chemical cleaners, paints, solvents, poisons and pesticides pose a danger to people and the environment if neglected or improperly handled. We provide hazardous waste and recycling services for industrial sites such as mines, refineries, power stations and rail yards. PROCESSING CAPABILITIES In addition to collection and transportation of liquid wastes, GFL facilities process and/or treat an impressive variety of wastes generated by industrial, commercial and institutional applications. All hazardous waste materials are handled by GFL with care and compliance at our transfer, storage and disposal facilities located in Canada and the United States. INSTITUTIONAL WASTE MATERIALS We safely remove and properly recycle or dispose of outdated chemicals and reagents from institutions such as high schools, hospitals and laboratories. Our high-performance vacuum trucks and confined space entry teams can be deployed for plant shutdowns and to extract h