# UN Hazardous Waste Project

In this project, we will extract the UN public data on **Hazardous Waste** and analyze the values provided by the source.

In [172]:
## we will need requests and bs for web scrapping
import requests
from bs4 import BeautifulSoup
import json
## for data cleaning and visualizations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [177]:
## the datasets we will be needing are the following:
new_table_list = {"hazardous-waste-treated-or-disposed":
              {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
             "total-amount-of-municipal-waste-collected":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:1814&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
             "hazardous-waste-landfilled":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:1841&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
              "total-population-served-by-municipal-waste-collection":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:1878&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
              "hazardous-waste-recycled":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:2573&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
              "hazardous-waste-incinerated":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:2574&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
              "hazardous-waste-generated":
             {'url':'https://data.un.org/Data.aspx?d=ENV&f=variableID:2830&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
             'data':None},
             }

In [178]:
## saving the table list into a json
with open("C:\\Users\\12145\\OneDrive\\Documents\\GitHub\\un-project\\data\\table-list.json", "w") as t:
    json.dump(new_table_list, t, indent=5)

In [171]:
## the api doesn't allow access to this dataset directly
## so we have to use page numbers to scrape the data
def get_data(url, table_name="table"):
    i = 1
    data = {}
    response = requests.get(url.replace("&v=pagenum", str(i)))
    soup = BeautifulSoup(response.content, 'html.parser')
    page_count = soup.find("span", {"id":"spanPageCountB"}).text
    print(f"Staring to get data for {table_name}")
    while response.ok and i <= int(page_count):
        # print("On page: ", i)
        # print(url.replace("=pagenum", f"={i}"))
        for tb in BeautifulSoup(response.content, 'html.parser').find_all("div", {"class":"DataContainer"}):
            data[i] = {x.text:[] for x in tb.find_all('th') if x is not None}
            for row in tb.find_all('tr'):
                if row.find('td') is not None:
                    vals = [v.text for v in row.find_all('td') if v is not None]
                    for pos, k in enumerate(data[i]):
                        if len(vals) > pos:
                            data[i][k].append(vals[pos])
        i += 1
        response = requests.get(url.replace("=pagenum", f"={i}"))
    return data

In [170]:
## getting the data for each table
for key in table_list:
    url = table_list[key]['url']
    table_list[key]['data'] = get_data(url, table_name=key)
table_list

On page:  1
On page:  2
On page:  3
On page:  4
On page:  5
On page:  6
On page:  7
On page:  8
On page:  9
On page:  10
On page:  11
On page:  12
On page:  13
On page:  14
On page:  15
On page:  16
On page:  17
On page:  1
On page:  2
On page:  3
On page:  4
On page:  5
On page:  6
On page:  7
On page:  8
On page:  9
On page:  10
On page:  11
On page:  12
On page:  13
On page:  14
On page:  15
On page:  16
On page:  17
On page:  18
On page:  19
On page:  20
On page:  21
On page:  22
On page:  23
On page:  24
On page:  25
On page:  26
On page:  27
On page:  28
On page:  29
On page:  30
On page:  31
On page:  32
On page:  33
On page:  34
On page:  35
On page:  36
On page:  37
On page:  38
On page:  39
On page:  40
On page:  1
On page:  2
On page:  3
On page:  4
On page:  5
On page:  6
On page:  7
On page:  8
On page:  9
On page:  10
On page:  11
On page:  12
On page:  13
On page:  14
On page:  15
On page:  1
On page:  2
On page:  3
On page:  4
On page:  5
On page:  6
On page:  7
On page

{'hazardous-waste-treated-or-disposed': {'url': 'https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
  'data': {1: {'Country or Area': ['Albania',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Argentina',
     'Argentina',
     'Argentina',
     'Argentina',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Austria'],
    'Year'

In [180]:
## the api doesn't allow access to this dataset directly
## so we have to use page numbers to scrape the data
def get_data(url, table_name="table"):
    i = 1
    data = {}
    response = requests.get(url.replace("&v=pagenum", str(i)))
    soup = BeautifulSoup(response.content, 'html.parser')
    page_count = soup.find("span", {"id":"spanPageCountB"}).text
    print(f"Staring to get data for {table_name}")
    while response.ok and i <= int(page_count):
        # print("On page: ", i)
        # print(url.replace("=pagenum", f"={i}"))
        for tb in BeautifulSoup(response.content, 'html.parser').find_all("div", {"class":"DataContainer"}):
            if i == 1:
                data = {x.text:[] for x in tb.find_all('th') if x is not None}
            for row in tb.find_all('tr'):
                if row.find('td') is not None:
                    vals = [v.text for v in row.find_all('td') if v is not None]
                    for pos, k in enumerate(data):
                        if len(vals) > pos:
                            data[k].append(vals[pos])
        i += 1
        response = requests.get(url.replace("=pagenum", f"={i}"))
    return data

In [181]:
test_dict = get_data('https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum')
test_dict

Staring to get data for table


{'Country or Area': ['Albania',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Andorra',
  'Argentina',
  'Argentina',
  'Argentina',
  'Argentina',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Armenia',
  'Austria',
  'Austria',
  'Austria',
  'Austria',
  'Austria',
  'Austria',
  'Austria',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Azerbaijan',
  'Az

In [169]:
## now that we have the data for all of the tables
## we can start with our data analysis


{'hazardous-waste-treated-or-disposed': {'url': 'https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum',
  'data': {1: {'Country or Area': ['Albania',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Andorra',
     'Argentina',
     'Argentina',
     'Argentina',
     'Argentina',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Armenia',
     'Austria'],
    'Year'

In [14]:
test = requests.get('https://data.un.org/ws/rest/data/UNSD,DF_UNData_UNFCC,1.0/all/ALL/?detail=full&dimensionAtObservation=TIME_PERIOD')
test_bs = bs(test.content)
test_bs

<?xml version="1.0" encoding="utf-8"?><!--NSI Web Service v8.15.1.0--><html><body><message:genericdata xmlns:common="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common" xmlns:footer="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message/footer" xmlns:generic="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic" xmlns:message="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message" xmlns:xml="http://www.w3.org/XML/1998/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><message:header><message:id>IREF000237</message:id><message:test>false</message:test><message:prepared>2024-06-11T15:20:53</message:prepared><message:sender id="1B0"></message:sender><message:structure dimensionatobservation="TIME_PERIOD" structureid="UNSD_DF_UNData_UNFCC_1_0"><common:structureusage><ref agencyid="UNSD" id="DF_UNData_UNFCC" version="1.0"></ref></common:structureusage></message:structure><message:datasetaction>Information</message:datasetaction><message:datasetid>DSD_G

In [53]:
#https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=1
test = requests.get('https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=1')
test_bs = bs(test.content, 'xmlns')
test_bs


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>
	UNdata | record view | Hazardous waste treated or disposed
</title><link href="_Styles/Global.css" rel="stylesheet" type="text/css"/><link href="_Images/favicon.ico" rel="shortcut icon"/><link href="OpenSearch.xml" rel="search" title="UNdata" type="application/opensearchdescription+xml"/>
<script src="_Scripts/Ajax.js" type="text/javascript"></script>
<script src="_Scripts/Common.js" type="text/javascript"></script>
<script src="_Scripts/Rollovers.js" type="text/javascript"></script><script src="_Scripts/DataCommon.js" type="text/javascript"></script><script src="_Scripts/QueryString.js" type="text/javascript"></script><link href="_Styles/Filters.css" rel="Stylesheet" type="text/css"/><link href="_Styles/Data.css" rel="Stylesheet" type="text/css"/><script src="_Scripts/Data.js" type="text/javascript"></scr

In [88]:
dir(test_bs)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'DEFAULT_INTERESTING_STRING_TYPES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_markup_is_url',
 '_markup_resembles_filename',
 '_most_recent_element',
 '_namespaces',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'attrs',
 'builder',
 'can_be_empty_element',
 'cdata_list_

In [89]:
help(test_bs)

Help on BeautifulSoup in module bs4 object:

class BeautifulSoup(bs4.element.Tag)
 |  BeautifulSoup(markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
 |  
 |  A data structure representing a parsed HTML or XML document.
 |  
 |  Most of the methods you'll call on a BeautifulSoup object are inherited from
 |  PageElement or Tag.
 |  
 |  Internally, this class defines the basic interface called by the
 |  tree builders when converting an HTML/XML document into a data
 |  structure. The interface abstracts away the differences between
 |  parsers. To write a new tree builder, you'll need to understand
 |  these methods as a whole.
 |  
 |  These methods will be called by the BeautifulSoup constructor:
 |    * reset()
 |    * feed(markup)
 |  
 |  The tree builder may call these methods from its feed() implementation:
 |    * handle_starttag(name, attrs) # See note about return value
 |    * handle_endtag(n

In [114]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def fetch_webpage(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text

def extract_js_function(html, function_name):
    soup = BeautifulSoup(html, 'html.parser')
    scripts = soup.find_all('script')
    print(scripts)
    for script in scripts:
        if script.string and function_name in script.string:
            return script.string
    return None

def run_js_function_in_browser(url, js_function):
    # Set up Selenium WebDriver (ensure you have the correct path to the WebDriver)
    service = Service('C:/chromedriver')
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(service=service, options=options)
    
    try:
        driver.get(url)
        
        # Inject and run the JavaScript function
        result = driver.execute_script(js_function)
        return result
    finally:
        driver.quit()

def main(url, function_name):
    html = fetch_webpage(url)
    js_function = extract_js_function(html, function_name)
    
    if js_function:
        print(f"Found JavaScript function:\n{js_function}")
        result = run_js_function_in_browser(url, js_function)
        print(f"Function output: {result}")
    else:
        print(f"JavaScript function '{function_name}' not found.")

# if __name__ == "__main__":
url = "https://example.com"  # Replace with the URL you want to scrape
url = 'https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=1'
function_name = "BuildDownloadBox"  # Replace with the JavaScript function name you are looking for
main(url, function_name)


[<script src="_Scripts/Ajax.js" type="text/javascript"></script>, <script src="_Scripts/Common.js" type="text/javascript"></script>, <script src="_Scripts/Rollovers.js" type="text/javascript"></script>, <script src="_Scripts/DataCommon.js" type="text/javascript"></script>, <script src="_Scripts/QueryString.js" type="text/javascript"></script>, <script src="_Scripts/Data.js" type="text/javascript"></script>, <script src="_Scripts/DataMartInfo.js" type="text/javascript"></script>, <script src="_Scripts/SeriesActions.js" type="text/javascript"></script>, <script src="_Scripts/SearchBox.js" type="text/javascript"></script>, <script src="_Scripts/PopupLibrary.js" type="text/javascript"></script>, <script src="_Scripts/Scroll/dw_scroll_dx.js" type="text/javascript"></script>, <script src="_Scripts/Filters.js" type="text/javascript"></script>, <script type="text/javascript">InitScrollBar('CountryorArea');</script>, <script type="text/javascript">InitScrollBar('Period');</script>, <script type

In [161]:
import requests
from bs4 import BeautifulSoup
## 
url = 'https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=pagenum'
## Total amount of municipal waste collected
## https://data.un.org/Data.aspx?d=ENV&f=variableID:1814&c=2,3,4,5&s=countryName:asc,yr:desc&v=1
i = 1
data = {}
response = requests.get(url.replace("&v=pagenum", str(i)))
soup = BeautifulSoup(response.content, 'html.parser')
page_count = soup.find("span", {"id":"spanPageCountB"}).text
while response.ok and i <= int(page_count):
    print("On page: ", i)
    # print(url.replace("=pagenum", f"={i}"))
    for tb in BeautifulSoup(response.content, 'html.parser').find_all("div", {"class":"DataContainer"}):
        data[i] = {x.text:[] for x in tb.find_all('th') if x is not None}
        for row in tb.find_all('tr'):
            if row.find('td') is not None:
                vals = [v.text for v in row.find_all('td') if v is not None]
                for pos, k in enumerate(data[i]):
                    if len(vals) > pos:
                        data[i][k].append(vals[pos])
    i += 1
    response = requests.get(url.replace("=pagenum", f"={i}"))
print(data)

On page:  1
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=1
On page:  2
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=2
On page:  3
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=3
On page:  4
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=4
On page:  5
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=5
On page:  6
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=6
On page:  7
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=7
On page:  8
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=8
On page:  9
https://data.un.org/Data.aspx?d=ENV&f=variableID:1780&c=2,3,4,5&s=countryName:asc,yr:desc&v=9
On page:  10
https://data.un.org/Data.aspx?d=E

In [124]:
data

{1: [<table border="0" cellpadding="0" cellspacing="0"><tr><th>Country or Area</th><th style="text-align:center;">Year</th><th style="text-align:right;border-right: none;">Value</th><th> </th><th>Unit</th></tr><tr><td>Albania</td><td style="text-align:center;">2014</td><td style="text-align:right;border-right: none;">0</td><td class="Footnotes"> </td><td>tonnes</td></tr><tr><td>Andorra</td><td style="text-align:center;">2021</td><td style="text-align:right;border-right: none;">0</td><td class="Footnotes"> </td><td>tonnes</td></tr><tr><td>Andorra</td><td style="text-align:center;">2020</td><td style="text-align:right;border-right: none;">0</td><td class="Footnotes"> </td><td>tonnes</td></tr><tr><td>Andorra</td><td style="text-align:center;">2019</td><td style="text-align:right;border-right: none;">0</td><td class="Footnotes"> </td><td>tonnes</td></tr><tr><td>Andorra</td><td style="text-align:center;">2018</td><td style="text-align:right;border-right: none;">0</td><td class="Footnotes"> 

In [157]:
https://unstats.un.org/unsd/envstats/Questionnaires/2022/Tables/Composition%20of%20Municipal%20Waste%20(latest%20year).xlsx

{1: {'Country or Area': ['Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belarus',
   'Belgium',
   'Belgium',
   'Belgium',
   'Belgium',
   'Belgium',
   'Belgium',
   'Belgium',
   'Belize',
   'Belize',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bermuda',
   'Bhutan',
   'Bhutan',
   'Botswana',
   'Botswana',
   'Botswana',
   'Bulgaria',
   'Bulgaria',
   'Bulgaria',
   'Bulgaria',
   'Bulgaria',
   'Bulgaria',
   'Bulgaria',
   'Burkina Faso',
   'Burkina Faso'],
  'Year': ['2021',
   '2020',
   '2019',
   '2018',
   '2017',
   '2016',
   '2015',
   '2014',
   '2013',
   '2012',
   '2011',
   '2010',
   '2009',
   '2008',
   '2007',
   '2006',
   '2005',
   '2016',
   '2014',
   '2012',
   '2010',
   '2008',
   '2006',
