### Web scraping with Selenium 
for missing nutritional values in Anglian Tap Water, since this data is missing in the water quality dataset <br>
from: https://waterquality.anglianwater.com/map.aspx
<br>
160 Public Water Supply Zones. Report info only accessible with postcode input; boarders do not align with districts etc.<br>
found error messages in pop-up (OK button): "Error retrieving data or area not covered by Anglian Water", "Please provide a valid postcode." <br>
also possible: This area is covered by '<different water supplier>' ...and more text in iframe <br>
<br>
15 postcodes from different supply zones, looked up manually, used for proof of concept <br>
Info wanted from report: Calcium, Magnesium, Sodium, Chloride, Fluoride, Nitride, Potassium, Fluoride, pH

In [7]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [38]:
import pandas as pd
import re

In [51]:
# read the postcodes to use from file to a list
zone_postcode = pd.read_csv('data/zone_postcode.csv')
postcodes = zone_postcode.postcode.to_list()
print(postcodes)

# initialize data frame to store gathered information
headers= ['zone','source','parameter', 'value', 'unit', 'legal_limit','over_legal_limit']
nutrients = pd.DataFrame(columns = headers)

# specify values and localization of desired values
target_values = ["Calcium", "Magnesium", "Sodium", "Chloride", "Fluoride", "Nitride", "Potassium", "Fluoride", "pH"]
columns_to_store = [0,5,2,1,7] # see headers after source

['PE31 7LR', 'PE12 7LR', 'PE33 9HP', 'NR19 2TF', 'IP27 9FD', 'NR12 0AN', 'CB6 3NN', 'IP30 0TL', 'PE12 9RW', 'PE10 9NJ', 'NN17 4AP', 'MK42 9DJ', 'NN6 8EH', 'PE29 3DD', 'CO9 1JD']


In [52]:
# set firefox as webdriver
driver = webdriver.Firefox()

# open website
driver.get("https://waterquality.anglianwater.com/map.aspx")

# setup wait for later
wait = WebDriverWait(driver, 10)

# save current window handle
window = driver.current_window_handle

# loop through the postcodes list
for postcode in postcodes:

    # find textbox on website, clear, insert postcode and send
    text_box = driver.find_element(By.ID, "frmPostcode")
    text_box.clear()
    text_box.send_keys(postcode)
    text_box.send_keys(Keys.RETURN)
        
    # enter imbedded frame,waite for page to load and find report link
    driver.switch_to.frame("zoneInformation")
    report_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "Drinking Water Quality Report")))
    
    # breaks the loop in case of postcode out of Anglian Water Zone
    if "This area is covered by" in driver.page_source:
        break
    else:
        # find zone information and store just the info between brackets
        zone_text = driver.find_element(By.XPATH,"/html/body/div[3]/p").text
        zone = re.search(r'\((.*?)\)', zone_text).group(1)

    #clicks the link
    report_link.click()

    # wait for second window to open
    wait.until(EC.number_of_windows_to_be(2))

    #loop to make sure the opened report windows is active           
    for window_handle in driver.window_handles:
        if window_handle != window:
            driver.switch_to.window(window_handle)
            break

    # wait for page load, find and store source information
    source =  wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div/div[5]/dl/dd[2]/p"))).text
    
    # find table in report
    table = driver.find_element(By.CSS_SELECTOR,"div.content:nth-child(8) > table:nth-child(1) > tbody:nth-child(2)")
    
    # find all rows in the table
    rows = table.find_elements(By.TAG_NAME, "tr")

    #lLoop through target_values list
    for target_value in target_values:

        # loop through each row of the table and check if the target value is present
        for row in rows:

            # first cell in the current row
            first_cell = row.find_elements(By.TAG_NAME, "td")[0]
            
            # target value in the first cell
            if target_value in first_cell.text:
                # if found, store the text from specified columns
                target_row = [row.find_elements(By.TAG_NAME, "td")[index].text for index in columns_to_store]
                # store values in a new row of the data frame in the correct columns
                nutrients.loc[len(nutrients),['zone','source']] = [zone,source]
                nutrients.loc[(len(nutrients)-1),['parameter', 'value', 'unit', 'legal_limit','over_legal_limit']] = target_row

    #close current window and switch to original
    driver.close()
    driver.switch_to.window(window)
# closes webdriver after postcode loop
driver.close()

In [53]:
nutrients

Unnamed: 0,zone,source,parameter,value,unit,legal_limit,over_legal_limit
0,FE44,Your drinking water supply comes from a ground...,Calcium,117.37,mg/l,No legal Limit,0
1,FE44,Your drinking water supply comes from a ground...,Magnesium,2.864,mg/l,No legal Limit,0
2,FE44,Your drinking water supply comes from a ground...,Sodium,19.258,mg/l,200,0
3,FE44,Your drinking water supply comes from a ground...,Chloride,53.2,mg/l,250,0
4,FE44,Your drinking water supply comes from a ground...,Fluoride,0.1,mg/l,1.5,0
...,...,...,...,...,...,...,...
112,FW41,Your drinking water supply comes from a surfac...,Chloride,74.3,mg/l,250,0
113,FW41,Your drinking water supply comes from a surfac...,Fluoride,0.29,mg/l,1.5,0
114,FW41,Your drinking water supply comes from a surfac...,Potassium,8.953,mg/l,No legal Limit,0
115,FW41,Your drinking water supply comes from a surfac...,Fluoride,0.29,mg/l,1.5,0


In [62]:
nutrients.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 0 to 116
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   zone              117 non-null    object 
 1   source            117 non-null    object 
 2   parameter         117 non-null    object 
 3   value             117 non-null    float64
 4   unit              117 non-null    object 
 5   legal_limit       117 non-null    object 
 6   over_legal_limit  117 non-null    object 
dtypes: float64(1), object(6)
memory usage: 11.4+ KB


In [60]:
# '<' means :Below the limit of detection of the analysis. For the purpose of this analysis the sign is just ignored
nutrients.value = nutrients.value.str.replace('<','')

In [61]:
nutrients.value = pd.to_numeric(nutrients.value)

In [54]:
from sql_functions import upload

upload(nutrients,'anglian_nutrients')

The anglian_nutrients table was imported successfully.


In [63]:
nutrients.groupby('parameter')['value'].mean()

parameter
Calcium              109.300267
Chloride              56.140000
Fluoride               0.226067
Magnesium              5.968067
Potassium              3.385583
Sodium                25.476733
pH (Hydrogen ion)      7.498667
Name: value, dtype: float64