# Main script

Important note: to make this script work properly, I renamed several chemicals in the `iris_assessment_table...` Excel sheet. Several rows shared the same chemical name but pointed to different chemicals, each with its own CAS number and a unique title on its actual webpage. These were renamed to agree with their webpages, e.g., the two rows with chemical name "Chlorobutane" were renamed "1-Chlorobutane" and "2-Chlorobutane." Additionally, two chemical names contained forward slashes, which would break the output filenames; the alternate names after the slashes were placed in parentheses instead.

In [6]:
import selenium, requests, time, os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from PIL import Image

# read in the source file; the cells below read its 'Chemical Name', 'Chemical URL'
# and 'IRIS Summary URL' columns, one row at a time
iris_source = pd.read_excel('iris_assessment_table_unique_names_20220721.xlsx')

In [13]:
# Base class of the Selenium errors raised when an element is missing or cannot be clicked.
# Imported here so this cell does not depend on an edit to the import cell.
from selenium.common.exceptions import WebDriverException

# webdriver must be headless to take a full-page screenshot
Options = webdriver.ChromeOptions()
# pass the flag directly instead of using the `headless` attribute setter, which newer
# Selenium 4 releases deprecate; the setter simply appended this same argument
Options.add_argument('--headless')
s = Service(r'C:\Users\JWILLI29\OneDrive - Environmental Protection Agency (EPA)\Profile\Documents\chromedriver.exe')

# output locations
SCREENSHOT_DIR = r'output\chemicals\screenshots'
CHEMICAL_DIR = r'output\chemicals'
SUMMARY_DIR = r'output\summaries'

# cells that hold the row-expansion buttons: rows 1 and 3 of the first two data tables
EXPAND_XPATHS = [
    f'//*[@id="DataTables_Table_{table}"]/tbody/tr[{row}]/td[1]'
    for table in (0, 1)
    for row in (1, 3)
]

# get the screenshots already taken, if any, to check against during the loop
# (a set, so each membership test is O(1))
downloaded = set(os.listdir(SCREENSHOT_DIR))
# list to catch any names containing problematic characters
messed_up_names = []
# summary URLs that could not be downloaded; kept outside the loop so it accumulates over the whole run
errors = []
total = len(iris_source)

for i, (_, row) in enumerate(iris_source.iterrows()):
    chem_name = row['Chemical Name']

    # check to see if this file has already been downloaded
    if chem_name + '.png' in downloaded:
        continue

    # check to see if chemical name contains a slash that will mess up my filenames
    if '\\' in chem_name or '/' in chem_name:
        messed_up_names.append(chem_name)
        continue

    # get a timestamp for the chemical page
    chem_timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    png_path = os.path.join(SCREENSHOT_DIR, chem_name + '.png')
    pdf_path = os.path.join(CHEMICAL_DIR, 'iris_' + chem_name + '_webpage_' + chem_timestamp + '.pdf')

    # pull up the page we're screenshotting
    URL = row['Chemical URL']
    driver = webdriver.Chrome(service=s, options=Options)
    try:
        driver.get(URL)
        time.sleep(1)

        # click each expansion button; a page that lacks one (or where it cannot be clicked) is left as-is
        for xpath in EXPAND_XPATHS:
            try:
                driver.find_element(By.XPATH, xpath).click()
                time.sleep(0.25)
            except WebDriverException:
                pass

        # Resize the window to the full size of the page (plus a little more width for readability)
        full_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        time.sleep(0.33)
        full_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        time.sleep(0.33)
        driver.set_window_size(full_width + 500, full_height)
        time.sleep(0.33)

        # screenshot everything inside the main content tag (everything but header and footer)
        content = driver.find_element(By.XPATH, '//*[@id="main-content"]')
        time.sleep(0.25)
        content.screenshot(png_path)
        time.sleep(0.5)
    finally:
        # always close the driver, even if the page failed, so no browser process is left behind
        driver.quit()

    # save the image as a pdf; the context manager releases the .png file handle
    with Image.open(png_path) as image:
        image.convert('RGB').save(pdf_path, quality=100)

    # get a timestamp for the summary
    sum_timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    summary_url = row['IRIS Summary URL']
    summary_path = os.path.join(SUMMARY_DIR, 'iris_' + chem_name + '_summary_' + sum_timestamp + '.pdf')
    # try downloading the summary pdf; on failure record the URL and write nothing, rather than
    # crashing on an unbound `response` or saving the previous chemical's content under this name
    try:
        response = requests.get(summary_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        errors.append(summary_url)
    else:
        with open(summary_path, 'wb') as summary_file:
            summary_file.write(response.content)

    # progress: timestamp, percent of rows processed, chemical name
    print(chem_timestamp, (i + 1) / total * 100, chem_name)

# a rerun skips any chemical whose screenshot exists, so failed summaries must be surfaced here
if errors:
    print('Summary downloads that failed:', errors)

20220722_122121 100.0 Zineb


# Resolving errors

In [8]:
messed_up_names
# The first run flagged two names containing slashes. I renamed the first to "Cyhalothrin (Karate)" and the second to
# "Hydrazine (Hydrazine Sulfate)," as the CAS given is for hydrazine but I didn't want to throw away information.
# I then reran the script: it skips every chemical whose screenshot is already in output\chemicals\screenshots,
# so only those two were processed.

[]

In [14]:
# A quick flip through the screenshots folder revealed only one page that needed to be corrected, as it had
# more expansion buttons than the main loop clicks. We can just correct this one manually.
Options = webdriver.ChromeOptions()
# pass the flag directly instead of using the `headless` attribute setter, which newer
# Selenium 4 releases deprecate; the setter simply appended this same argument
Options.add_argument('--headless')
s = Service(r'C:\Users\JWILLI29\OneDrive - Environmental Protection Agency (EPA)\Profile\Documents\chromedriver.exe')
URL = "https://iris.epa.gov/ChemicalLanding/&substance_nmbr=197"

# expansion-button cells on this page: rows 1 and 3 of the first table, odd rows 1-13 of the second
xpaths = [f'//*[@id="DataTables_Table_0"]/tbody/tr[{row}]/td[1]' for row in (1, 3)]
xpaths += [f'//*[@id="DataTables_Table_1"]/tbody/tr[{row}]/td[1]' for row in range(1, 15, 2)]

png_path = r'output\chemicals\screenshots\1,1,1-Trichloroethane.png'

driver = webdriver.Chrome(service=s, options=Options)
try:
    chem_timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    driver.get(URL)
    time.sleep(0.25)

    # unlike the main loop, a missing button is an error here: every one of these is expected to exist
    for x in xpaths:
        driver.find_element(By.XPATH, x).click()
        time.sleep(0.25)

    # Resize the window to the full size of the page (plus a little more width for readability)
    full_width = driver.execute_script('return document.body.parentNode.scrollWidth')
    time.sleep(0.25)
    full_height = driver.execute_script('return document.body.parentNode.scrollHeight')
    time.sleep(0.25)
    driver.set_window_size(full_width + 500, full_height)
    time.sleep(0.25)

    # screenshot everything inside the main content tag (everything but header and footer)
    content = driver.find_element(By.XPATH, '//*[@id="main-content"]')
    time.sleep(0.25)
    content.screenshot(png_path)
    time.sleep(0.25)
finally:
    # close the driver even if a click failed, so no browser process is left behind
    driver.quit()

# save the image as a pdf; the context manager releases the .png file handle
with Image.open(png_path) as image:
    image.convert('RGB').save(r'output\chemicals\iris_1,1,1-Trichloroethane_webpage_' + chem_timestamp + '.pdf',
                              quality=100)

In [16]:
# Sanity check: list each output folder and compare how many entries each one holds.
output_dirs = (r'output\chemicals', r'output\chemicals\screenshots', r'output\summaries')
chemicals, screenshots, summaries = map(os.listdir, output_dirs)

print(*(len(listing) for listing in (chemicals, screenshots, summaries)))

# Expected: 571 files + 1 subfolder in the chemicals directory, which matches,
# and 571 summaries, one per chemical. The screenshots folder should also hold 571, so why 572?

572 572 571


In [28]:
# count how many times each filename appears in the screenshots listing
pd.DataFrame(screenshots).value_counts()
# Every filename shown has a count of 1, so there aren't any duplicate files in there...

1,1,1,2-Tetrachloroethane.png            1
Naphthalene.png                          1
Metribuzin.png                           1
Mirex.png                                1
Molinate.png                             1
                                        ..
Decabromodiphenyl ether (BDE-209).png    1
Danitol.png                              1
Dalapon, sodium salt.png                 1
Dacthal.png                              1
trans-1,2-Dichloroethylene.png           1
Length: 572, dtype: int64

In [49]:
# Webpage PDFs are named iris_<name>_webpage_<YYYYMMDD_HHMMSS>.pdf and screenshots <name>.png,
# so stripping these fixed-length pieces leaves the bare chemical name in both cases.
PREFIX_LEN = len('iris_')                          # 5
SUFFIX_LEN = len('_webpage_YYYYMMDD_HHMMSS.pdf')   # 28
PNG_EXT_LEN = len('.png')                          # 4

# sort each listing once, rather than re-sorting both on every iteration
sorted_chemicals = sorted(chemicals)
sorted_screenshots = sorted(screenshots)

# walk the two sorted listings in lockstep and report the first pair that disagrees;
# zip stops at the shorter listing instead of raising IndexError when the folders differ in size
for pdf_name, png_name in zip(sorted_chemicals, sorted_screenshots):
    if pdf_name[PREFIX_LEN:-SUFFIX_LEN] != png_name[:-PNG_EXT_LEN]:
        print(pdf_name + " =/= " + png_name)
        break

# Okay, looks like there's a hidden system file containing thumbnails of all the .pngs. I'm satisfied.

iris_Toluene diisocyanate mixture (TDI)_webpage_20220722_112412.pdf =/= Thumbs.db
