# Notes

Before beginning, I downloaded the .csv version of the data so we can use its list of chemical names.

In [39]:
import selenium, requests, time, os, re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from PIL import Image
from tqdm.notebook import tqdm

# read in the source file
oehha = pd.read_csv('OEHHA-chemicals_2022-08-30T11-30-58.csv')

In [45]:
# start a list to keep track of which chemicals we've done already
# do it up here so it doesn't get reset after 
done = []

In [70]:
# webdriver must be headless to take a full-page screenshot
Options = webdriver.ChromeOptions()
Options.headless = True 
s=Service(r'<path_to_your_chrome_driver_here>')

# get a list of everything already downloaded so we can check against it
# os.chdir(r'L:\PRIV\ToxValDB\Document Webscrape\source_cal_oehha')
# done = [filename[10:-28] for filename in os.listdir(r'docs\pdf')]

# lists to catch any problematic names and the cause for their issues
messed_up_names = []
reasons = []
# list to hold URLs for metadata
urls = []
# list to hold new filenames
new_names = []

for i in tqdm(range(len(oehha))):
    try:
        # grab the name of the chemical for this round
        name = oehha['Title'].iloc[i]

        # check to see if this file has already been downloaded
        if name in done:
            continue

        # catch problematic names
        if bool(re.search(r'[\\/:;*?"<>|&]', name)):
            messed_up_names.append(name)
            reasons.append('forbidden character')
            continue

        # get a timestamp for the png
        timestamp = time.strftime("%Y%m%d_%H%M%S",time.localtime())

        # pull up the page we're screenshotting by pasting it into the search bar
        driver = webdriver.Chrome(service=s, options = Options)
        # URL = f'https://oehha.ca.gov/chemicals/' # used in the first run of the script, commented out in the second
        #driver.get(URL)
        time.sleep(0.25)
        # driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/a/span').click() # Ibid.
        # time.sleep(0.25)
        # driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/div/div/input').send_keys(name)
        # time.sleep(0.25)
        # driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/div/div/input').send_keys(Keys.ENTER)
        # time.sleep(0.25)
        URL = f'https://oehha.ca.gov/chemicals/{name.replace(" ", "-")}'
        driver.get(URL)
        time.sleep(0.25)

        # check to make sure the chemical name matches the name on the URL (see https://.../acetominophen (sic))
        if driver.find_element(By.XPATH, '//*[@id="page-title"]').text != name:
            messed_up_names.append(name)
            reasons.append('chemical name did not match page title')
            continue

        # get the URL and assign a new name
        urls.append(driver.current_url)
        time.sleep(0.25)
        new_name = driver.current_url.split('/')[-1]
        time.sleep(0.25)
        new_names.append(new_name)

        # Resize the window to the full size of the page (plus a little more width for readability)
        full_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        time.sleep(0.25)
        full_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        time.sleep(0.25)
        driver.set_window_size(full_width + 500, full_height)
        time.sleep(0.25)

        # screenshot everything inside the main content tag (everything but header and footer)
        content = driver.find_element(By.XPATH, '//*[@id="main-content"]')
        time.sleep(0.25)
        content.screenshot(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
        time.sleep(0.25)

        # close the driver
        driver.quit()

        # save the image as a pdf
        image = Image.open(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
        image_converted = image.convert('RGB')
        image_converted.save(f'docs\\pdf\\cal_oehha_{new_name}_webpage_{timestamp}.pdf', 
                             quality = 100)

        done.append(name)
    
    except:
        messed_up_names.append(name)
        reasons.append('unknown exception')

  0%|          | 0/1126 [00:00<?, ?it/s]

In [71]:
for name in messed_up_names:
    if name in done:
        messed_up_names.pop(messed_up_names.index(name))

In [73]:
print(len(done) + len(messed_up_names))
[print(messed_up_names[i], reasons[i]) for i in range(len(reasons))]

1124
Chlorinated paraffins (Avg. chain length C12; approx. 60 percent chlorine by weight) forbidden character
Cyanotoxins: Cylindrospermopsin forbidden character
D&C Orange No. 17 forbidden character
D&C Red No. 19 forbidden character
D&C Red No. 8 forbidden character
D&C Red No. 9 forbidden character
Gasoline engine exhaust (condensates/extracts) forbidden character
Glu-P-1 (2-Amino-6-methyldipyrido[1-2-a:3'-2'-d]imidazole) forbidden character
Glu-P-2 (2-Aminodipyrido[1,2-a:3',2'-d]imidazole) forbidden character
2,4-Hexadienal (89% trans, trans isomer; 11% cis, trans isomer) forbidden character
Manganese & Manganese Compounds forbidden character
Norethisterone (Norethindrone) /Ethinyl Estradiol forbidden character
Norethisterone (Norethindrone) /Mestranol forbidden character
Palygorskite fibers (> 5µm in length) forbidden character
Perfluorooctane Sulfonate  (PFOS) chemical name did not match page title
Retinol/Retinyl Esters forbidden character


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [74]:
still_messed_up = ['chlorinated p',
                   'cyanotoxins',
                   'Orange no. 17', 
                   'red no. 19',
                   'Red no. 8', 
                   'red no. 9',
                   'gasoline e', 
                   'glu-p-1', 
                   'glu-p-2',
                   '2,4-hex',
                   'manganese',
                   'norethisterone (norethindrone) /e',
                   'norethisterone (norethindrone) /m',
                   'palygor',
                   'perfluorooctane sulfonate',
                   'retinol']

In [77]:
#
##
###
#### Taking care of the remaining pages that were not downloaded earlier #################################
###
##
#

for i in tqdm(range(len(messed_up_names))):
    try:
        # grab the name of the chemical for this round
        name = messed_up_names[i]
        # and get the shortened version we just found
        name_abbrev = still_messed_up[i]

        # check to see if this file has already been downloaded
        if name in done:
            continue

        # don't need to worry about catching problematic names

        # get a timestamp for the png
        timestamp = time.strftime("%Y%m%d_%H%M%S",time.localtime())

        # pull up the page we're screenshotting by pasting it into the search bar
        driver = webdriver.Chrome(service=s, options = Options)
        URL = f'https://oehha.ca.gov/chemicals/' # used in the first run of the script, commented out in the second
        driver.get(URL)
        time.sleep(0.25)
        driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/a/span').click() # Ibid.
        time.sleep(0.25)
        driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/div/div/input').send_keys(name_abbrev)
        time.sleep(0.25)
        driver.find_element(By.XPATH, '//*[@id="edit_jump_chosen"]/div/div/input').send_keys(Keys.ENTER)
        time.sleep(0.25)


        # check to make sure the chemical name matches the name on the URL (see https://.../acetominophen (sic))
        if driver.find_element(By.XPATH, '//*[@id="page-title"]').text != name:
            messed_up_names.append(name)
            reasons.append('chemical name did not match page title')
            continue

        # get the URL and assign a new name
        urls.append(driver.current_url)
        time.sleep(0.25)
        new_name = driver.current_url.split('/')[-1]
        time.sleep(0.25)
        new_names.append(new_name)

        # Resize the window to the full size of the page (plus a little more width for readability)
        full_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        time.sleep(0.25)
        full_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        time.sleep(0.25)
        driver.set_window_size(full_width + 500, full_height)
        time.sleep(0.25)

        # screenshot everything inside the main content tag (everything but header and footer)
        content = driver.find_element(By.XPATH, '//*[@id="main-content"]')
        time.sleep(0.25)
        content.screenshot(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
        time.sleep(0.25)

        # close the driver
        driver.quit()

        # save the image as a pdf
        image = Image.open(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
        image_converted = image.convert('RGB')
        image_converted.save(f'docs\\pdf\\cal_oehha_{new_name}_webpage_{timestamp}.pdf', 
                             quality = 100)

        done.append(name)
    
    except:
        messed_up_names.append(name)
        reasons.append('unknown exception')

  0%|          | 0/16 [00:00<?, ?it/s]

We need to do it one last time to catch the final straggler -- PFOS.

In [80]:
name = oehha['Title'].tolist().index('Perfluorooctane Sulfonate  (PFOS)')
timestamp = time.strftime("%Y%m%d_%H%M%S",time.localtime())
driver = webdriver.Chrome(service=s, options = Options)
URL = f'https://oehha.ca.gov/chemicals/perfluorooctane-sulfonate-pfos'
driver.get(URL)

# get the URL and assign a new name
urls.append(driver.current_url)
time.sleep(0.25)
new_name = driver.current_url.split('/')[-1]
time.sleep(0.25)
new_names.append(new_name)

# Resize the window to the full size of the page (plus a little more width for readability)
full_width = driver.execute_script('return document.body.parentNode.scrollWidth')
time.sleep(0.25)
full_height = driver.execute_script('return document.body.parentNode.scrollHeight')
time.sleep(0.25)
driver.set_window_size(full_width + 500, full_height)
time.sleep(0.25)

# screenshot everything inside the main content tag (everything but header and footer)
content = driver.find_element(By.XPATH, '//*[@id="main-content"]')
time.sleep(0.25)
content.screenshot(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
time.sleep(0.25)

# close the driver
driver.quit()

# save the image as a pdf
image = Image.open(f'docs\\png\\cal_oehha_{new_name}_webpage_{timestamp}.png')
image_converted = image.convert('RGB')
image_converted.save(f'docs\\pdf\\cal_oehha_{new_name}_webpage_{timestamp}.pdf', 
                     quality = 100)

done.append(name)

In [93]:
png = [filename[10:-28] for filename in os.listdir(r'docs\png')]
pdf = [filename[10:-28] for filename in os.listdir(r'docs\pdf')]

In [99]:
len(png) - len(set(png)) # looks like there are two duplicates -- deleting one of each manually
len(png) # well now there are only 1125 pngs. we're one short.

1125

In [125]:
pd.DataFrame({'png': png}).value_counts()

png                                                              
                                                                     1
molinate                                                             1
mopp-vincristine-prednisone-nitrogen-mustard-procarbazine-mixture    1
monocrotaline                                                        1
mon-4660-dichloroacetyl-1-oxa-4-azaspiro45-decane                    1
                                                                    ..
ci-disperse-yellow-3                                                 1
ci-solvent-yellow-14                                                 1
ciclosporin                                                          1
cidofovir                                                            1
zineb                                                                1
Length: 1125, dtype: int64

In [101]:
len(pdf) - len(set(pdf)) # they're all unique, but
len(pdf) # we're two short here

1124

In [116]:
print(len(oehha['Title']))
print(len(done))
for name in oehha['Title']:
    if name not in done:
        print(f'{name} has not been marked \'done\'')

1126
1124
Perfluorooctane Sulfonate  (PFOS) has not been marked 'done'


In [123]:
len(set(oehha['Title'])) # hang on

1124

In [124]:
pd.DataFrame({'titles': oehha['Title']}).value_counts()

titles                              
Nickel compounds                        2
Cadmium                                 2
N-Nitrosomethylvinylamine               1
Nalidixic Acid                          1
Naled                                   1
                                       ..
Cyanotoxins: Cylindrospermopsin         1
Cycasin                                 1
Cycloate                                1
Cyclohexanol                            1
trichlorotrifluoroethane (Freon 113)    1
Length: 1124, dtype: int64

So with 1124 unique pdfs and 1125 unique items in png (one of which is simply titled '' and thus presumably some kind of hidden system file, as it's not showing up in the file explorer) I think we're finally good to upload.

In [143]:
png_lengths = [len(i) for i in png]
png[png_lengths.index(0)-3:png_lengths.index(0)+1]
#png_lengths.sort()
#print(png_lengths)

['zileuton', 'zinc', 'zineb', '']