In [1]:
#Script to fill in the file name field of the clowder log based on how each file appears in clowder

import time as t
import requests
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv #environment files for credentials. Make sure to pip install dotenv first


#Load credentials (the Clowder API key) from a local .env file into the process environment
load_dotenv()
#Script to fill in the file names from clowder, source, and subsource metadata fields
df = pd.read_excel(r"L:\PRIV\ToxValDB\Document Webscrape\pprtv_ornl\clowder_log_source_pprtv_ornl_20220727_temporary.xlsx")
api_key = os.environ.get('apiKey')
if not api_key:
    #Fail early with a clear message instead of sending unauthenticated requests row after row
    raise RuntimeError("No 'apiKey' value found in the environment; add it to the .env file before running")
url = 'https://clowder.edap-cluster.com/api/files/'
source = 'PPRTV ORNL'
#These fields are blank for all files that were acquired in the webscrape
subsource = ''
doi = ''
hero_id = ''
dsstox_structure_id = ''


def extract_clowder_file_name(text, extensions=('.pdf', '.png')):
    """Return the file name embedded in the raw text of a Clowder metadata response.

    The name is taken to start 11 characters after the first occurrence of
    'filename' and to end at the first known extension found after that point.
    NOTE(review): the offset of 11 presumably skips the 8-character key plus
    the three characters '":"' that follow it -- confirm against a real response.

    Parameters:
        text: body of the metadata response, as a string.
        extensions: file extensions that can terminate the name.

    Returns:
        The file name, including its extension.

    Raises:
        ValueError: if 'filename' is missing or no known extension follows it.
    """
    key_position = text.find('filename')
    if key_position == -1:
        raise ValueError("response does not contain a 'filename' field")
    start = key_position + 11
    #Search only after the start of the name so an extension earlier in the response cannot be picked up
    ends = []
    for extension in extensions:
        position = text.find(extension, start)
        if position != -1:
            ends.append(position + len(extension))
    if not ends:
        raise ValueError("no " + " or ".join(extensions) + " extension found after the 'filename' field")
    #The name stops at the first extension that follows it
    return text[start:min(ends)]


#Start loop in position 3, first entries in excel sheet are for the folder names which do not contain chemical info.
for i in range(3, len(df)):
    #Fill in all of the static fields first
    df.loc[i,'file_name'] = ""
    df.loc[i,'source'] = source
    df.loc[i,'subsource'] = subsource
    df.loc[i,'doi'] = doi
    df.loc[i,'hero_id'] = hero_id
    df.loc[i,'dsstox_structure_id'] = dsstox_structure_id
    #Access the clowder files
    clowder_id = df.loc[i,'uuid']
    t.sleep(0.5) #Courtesy sleep time
    #Clowder API request to get data about the file connected to the clowder id accessed above
    #The timeout keeps one unresponsive request from hanging the whole run
    response = requests.get(url+clowder_id+'/metadata', headers = {'X-API-Key': api_key}, timeout = 30)
    #Stop on HTTP errors rather than trying to parse an error page as metadata
    response.raise_for_status()
    #Look only for the file name as it appears in clowder
    #Note: This should be the same as the subDir2 field, but just in case we wrote this section
    try:
        file_name = extract_clowder_file_name(response.text)
    except ValueError as err:
        #Name the offending row: previously a missing extension silently reused the previous row's end index
        raise ValueError("row " + str(i) + " (uuid " + str(clowder_id) + "): " + str(err)) from err
    #Put the file name in its location in the dataframe
    df.loc[i,'file_name'] = file_name
print('All done!!')

All done!!


In [13]:
#Next, find the url for the chemical support pdf and put it in our dataframe
#This is ONLY FOR THE PDF SUPPORT FILES. For all screenshots, the url does not change.
#There were a few exceptions that may be problematic. To ensure that we get the correct url, we will go to the page we link
#and see if it exists (status code 200 or status code 404)

import time as t
import requests
import numpy as np
import pandas as pd

#df = pd.read_excel(r"C:\\Users\\mmille16\\desktop\\temp_clowder_log_final.xlsx")

#URLs necessary to access the files downloaded in the webscrape.
pdf_url = 'https://hhpprtv.ornl.gov/issue_papers/'
png_url = 'https://hhpprtv.ornl.gov/quickview/pprtv.php'

#Common punctuation and symbols that cause problems for the support pdf urls
punc = "',-()"
brak = '[]'


def url_is_live(link):
    """Return True when a GET request for link is answered with status code 200."""
    #stream=True defers the body download, so the PDF itself is not fetched just to read the status code
    #The timeout keeps one unresponsive request from hanging the whole run
    with requests.get(link, stream=True, timeout=30) as response:
        return response.status_code == 200


def remove_characters(text, unwanted):
    """Return text with every character listed in unwanted removed."""
    return text.translate(str.maketrans('', '', unwanted))


for i in range(3, len(df)):
    #URL only changes for pdf files. lets do them first
    file_name = df.loc[i,'file_name']
    if '.pdf' in file_name:
        if 'ornl' not in file_name or 'summary' not in file_name:
            #Report names that do not follow the expected pattern instead of aborting the whole run
            print('could not find the chemical name in '+file_name)
        else:
            #Start by finding the chemical name. It is in the file name between 'ornl' and 'summary'
            chem_start = file_name.index('ornl')+5
            chem_end = file_name.index('summary')-1
            chem_name = file_name[chem_start:chem_end]
            #Next, remove pesky punctuation and spaces
            chem_name = remove_characters(chem_name, " " + punc)
            t.sleep(0.5) #Courtesy sleep time
            #Use requests to see if the url we create with the pdf url and chemical name is valid
            link_found = url_is_live(pdf_url+chem_name+'.pdf')
            #One chemical needed to remove its brackets. Here is where we catch that exception.
            if not link_found:
                chem_name = remove_characters(chem_name, brak)
                t.sleep(0.5)
                #After removing brackets, try again, stop, and tell me which chemical needs more work (if any)
                link_found = url_is_live(pdf_url+chem_name+'.pdf')
            if not link_found:
                print('the link for '+chem_name+' was not retrieved')
            else:
                #Fill in the url section for the pdfs with the url we accessed in requests
                df.loc[i,'url'] = pdf_url+chem_name+'.pdf'
    #Fill in the png urls (the drop down menu does not change the url used to get the screenshots)
    if '.png' in file_name:
        df.loc[i,'url'] = png_url
print('All done!!')

All done!!


In [14]:
#Script to fill in the year metadata fields as they appear on the webpage for each chemical
#First access the webpage for the chemicals, then find the year. It is located in one of the textboxes.
import time as t
import requests
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

#df = pd.read_excel(r"C:\\Users\\mmille16\\desktop\\temp_clowder_log_final.xlsx")
#Initial set up of a headless selenium webdriver
Options = webdriver.ChromeOptions()
#Use the command-line flag: the `headless` attribute was deprecated and later removed in Selenium 4,
#so assigning to it on current releases does nothing and the browser would open visibly
Options.add_argument('--headless')
s=Service(r'C:\Users\mmille16\Documents\chromedriver.exe')
driver = webdriver.Chrome(service=s, options = Options)
URL = 'https://hhpprtv.ornl.gov/quickview/pprtv.php'

#try/finally guarantees the browser is shut down even if a lookup below raises
try:
    driver.get(URL)

    for i in range(3,len(df)):
        #Obtain the chemical name. It is between 'ornl' and 'summary' for support .pdfs and between 'ornl' and 'webpage' for .pngs.
        file_name = df.loc[i,'file_name']
        #.png is checked first so that it wins if both extensions ever appear in one name
        if '.png' in file_name:
            end_marker = 'webpage'
        elif '.pdf' in file_name:
            end_marker = 'summary'
        else:
            #Previously such a row silently reused the indices computed for the previous row
            print('skipping row '+str(i)+': '+str(file_name)+' is neither a .pdf nor a .png')
            continue
        chem_start = file_name.index('ornl')+5
        chem_end = file_name.index(end_marker)-1
        chem_name = file_name[chem_start:chem_end]
        t.sleep(0.5) #Courtesy sleep time before interacting with webpage

        #Select the dropdown menu option based on chemical name
        select = Select(driver.find_element(By.ID, "mySelect"))
        select.select_by_visible_text(chem_name)

        #Find the date field in the text box
        #NOTE(review): assumes the h4 already shows the newly selected chemical when it is read -- confirm, or add an explicit wait
        element = driver.find_element(By.TAG_NAME,'h4')
        #Fill in the year location with the year from the webpage (last four characters of the h4 text)
        df.loc[i,'year'] = element.text[-4:]
finally:
    driver.quit()
print('All done!!!')

All done!!!


In [15]:
#Finally, lets get the study names. They are different from the file names for the pdfs and are nonexistent for the pngs.
#Using pdfminer six, we can read the first page of the pdf and look for the title.
#The title always starts with 'Provisional' but the ending can be a little hard to find. Check for 3 possibilities
#The ending can come at the end of the CASRN in parentheses, at the beginning of "U.S. EPA", or at the beginning of "Superfund"
#Not all pdfs contain every one of these markers, so each lookup has to tolerate a missing marker (e.g. via try-except)

import time as t
import requests
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_text
from pdfminer.high_level import extract_pages

#Open pandas dataframe
#df = pd.read_excel(r"C:\\Users\\mmille16\\desktop\\temp_clowder_log_final.xlsx")

#File path to pdfs
file_path = r"L:\\PRIV\ToxValDB\\Document Webscrape\\pprtv_ornl\\output\\support_pdfs\\"


def find_study_title(text):
    """Locate the study title inside the whitespace-normalised text of a pdf's first page.

    The title starts at the first occurrence of 'Provisional'. It ends at the
    earliest of: just after the last ')' on the page, the start of
    'Superfund', or the start of 'U.S.'. Only markers at or after the start
    of the title are considered.

    Parameters:
        text: first-page text with runs of whitespace collapsed to single spaces.

    Returns:
        The title with surrounding whitespace removed, or None when the start
        of the title or every possible ending is missing.
    """
    title_start = text.find('Provisional')
    if title_start == -1:
        return None
    #This is where we will store the indices of possible title endings
    possible_ends = []
    #Index just past the last closing parenthesis on the page
    last_par = text.rfind(')')
    if last_par >= title_start:
        possible_ends.append(last_par + 1)
    #"Superfund" and "U.S." (as in "U.S. EPA") follow the title on some pdfs; either may be absent
    for marker in ('Superfund', 'U.S.'):
        marker_index = text.find(marker, title_start)
        if marker_index != -1:
            possible_ends.append(marker_index)
    if not possible_ends:
        return None
    #Our title ends at the earliest index of the three possibilities
    #strip() drops the space left behind when the title is cut just before "Superfund" or "U.S."
    return text[title_start:min(possible_ends)].strip()


for i in range(3,len(df)):
    #Name of pdf in the file directory. Will use this to open the pdf in pdfminer six
    file_name = df.loc[i,'file_name']
    #Reset for every row so a row without a title can never inherit the previous row's title
    title = ""
    if '.pdf' in file_name:
        #Extract the text on the first page of the support pdf
        text = extract_text(file_path + file_name, page_numbers = [0])
        #Get rid of white space and new lines
        text = " ".join(text.split())
        found_title = find_study_title(text)
        if found_title is None:
            #Tell me which pdf needs more work instead of aborting the whole run
            print('no title could be located in '+file_name)
        else:
            title = found_title
            #Sanity check. Do not need to print out the titles every time.
            print(title)
    #Screenshots are not from pdf support documents and do not have titles that I am aware of.
    if '.png' in file_name:
        title = ""
    #All papers come from U.S.EPA
    author = 'U.S. Environmental Protection Agency'
    #Fill in the dataframe with the titles and authors in their respective locations.
    df.loc[i,'study_name'] = title
    df.loc[i,'author'] = author
print('All done!!!')

Provisional Peer-Reviewed Toxicity Values for Acenaphthene (CASRN 83-32-9)
Provisional Peer Reviewed Toxicity Values for Acenaphthylene (CASRN 208-96-8)
Provisional Peer-Reviewed Toxicity Values for Acetone Cyanohydrin (CASRN 75-86-5)
Provisional Peer-Reviewed Toxicity Values for Acetophenone (CASRN 98-86-2)
Provisional Peer Reviewed Toxicity Values for Acrolein (CASRN 107-02-8)
Provisional Peer-Reviewed Toxicity Values for Acrylic Acid (CASRN 79-10-7)
Provisional Peer Reviewed Toxicity Values for Adiponitrile (CASRN 111-69-3)
Provisional Peer Reviewed Toxicity Values for Aldrin (CASRN 309-00-2)
Provisional Peer Reviewed Toxicity Values for Alizarin Red Compounds (Various CASRNs)
Provisional Peer-Reviewed Toxicity Values for Allyl alcohol (CASRN 107-18-6)
Provisional Peer Reviewed Toxicity Values for Aluminum (CASRN 7429-90-5)
Provisional Peer-Reviewed Toxicity Values for 4-Amino-2,6-dinitrotoluene (CASRN 19406-51-0)
Provisional Peer-Reviewed Toxicity Values for 2-Amino-4,6-dinitrotolu

Provisional Peer-Reviewed Toxicity Values for o-Chlorotoluene (CASRN 95-49-8)
Provisional Peer-Reviewed Toxicity Values for p-Chlorotoluene (CASRN 106-43-4)
Provisional Peer Reviewed Toxicity Values for Chrysene (CASRN 218-01-9)
Provisional Peer Reviewed Toxicity Values for Cobalt (CASRN 7440-48-4)
Provisional Peer-Reviewed Toxicity Values for Complex Mixtures of Aliphatic and Aromatic Hydrocarbons (CASRN Various)
Provisional Peer-Reviewed Toxicity Values for Coronene (CASRN 191-07-1)
Provisional Peer-Reviewed Toxicity Values for trans-Crotonaldehyde (CASRN 123-73-9)
Provisional Peer-Reviewed Toxicity Values for Cyanogen Bromide (CASRN 506-68-3)
Provisional Peer-Reviewed Toxicity Values for Cyclohexane (CASRN 110-82-7)
Provisional Peer-Reviewed Toxicity Values for Cyclohexanone (CASRN 108-94-1)
Provisional Peer-Reviewed Toxicity Values for Cyclohexene (CASRN 110-83-8)
Provisional Peer Reviewed Toxicity Values for o,p’-Dichlorodiphenyldichloroethane (o,p’-DDD) (CASRN 53-19-0)
Provisiona

Provisional Peer-Reviewed Toxicity Values for Fluoranthene (CASRN 206-44-0)
Provisional Peer Reviewed Toxicity Values for Fluorene (CASRN 86-73-7)
Provisional Peer-Reviewed Toxicity Values for Fluorobenzene (CASRN 462-06-6)
Provisional Peer Reviewed Toxicity Values for 2-Fluorobiphenyl (CASRN 321-60-8)
Provisional Peer Reviewed Toxicity Values for 2-Fluorophenol (CASRN 367-12-4)
Provisional Peer-Reviewed Toxicity Values for Formic Acid (CASRN 64-18-6)
Provisional Peer-Reviewed Toxicity Values for Stable (Nonradioactive) Gadolinium (CASRN 7440-54-2) and Soluble Salts 
Provisional Peer-Reviewed Toxicity Values for Glycidaldehyde (CASRN 765-34-4)
Provisional Peer-Reviewed Toxicity Values for Guanidine Compounds (CASRN 113-00-8 Guanidine) (CASRN 506-93-4 Guanidine Nitrate) (CASRN 50-01-1 Guanidine Chloride)
Provisional Peer-Reviewed Toxicity Values for Guanidine Compounds (CASRN 113-00-8 Guanidine) (CASRN 506-93-4 Guanidine Nitrate) (CASRN 50-01-1 Guanidine Chloride)
Provisional Peer-Revie

Provisional Peer Reviewed Toxicity Values for Octadecanoic Acid (Stearic Acid) (CASRN 57-11-4)
Provisional Peer-Reviewed Toxicity Values for Octahydrotrimethylmethylethylphenanthrenol (CASRN 511-15-9)
Provisional Peer-Reviewed Toxicity Values for Di-n-octyl Phthalate (CASRN 117-84-0)
Provisional Peer-Reviewed Toxicity Values for Mixtures of 1,2,3,4,5-Pentabromo-6-Chlorocyclohexane (CASRN 87-84-3)
Provisional Peer Reviewed Toxicity Values for Pentachloroethane (CASRN 76-01-7)
Provisional Peer-Reviewed Toxicity Values for Pentaerythritol Tetranitrate (PETN) (CASRN 78-11-5)
Provisional Peer-Reviewed Toxicity Values for Pentamethyl dipropylenetriamine (CASRN 3855-32-1)
Provisional Peer-Reviewed Toxicity Values for Pentamethylphosphoramide (PMPA) (CASRN 10159-46-3) 
Provisional Peer-Reviewed Toxicity Values for n-Pentane (CASRN 109-66-0)
Provisional Peer-Reviewed Toxicity Values for Perfluorobutane Sulfonic Acid (CASRN 375-73-5) and Related Compound Potassium Perfluorobutane Sulfonate (CASR

Provisional Peer-Reviewed Toxicity Values for Toluene-2,5-diamine (CASRN 95-70-5) and Compounds Toluene-2,5-diamine sulfate (6369-59-1) [also known as 1,4-Benzenediamine-2-methyl sulfate or 2-Methylbenzene-1,4-diamine sulfate (615-50-9)], Toluene-2,5-diamine dihydrochloride (615-45-2), and Toluene-2,5-diamine monohydrochloride (74612-12-7)
Provisional Peer-Reviewed Toxicity Values for 2,3-Toluenediamine (CASRN 2687-25-4)
Provisional Peer Reviewed Toxicity Values for 2,6-Toluenediamine (CASRN 823-40-5)
Provisional Peer-Reviewed Toxicity Values for 3,4-Toluenediamine (CASRN 496-72-0)
Provisional Peer-Reviewed Subchronic Toxicity Values for Toluene (CASRN 108-88-3)
Provisional Peer-Reviewed Toxicity Values for p-Toluic Acid (CASRN 99-94-5)
Provisional Peer-Reviewed Toxicity Values for o-Toluidine (CASRN 95-53-4)
Provisional Peer-Reviewed Toxicity Values for p-Toluidine (CASRN 106-49-0)
Provisional Peer-Reviewed Toxicity Values for Complex Mixtures of Aliphatic and Aromatic Hydrocarbons (C

In [21]:
#Write out the finished clowder log now that all of the metadata has been filled in
#The practice run used a temporary sheet with non-standard column names, so map them to the standard headers first
standard_headers = {
    'file_name': 'File Name',
    'url': 'URL',
    'doi': 'DOI',
    'hero_id': 'HERO ID',
    'source': 'Source',
    'subsource': 'Subsource',
    'year': 'Year',
    'study_name': 'Study Name',
    'dsstox_structure_id': 'DSSTox Structure ID',
    'author': 'Author',
}
#Renamed in place, so `df` itself carries the standard headers after this cell
df.rename(columns=standard_headers, inplace=True)
out_path = "L:\\PRIV\\ToxValDB\\Upload to Clowder\\Clowder Logs\\clowder_log_source_pprtv_ornl_08032022_metadata_mmille16.xlsx"
#index=False keeps the dataframe's row index out of the spreadsheet
with pd.ExcelWriter(out_path) as writer:
    df.to_excel(writer, index=False)