# EPA IRIS Chemicals Webscrape (from IRIS Assessment)

### Preliminaries

In [1]:
# importing relevant libraries

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def get_html_contents(page_url): # define beautiful soup function (downloads html from given url)
    results = requests.get(page_url) # access url location
    soup = BeautifulSoup(results.text,'html.parser') # parse through html of url and store page info
    return soup

In [3]:
iris_url = 'https://iris.epa.gov/AtoZ/?list_type=alpha' # define desired url

iris_contents = get_html_contents(iris_url) # store html contents in BS4 object

In [4]:
print(iris_contents.prettify()[:10000]) # take a sneak peak at first 10000 characters of html contents

<!DOCTYPE html>
<!--[if IEMobile 7]><html class="iem7 no-js" lang="en" dir="ltr"><![endif]-->
<!--[if lt IE 7]><html class="lt-ie9 lt-ie8 lt-ie7 no-js" lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8 no-js" lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="lt-ie9 no-js" lang="en" dir="ltr"><![endif]-->
<!--[if (gt IE 8)|(gt IEMobile 7)]><!-->
<html class="no-js not-oldie" dir="ltr" lang="en" version="HTML+RDFa 1.1" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:og="http://ogp.me/ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:sioc="http://rdfs.org/sioc/ns#" xmlns:sioct="http://rdfs.org/sioc/types#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">
 <head>
  <meta charset="utf-8"/>
  <meta content="true" name="HandheldFriendly"/>
  <link href="https://www.epa.gov/sites/all/themes/epa/fa

In [5]:
iris_table = iris_contents.find('table', {'id': 'alphalist'}) # find table with id = alphalist
iris_header = iris_table.tr.find_all('th') # create html table headers list

column_names = [] # create empty list for column names

for header in iris_header:
    column_names.append(header.text.strip()) # collects column names as list

print(column_names)

['', 'Chemical Name', 'CASRN', 'Last Significant Revision*', 'IRIS Summary', 'Tox Review/Supporting Document', 'Literature Screening Review', 'Critical Effect Systems', 'Tumor Site', 'Pesticide', 'Archive']


In [6]:
table_rows = iris_table.find_all('tr') # list of all table rows (list of BS4 objects)
chem_links = []


for row in table_rows: # grab link for each chemical in EPA IRIS Assessment list list
    cells = row.find_all('td')
    if len(row.find_all('a')) > 1:
           chem_links.append('https://iris.epa.gov' + str(row.find_all('a')[0]['href']))

In [7]:
iris_pages = [] # create empty list for storing html contents of each chemical's page (list of BS4 objects)

for link in chem_links: # loop through chemical list and grab url contents for each
    iris_url = link # define current chemical url
    iris_pages.append(get_html_contents(iris_url))

### Initializing the chemical dataframe:

We now create an empty dataframe and append the rows of the table of https://iris.epa.gov/AtoZ/?list_type=alpha.

In [8]:
chem_df = pd.DataFrame(columns=column_names) # create empty df for storing initial chemical info

for i, row in enumerate(table_rows[1:]): # grab data from each cell and append to chemical df
    
    cells = row.find_all('td')
    
    if len(cells) > 1:
        cell_data = [ele.text.replace('\n', '').strip() for ele in cells if ele.get('class') != 'sorting_1']
#         cell_data = [ele.text.replace('\n', '').replace(',', '_').strip() for ele in cells if ele.get('class') != 'sorting_1']
#         chem_df.loc[len(chem_df)] = [iris_pages[i].find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text] + cell_data[1:]
        chem_df.loc[len(chem_df)] = cell_data

In [9]:
chem_df.head() # take a peak

Unnamed: 0,Unnamed: 1,Chemical Name,CASRN,Last Significant Revision*,IRIS Summary,Tox Review/Supporting Document,Literature Screening Review,Critical Effect Systems,Tumor Site,Pesticide,Archive
0,1,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,
1,2,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,
2,3,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive
3,4,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,
4,5,Acetochlor,34256-82-1,19930901 Sep-01-1993,"(PDF 14 pp, 126 K)",Not Available,Yes,Hematologic ...,,pesticide,


In [10]:
chem_df = chem_df.drop(columns=[''], axis=1) # chemical numbering not necessary
chem_df.head()

Unnamed: 0,Chemical Name,CASRN,Last Significant Revision*,IRIS Summary,Tox Review/Supporting Document,Literature Screening Review,Critical Effect Systems,Tumor Site,Pesticide,Archive
0,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,
1,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,
2,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive
3,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,
4,Acetochlor,34256-82-1,19930901 Sep-01-1993,"(PDF 14 pp, 126 K)",Not Available,Yes,Hematologic ...,,pesticide,


In [11]:
chem_df.columns = chem_df.columns.str.upper().tolist()
chem_df.head()

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE
0,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,
1,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,
2,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive
3,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,
4,Acetochlor,34256-82-1,19930901 Sep-01-1993,"(PDF 14 pp, 126 K)",Not Available,Yes,Hematologic ...,,pesticide,


In [12]:
chem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 571 entries, 0 to 570
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   CHEMICAL NAME                   571 non-null    object
 1   CASRN                           571 non-null    object
 2   LAST SIGNIFICANT REVISION*      571 non-null    object
 3   IRIS SUMMARY                    571 non-null    object
 4   TOX REVIEW/SUPPORTING DOCUMENT  571 non-null    object
 5   LITERATURE SCREENING REVIEW     571 non-null    object
 6   CRITICAL EFFECT SYSTEMS         571 non-null    object
 7   TUMOR SITE                      571 non-null    object
 8   PESTICIDE                       571 non-null    object
 9   ARCHIVE                         571 non-null    object
dtypes: object(10)
memory usage: 49.1+ KB


In [13]:
chem_df[chem_df['CHEMICAL NAME'] == 'Arsenic, Inorganic']

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE
42,"Arsenic, Inorganic",7440-38-2,19950601 Jun-01-1995,"(PDF 29 pp, 186 K)",Not Available ...,Yes,Cardiovascular ...,Dermal ...,,


In [14]:
chem_df.to_csv('epa_iris_first_df.csv', index=False)

In [15]:
print('Number of chemicals: ' + str(len(chem_df)))
print('Number of links: ' + str(len(chem_links)))
print('Number of pages: ' + str(len(iris_pages)))

Number of chemicals: 571
Number of links: 571
Number of pages: 571


In [16]:
print(iris_pages[0].title)
print(iris_pages[570].title)

<title>
Acenaphthene CASRN 83-32-9 |IRIS|US EPA, ORD 
</title>
<title>
Zineb CASRN 12122-67-7 |IRIS|US EPA, ORD 
</title>


## Noncancer Assessments

There are two types of noncancer assessments in the IRIS assessments: oral exposure and inhalation exposure. Oral exposure corresponds to Reference Dose for Oral Exposure (RfD) measured in mg/kg-day, and inhalation exposure corresponds to Reference Concentration for Inhalation Exposure (RfC) measued in mg/m$^3$. For noncancer assessments, we classify which type (oral or inhalation) with its corresponding data for RfD or RfC.

In [17]:
noncancer_headers = ['CHEMICAL NAME', 'NONCANCER ASSESSMENT TYPE', 
                     'SYSTEM (RfD)', 'RfD (mg/kg-day)', 
                     'BASIS (RfD)', 'PoD (RfD)', 
                     'COMPOSITE UF (RfD)', 'CONFIDENCE (RfD)',
                     'SYSTEM (RfC)', 'RfC (mg/m^3)', 
                     'Basis (RfC)', 'PoD (RfC)', 
                     'COMPOSITE UF (RfC)', 'CONFIDENCE (RfC)']

noncancer_df = pd.DataFrame(columns=noncancer_headers)
noncancer_df

Unnamed: 0,CHEMICAL NAME,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC)


In [18]:
noncancer_data = []
no_noncancer_assessments = []
no_health_assessments = []

for i, page in enumerate(iris_pages):
    
    try:
        
        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        current_page_divs = page.find_all('div', {'class': 'multi box'})
        noncancer_tables = current_page_divs[0].find_all('table')

        if len(noncancer_tables) == 1 and noncancer_tables[0].find_all('th')[1].text == 'RfD (mg/kg-day)':

            exposure_type = 'Oral'
            rfd_table = current_page_divs[0].find_all('table')[0]
            rfd_rows = rfd_table.find_all('tr')
            for row in rfd_rows:
                cells = row.find_all('td')
                if len(cells) > 1:
                    cell_data = ([chemical_name, exposure_type] 
                                 + [ele.text.replace('\n', ' ').replace('\t', '').strip().replace(',', ';') 
                                    for ele in cells] 
                                 + 6*[np.nan])
                    noncancer_data.append(cell_data)

        elif len(noncancer_tables) == 1 and noncancer_tables[0].find_all('th')[1].text == 'RfC (mg/m3)':

            exposure_type = 'Inhalation'
            rfc_table = current_page_divs[0].find_all('table')[0]
            rfc_rows = rfc_table.find_all('tr')
            for row in rfc_rows:
                cells = row.find_all('td')
                if len(cells) > 1:
                    
                    cell_data = ([chemical_name, exposure_type] 
                                 + 6*[np.nan] 
                                 + [ele.text.replace('\n', ' ').replace('\t', '').strip().replace(',', ';') 
                                    for ele in cells])
                    
                    noncancer_data.append(cell_data)

        elif len(noncancer_tables) == 2:

            exposure_type = 'Oral'
            rfd_table = current_page_divs[0].find_all('table')[0]
            rfd_rows = rfd_table.find_all('tr')
            
            for row in rfd_rows:
                cells = row.find_all('td')
                
                if len(cells) > 1:
                    
                    cell_data = ([chemical_name, exposure_type] 
                                 + [ele.text.replace('\n', ' ').replace('\t', '').strip().replace(',', ';') 
                                    for ele in cells] + 6*[np.nan])
                    
                    noncancer_data.append(cell_data)

            exposure_type = 'Inahlation'
            rfc_table = current_page_divs[0].find_all('table')[1]
            rfc_rows = rfc_table.find_all('tr')
            
            for row in rfc_rows:
                cells = row.find_all('td')
                
                if len(cells) > 1:
                    
                    cell_data = ([chemical_name, exposure_type] 
                                 + 6*[np.nan] 
                                 + [ele.text.replace('\n', ' ').replace('\t', '').strip().replace(',', ';') 
                                    for ele in cells])
                    
                    noncancer_data.append(cell_data)

        else:
            
            noncancer_data.append([chemical_name] + (len(noncancer_df.columns) - 1)*[np.nan])
            no_noncancer_assessments.append(chemical_name)
            
    except:

        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        noncancer_data.append([chemical_name] + (len(noncancer_df.columns) - 1)*[np.nan])
        no_noncancer_assessments.append(chemical_name)
        no_health_assessments.append(chemical_name)


print('No noncancer health assessments for the following chemicals: \n')
print(no_noncancer_assessments)
print('\nNo health assessments at all for the following chemicals: \n')
print(no_health_assessments)
        
for i in range(len(noncancer_data)):
    noncancer_df.loc[len(noncancer_df)] = noncancer_data[i]

No noncancer health assessments for the following chemicals: 

['Acenaphthylene', 'Acetyl chloride', 'Adiponitrile', '4-Aminopyridine', 'Ammonium acetate', 'Ammonium methacrylate', 'ortho-Anisidine', 'Aramite', 'Aroclor 1248', 'Asbestos', 'Azobenzene', 'Barium cyanide', 'Benz[a]anthracene', 'Benzo[b]fluoranthene', 'Benzo[g,h,i]perylene', 'Benzo[k]fluoranthene', 'Benzotrichloride', 'Benzyl chloride', 'Bis(2-chloroethoxy)methane', 'Bis(chloroethyl)ether (BCEE)', 'Bis(chloromethyl)ether (BCME)', 'Brominated dibenzofurans', 'Bromochloromethane', 'p-Bromodiphenyl ether', 'Bromotrichloromethane', 't-Butylchloride', 'Cacodylic acid', 'Carbonyl sulfide', '1-Chlorobutane', '2-Chlorobutane', 'Chlorocyclopentadiene', 'Chloromethyl methyl ether (CMME)', 'p-Chlorophenyl methyl sulfide', 'p-Chlorophenyl methyl sulfone', 'p-Chlorophenyl methyl sulfoxide', 'Chrysene', 'Coke oven emissions', 'Copper', 'Creosote', 'Crotonaldehyde', 'Cyanazine', '2,4-Diaminotoluene', 'Diazomethane', 'Dibenz[a,h]anthracen

In [19]:
noncancer_df.head()

Unnamed: 0,CHEMICAL NAME,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC)
0,Acenaphthene,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,
1,Acenaphthylene,,,,,,,,,,,,,
2,Acephate,Oral,Nervous,4 x 10 -3,Inhibition of brain ChE,LEL : 1.2 x 10-1 mg/kg-day,30.0,High,,,,,,
3,Acetaldehyde,Inhalation,,,,,,,Nervous; Respiratory,9 x 10 -3,Degeneration of olfactory epithelium,NOAEL (HEC): 8.7 mg/m3,1000.0,Low
4,Acetochlor,Oral,Nervous; Reproductive; Hepatic; Urinary; Hemat...,2 x 10 -2,Salivation; increased ALT and ornithine carbam...,NOAEL : 2 mg/kg-day,100.0,High,,,,,,


In [20]:
noncancer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 637
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   CHEMICAL NAME              638 non-null    object
 1   NONCANCER ASSESSMENT TYPE  486 non-null    object
 2   SYSTEM (RfD)               381 non-null    object
 3   RfD (mg/kg-day)            381 non-null    object
 4   BASIS (RfD)                381 non-null    object
 5   PoD (RfD)                  381 non-null    object
 6   COMPOSITE UF (RfD)         381 non-null    object
 7   CONFIDENCE (RfD)           381 non-null    object
 8   SYSTEM (RfC)               105 non-null    object
 9   RfC (mg/m^3)               105 non-null    object
 10  Basis (RfC)                105 non-null    object
 11  PoD (RfC)                  105 non-null    object
 12  COMPOSITE UF (RfC)         105 non-null    object
 13  CONFIDENCE (RfC)           105 non-null    object
dtypes: object(

In [21]:
print(len(set(noncancer_df['CHEMICAL NAME'])))

571


In [22]:
noncancer_df[noncancer_df['CHEMICAL NAME'] == 'Arsenic, Inorganic']

Unnamed: 0,CHEMICAL NAME,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC)
45,"Arsenic, Inorganic",Oral,Cardiovascular; Dermal,3 x 10 -4,Hyperpigmentation; keratosis and possible vasc...,NOAEL : 8 x 10-4 mg/kg-day,3,Medium,,,,,,


## Cancer Assessments

### WOE Characterization

Weight of Evidence for Cancer (WOE)

In [23]:
woe_headers = ['CHEMICAL NAME', 'WOE CHARACTERIZATION', 'FRAMEWORK FOR WOE CHARACTERIZATION', 'WOE BASIS']

woe_df = pd.DataFrame(columns=woe_headers)
woe_df.head()

Unnamed: 0,CHEMICAL NAME,WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS


In [24]:
woe_list = []
missing_woe = []
no_health_assessments = []

for page in iris_pages:
    
    try:

        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        current_page_divs = page.find_all('div', {'class': 'multi box'})
        woe_table = current_page_divs[1].find_all('table')

        if len(woe_table) > 0:

            woe_rows = woe_table[0].find_all('tr')
            woe_basis = current_page_divs[1].findChildren('li')

            for row in woe_rows:

                cells = row.find_all('td')

                if len(cells) > 1:

                    woe_basis_str = ''

                    for basis in woe_basis:
                        woe_basis_str += basis.text + '\n'

                    cell_data = [chemical_name] + [ele.text.strip() for ele in cells] + [woe_basis_str]
                    woe_list.append(cell_data)

        else:

            woe_list.append([chemical_name] + (len(woe_df.columns) - 1)*[np.nan])
            missing_woe.append(chemical_name)
    
    except:
        
        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        woe_list.append([chemical_name] + (len(woe_df.columns) - 1)*[np.nan])
        missing_woe.append(chemical_name)
        no_health_assessments.append(chemical_name)
        
for i in range(len(woe_list)):
    woe_df.loc[len(woe_df)] = woe_list[i]

In [25]:
woe_df.loc[len(woe_df)] = woe_list[0]
woe_df.head()

Unnamed: 0,CHEMICAL NAME,WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS
0,Acenaphthene,,,
1,Acenaphthylene,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Based on no human data and inadequate data fro...
2,Acephate,C (Possible human carcinogen),Guidelines for Carcinogen Risk Assessment (U.S...,The classification is based on increased incid...
3,Acetaldehyde,B2 (Probable human carcinogen - based on suffi...,Guidelines for Carcinogen Risk Assessment (U.S...,Based on increased incidence of nasal tumors i...
4,Acetochlor,,,


In [26]:
woe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 607 entries, 0 to 606
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   CHEMICAL NAME                       607 non-null    object
 1   WOE CHARACTERIZATION                307 non-null    object
 2   FRAMEWORK FOR WOE CHARACTERIZATION  307 non-null    object
 3   WOE BASIS                           307 non-null    object
dtypes: object(4)
memory usage: 23.7+ KB


In [27]:
print(len(set(woe_df['CHEMICAL NAME'])))

571


In [28]:
woe_df[woe_df.duplicated('CHEMICAL NAME', keep=False)]

Unnamed: 0,CHEMICAL NAME,WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS
0,Acenaphthene,,,
6,Acetonitrile,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Under the Proposed Guidelines for Carcinogen R...
7,Acetonitrile,Carcinogenic potential cannot be determined,Proposed Guidelines for Carcinogen Risk Assess...,Under the Proposed Guidelines for Carcinogen R...
51,Barium and Compounds,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Under EPA's 1986 Guidelines for Carcinogen Ris...
52,Barium and Compounds,Carcinogenic potential cannot be determined\n ...,Proposed Guidelines for Carcinogen Risk Assess...,Under EPA's 1986 Guidelines for Carcinogen Ris...
...,...,...,...,...
599,Zinc and Compounds,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Under the Guidelines for Carcinogen Risk Asses...
600,Zinc and Compounds,,Guidelines for Carcinogen Risk Assessment (U.S...,Under the Guidelines for Carcinogen Risk Asses...
601,Zinc and Compounds,Data are inadequate for an assessment of human...,Revised Draft Guidelines for Carcinogen Risk A...,Under the Guidelines for Carcinogen Risk Asses...
602,Zinc and Compounds,Inadequate information to assess carcinogenic ...,Guidelines for Carcinogen Risk Assessment (U.S...,Under the Guidelines for Carcinogen Risk Asses...


### Quantitative Estimate of Carcinogenic Risk from Oral and Inhalation Exposures

In [29]:
oral_inhalation_headers = ['CHEMICAL NAME', 'QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE', 'QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE']

cancer_oral_inhalation_df = pd.DataFrame(columns=oral_inhalation_headers)
cancer_oral_inhalation_df.head()

Unnamed: 0,CHEMICAL NAME,QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE,QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE


In [30]:
oral_inhalation_list = []
missing_oral = []
missing_inhalation = []
no_health_assessments = []

for page in iris_pages:
    
    try:
        
        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        current_page_divs = page.find_all('div', {'class': 'multi box'})

        oral_inhalation_assessments = current_page_divs[1].findChildren('div', {'class': ''})

        if (len(oral_inhalation_assessments) == 1 
                and ('Oral Slope Factor' in oral_inhalation_assessments[0].text)):

            oral_inhalation_list.append([chemical_name, oral_inhalation_assessments[0].p.text.strip(), np.nan])
            missing_inhalation.append(chemical_name)

        elif (len(oral_inhalation_assessments) == 1 
                and ('Inhalation Unit Risk' in oral_inhalation_assessments[0].text)):

            oral_inhalation_list.append([chemical_name, np.nan, oral_inhalation_assessments[0].p.text.strip()])
            missing_oral.append(chemical_name)

        elif len(oral_inhalation_assessments) == 2:

            oral_inhalation_list.append([chemical_name, oral_inhalation_assessments[0].p.text.strip(), 
                                         oral_inhalation_assessments[1].p.text.strip()])

        else:

            oral_inhalation_list.append([chemical_name, np.nan, np.nan])
            missing_oral.append(chemical_name)
            missing_inhalation.append(chemical_name)
        
    except:
        
        chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
        oral_inhalation_list.append([chemical_name, np.nan, np.nan])
        no_health_assessments.append(chemical_name)

print('No oral assessments for the following chemicals: \n')
print(missing_oral)
print('\nNo inhalation assessments for the following chemicals: \n')
print(missing_inhalation)
print('\nNo health assessments for the following chemicals: \n')
print(no_health_assessments)

for i in range(len(oral_inhalation_list)):
    cancer_oral_inhalation_df.loc[len(cancer_oral_inhalation_df)] = oral_inhalation_list[i]

No oral assessments for the following chemicals: 

['Acenaphthene', 'Acenaphthylene', 'Acetaldehyde', 'Acetochlor', 'Acetone', 'Acetonitrile', 'Acetophenone', 'Acetyl chloride', 'Acifluorfen, sodium', 'Acrolein', 'Acrylic acid', 'Adiponitrile', 'Alachlor', 'Alar', 'Aldicarb', 'Aldicarb sulfone', 'Ally', 'Allyl alcohol', 'Allyl chloride', 'Aluminum phosphide', 'Amdro', 'Ametryn', '4-Aminopyridine', 'Amitraz', 'Ammonia', 'Ammonium acetate', 'Ammonium methacrylate', 'Ammonium sulfamate', 'ortho-Anisidine', 'Anthracene', 'Antimony', 'Antimony trioxide', 'Apollo', 'Aroclor 1016', 'Aroclor 1248', 'Aroclor 1254', 'Arsine', 'Asbestos', 'Assure', 'Asulam', 'Atrazine', 'Avermectin B1', 'Barium and Compounds', 'Barium cyanide', 'Baygon', 'Bayleton', 'Baythroid', 'Benefin', 'Benomyl', 'Bentazon (Basagran)', 'Benz[a]anthracene', 'Benzaldehyde', 'Benzene', 'Benzo[b]fluoranthene', 'Benzo[g,h,i]perylene', 'Benzo[k]fluoranthene', 'Benzoic acid', 'Beryllium and compounds', 'Bidrin', 'Biphenthrin', 'Bis(

In [31]:
cancer_oral_inhalation_df.head()

Unnamed: 0,CHEMICAL NAME,QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE,QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE
0,Acenaphthene,,
1,Acenaphthylene,,
2,Acephate,Oral Slope Factor:\n 8.7\n ...,
3,Acetaldehyde,,Inhalation Unit Risk:\n 2.2\n ...
4,Acetochlor,,


In [32]:
cancer_oral_inhalation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 571 entries, 0 to 570
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   CHEMICAL NAME                                       571 non-null    object
 1   QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE        79 non-null     object
 2   QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE  57 non-null     object
dtypes: object(3)
memory usage: 17.8+ KB


In [33]:
oral_slope_list = []
inhalation_unit_list = []

for i, page in enumerate(iris_pages):
    
    chemical_name = page.find('div', {'class': 'node node-page clearfix view-mode-full'}).h1.text
    
    if 'Oral Slope Factor' in str(page):
        oral_slope_list.append(chemical_name)
    
    if 'Inhalation Unit Risk' in str(page):
        inhalation_unit_list.append(chemical_name)

print('The phrase, \"Oral Slope Factor\" appears in {} chemicals.\n'.format(len(oral_slope_list)))
print('The chemicals for which \"Oral Slope Factor\" appears are given below: \n')
print(oral_slope_list)
print('\n\nThe phrase \"Inhalation Unit Risk\" appears in {} chemicals.\n'.format(len(inhalation_unit_list)))
print('The chemicals for which \"Inhalation Unit Risk\" appears are given below: \n')
print(inhalation_unit_list)

The phrase, "Oral Slope Factor" appears in 83 chemicals.

The chemicals for which "Oral Slope Factor" appears are given below: 

['Acephate', 'Acrylamide', 'Acrylonitrile', 'Aldrin', 'Aniline', 'Aramite', 'Arsenic, Inorganic', 'Azobenzene', 'Benzene', 'Benzidine', 'Benzo[a]pyrene (BaP)', 'Benzotrichloride', 'Benzyl chloride', 'Biphenyl', 'Bis(chloroethyl)ether (BCEE)', 'Bis(chloromethyl)ether (BCME)', 'Bromate', 'Bromodichloromethane', 'Bromoform', 'tert-Butyl Alcohol (tBA)', 'Carbon tetrachloride', 'Chlordane (Technical)', 'Chlordecone (Kepone)', "2,2',3,3',4,4',5,5',6,6'-Decabromodiphenyl ether (BDE-209)", 'Dibromochloromethane', '1,2-Dibromoethane', 'Dichloroacetic acid', "3,3'-Dichlorobenzidine", "p,p'-Dichlorodiphenyl dichloroethane (DDD)", "p,p'-Dichlorodiphenyldichloroethylene (DDE)", "p,p'-Dichlorodiphenyltrichloroethane (DDT)", '1,2-Dichloroethane', 'Dichloromethane', '1,3-Dichloropropene', 'Dichlorvos', 'Dieldrin', 'Di(2-ethylhexyl)adipate', 'Di (2-ethylhexyl)phthalate (DEHP)

## Merging the Above Dataframes

In [34]:
pd.options.display.max_columns = None

In [35]:
chem_df = chem_df.merge(noncancer_df, how='left', left_on='CHEMICAL NAME', right_on='CHEMICAL NAME', suffixes=('', '_noncancer'))
chem_df.head()

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC)
0,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,
1,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,,,,,,,,,,,,,,
2,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive,Oral,Nervous,4 x 10 -3,Inhibition of brain ChE,LEL : 1.2 x 10-1 mg/kg-day,30.0,High,,,,,,
3,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,,Inhalation,,,,,,,Nervous; Respiratory,9 x 10 -3,Degeneration of olfactory epithelium,NOAEL (HEC): 8.7 mg/m3,1000.0,Low
4,Acetochlor,34256-82-1,19930901 Sep-01-1993,"(PDF 14 pp, 126 K)",Not Available,Yes,Hematologic ...,,pesticide,,Oral,Nervous; Reproductive; Hepatic; Urinary; Hemat...,2 x 10 -2,Salivation; increased ALT and ornithine carbam...,NOAEL : 2 mg/kg-day,100.0,High,,,,,,


In [36]:
chem_df = chem_df.merge(woe_df, how='left', left_on='CHEMICAL NAME', right_on='CHEMICAL NAME', suffixes=('', '_woe'))
chem_df.head()

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC),WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS
0,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,,,,
1,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,,,,
2,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,,,,,,,,,,,,,,,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Based on no human data and inadequate data fro...
3,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive,Oral,Nervous,4 x 10 -3,Inhibition of brain ChE,LEL : 1.2 x 10-1 mg/kg-day,30.0,High,,,,,,,C (Possible human carcinogen),Guidelines for Carcinogen Risk Assessment (U.S...,The classification is based on increased incid...
4,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,,Inhalation,,,,,,,Nervous; Respiratory,9 x 10 -3,Degeneration of olfactory epithelium,NOAEL (HEC): 8.7 mg/m3,1000.0,Low,B2 (Probable human carcinogen - based on suffi...,Guidelines for Carcinogen Risk Assessment (U.S...,Based on increased incidence of nasal tumors i...


In [37]:
print(len(set(chem_df['CHEMICAL NAME'])))

571


In [38]:
chem_df = chem_df.merge(cancer_oral_inhalation_df, how='left', left_on='CHEMICAL NAME', right_on='CHEMICAL NAME', suffixes=('', '_oral_inhalation'))
chem_df.head()

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC),WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS,QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE,QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE
0,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,,,,,,
1,Acenaphthene,83-32-9,19901101 Nov-01-1990,"(PDF 7 pp, 89 K)",Not Available,No,Hepatic,,,,Oral,Hepatic,6 x 10 -2,Hepatotoxicity,NOAEL : 1.75 x 102 mg/kg-day,3000.0,Low,,,,,,,,,,,
2,Acenaphthylene,208-96-8,19910101 Jan-01-1991,"(PDF 6 pp, 82 K)",Not Available,No,,,,,,,,,,,,,,,,,,D (Not classifiable as to human carcinogenicity),Guidelines for Carcinogen Risk Assessment (U.S...,Based on no human data and inadequate data fro...,,
3,Acephate,30560-19-1,19890501 May-01-1989,"(PDF 13 pp, 114 K)",Not Available,Yes,Nervous,Hepatic,pesticide,archive,Oral,Nervous,4 x 10 -3,Inhibition of brain ChE,LEL : 1.2 x 10-1 mg/kg-day,30.0,High,,,,,,,C (Possible human carcinogen),Guidelines for Carcinogen Risk Assessment (U.S...,The classification is based on increased incid...,Oral Slope Factor:\n 8.7\n ...,
4,Acetaldehyde,75-07-0,19911001 Oct-01-1991,"(PDF 20 pp, 147 K)",Not Available,No,Nervous ...,Respiratory,,,Inhalation,,,,,,,Nervous; Respiratory,9 x 10 -3,Degeneration of olfactory epithelium,NOAEL (HEC): 8.7 mg/m3,1000.0,Low,B2 (Probable human carcinogen - based on suffi...,Guidelines for Carcinogen Risk Assessment (U.S...,Based on increased incidence of nasal tumors i...,,Inhalation Unit Risk:\n 2.2\n ...


In [39]:
chem_df[chem_df['CHEMICAL NAME'] == 'Arsenic, Inorganic']

Unnamed: 0,CHEMICAL NAME,CASRN,LAST SIGNIFICANT REVISION*,IRIS SUMMARY,TOX REVIEW/SUPPORTING DOCUMENT,LITERATURE SCREENING REVIEW,CRITICAL EFFECT SYSTEMS,TUMOR SITE,PESTICIDE,ARCHIVE,NONCANCER ASSESSMENT TYPE,SYSTEM (RfD),RfD (mg/kg-day),BASIS (RfD),PoD (RfD),COMPOSITE UF (RfD),CONFIDENCE (RfD),SYSTEM (RfC),RfC (mg/m^3),Basis (RfC),PoD (RfC),COMPOSITE UF (RfC),CONFIDENCE (RfC),WOE CHARACTERIZATION,FRAMEWORK FOR WOE CHARACTERIZATION,WOE BASIS,QUANT. EST. OF CARC. RISK FROM ORAL EXPOSURE,QUANT. EST. OF CARC. RISK FROM INHALATION EXPOSURE
47,"Arsenic, Inorganic",7440-38-2,19950601 Jun-01-1995,"(PDF 29 pp, 186 K)",Not Available ...,Yes,Cardiovascular ...,Dermal ...,,,Oral,Cardiovascular; Dermal,3 x 10 -4,Hyperpigmentation; keratosis and possible vasc...,NOAEL : 8 x 10-4 mg/kg-day,3,Medium,,,,,,,A (Human carcinogen),Guidelines for Carcinogen Risk Assessment (U.S...,Based on sufficient evidence from human data. ...,Oral Slope Factor:\n 1.5\n ...,Inhalation Unit Risk:\n 4.3\n ...


In [40]:
print('Length of chem_df: ' + str(len(chem_df)))
print('Total unique chemicals in chem_df: ' + str(len(chem_df['CHEMICAL NAME'].unique())))

Length of chem_df: 693
Total unique chemicals in chem_df: 571


In [41]:
from datetime import date

today = str(date.today())

chem_df.to_csv('EPA_IRIS_Assessments_2021_dirty-' + today + '.csv', index=False)