# Create Input Data

This script takes a variety of chemical identifiers for both insecticidal and non-insecticidal compounds, cleans these chemicals, converts them to SMILES, and then converts SMILES to graphs.

Written by Tobias D. Muellers

## Import Chemicals, Remove Non-Neutral Organics, Convert to SMILES 

In [3]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import time
from selenium import webdriver

In [4]:
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

In [5]:
# function to get content from https://www.pesticideinfo.org/
# based on https://realpython.com/beautiful-soup-web-scraper-python/
def pesticideinfo_get(PRI_start, PRI_end):
    """
    this function takes starting and ending ids for this database and constructs a range.
    Based on the range of PRIs, it extracts data from each PesticideInfo page. 
    PRI_start = integer
    PRI_end = integer
    """
    # webdriver workaround from https://stackoverflow.com/questions/76928765/attributeerror-str-object-has-no-attribute-capabilities-in-selenium
    cservice = webdriver.ChromeService(
        executable_path="C:/Users/tobia/OneDrive/Documents/GitHub/BugBuster/chromedriver-win64/chromedriver-win64/chromedriver.exe",
        chrome_options=options)
    driver = webdriver.Chrome(service = cservice)
    # driver code based on https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page
    
    # make range
    PRIs = np.arange(PRI_start, PRI_end, 1)

    # set up storage for a dataframe
    pris = []
    names = []
    casrns = []
    classes = []
    mws = []
    uses =[]

    for pri in PRIs:
        URL = "https://www.pesticideinfo.org/chemical/PRI"+str(pri)
        driver.get(URL)
        time.sleep(1) # wait for page to load
        page = driver.page_source
        soup = bs(page, "html.parser") # parse page
        table = soup.find_all("div", {"class": "data-table-key-value"}) #get values from table of interest
        
        if len(table) < 5:
            print(f'{pri} does not exist')
        else:
            # now extract desired information 
            pris.append(pri)
            name = str(table[0]).split('</div>')[1][5:]
            names.append(name)
            casrn = str(table[2]).split('</div>')[1][5:]
            casrns.append(casrn)
            chem_class = str(table[4]).split('</div>')[1][5:]
            classes.append(chem_class)
            mw = str(table[5]).split('</div>')[1][5:]
            mws.append(mw)
            use = str(table[6]).split('</div>')[1][5:]
            uses.append(use)

    data = {'name': names, 'pri': pris, 'casrn': casrns, 
            'class': classes, 'mw': mws, 'use': uses}
    df = pd.DataFrame(data)
    
    driver.quit()
    
    return df

In [None]:
extracted_10001_20001 = pesticideinfo_get(10001, 20001)
#extracted_1_10001 = pesticideinfo_get(1, 10001)

10001 does not exist
10002 does not exist
10003 does not exist
10004 does not exist
10005 does not exist
10006 does not exist
10007 does not exist
10008 does not exist
10009 does not exist
10010 does not exist
10011 does not exist
10012 does not exist
10013 does not exist
10014 does not exist
10015 does not exist
10016 does not exist
10017 does not exist
10018 does not exist
10019 does not exist
10020 does not exist
10021 does not exist
10022 does not exist
10023 does not exist
10024 does not exist
10025 does not exist
10026 does not exist
10027 does not exist
10028 does not exist
10029 does not exist
10030 does not exist
10031 does not exist
10032 does not exist
10033 does not exist
10034 does not exist
10035 does not exist
10036 does not exist
10037 does not exist
10038 does not exist
10039 does not exist
10040 does not exist
10041 does not exist
10042 does not exist
10043 does not exist
10044 does not exist
10045 does not exist
10046 does not exist
10047 does not exist
10048 does no

In [9]:
extracted_10001_20001.head(1)
#extracted_1_10001.tail(10)

NameError: name 'extracted_10001_20001' is not defined

In [87]:
#pest_info_db_all = pd.concat([extracted_1_1001, extracted_1001_2001], axis=0)

In [9]:
pest_info_db_all = extracted_1_10001.copy()

In [11]:
pest_info_db_all.to_csv("pest_info_db_1_10001.csv")