# Create Input Data

This script takes a variety of chemical identifiers for both insecticidal and non-insecticidal compounds, cleans these chemicals, converts them to SMILES, and then converts SMILES to graphs.

Written by Tobias D. Muellers

## Import Chemicals, Remove Non-Neutral Organics, Convert to SMILES 

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import time
from selenium import webdriver

In [2]:
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

In [73]:
# function to get content from https://www.pesticideinfo.org/
# based on https://realpython.com/beautiful-soup-web-scraper-python/
def pesticideinfo_get(PRI_start, PRI_end):
    """
    this function takes starting and ending ids for this database and constructs a range.
    Based on the range of PRIs, it extracts data from each PesticideInfo page. 
    PRI_start = integer
    PRI_end = integer
    """
    # webdriver workaround from https://stackoverflow.com/questions/76928765/attributeerror-str-object-has-no-attribute-capabilities-in-selenium
    cservice = webdriver.ChromeService(
        executable_path="C:/Users/tobia/OneDrive/Documents/GitHub/BugBuster/chromedriver-win64/chromedriver-win64/chromedriver.exe",
        chrome_options=options)
    driver = webdriver.Chrome(service = cservice)
    # driver code based on https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page
    
    # make range
    PRIs = np.arange(PRI_start, PRI_end, 1)

    # set up storage for a dataframe
    pris = []
    names = []
    casrns = []
    classes = []
    mws = []
    uses =[]

    for pri in PRIs:
        URL = "https://www.pesticideinfo.org/chemical/PRI"+str(pri)
        driver.get(URL)
        time.sleep(1) # wait for page to load
        page = driver.page_source
        soup = bs(page, "html.parser") # parse page
        table = soup.find_all("div", {"class": "data-table-key-value"}) #get values from table of interest
        
        if len(table) < 5:
            print(f'{pri} does not exist')
        else:
            # now extract desired information 
            pris.append(pri)
            name = str(table[0]).split('</div>')[1][5:]
            names.append(name)
            casrn = str(table[2]).split('</div>')[1][5:]
            casrns.append(casrn)
            chem_class = str(table[4]).split('</div>')[1][5:]
            classes.append(chem_class)
            mw = str(table[5]).split('</div>')[1][5:]
            mws.append(mw)
            use = str(table[6]).split('</div>')[1][5:]
            uses.append(use)

    data = {'name': names, 'pri': pris, 'casrn': casrns, 
            'class': classes, 'mw': mws, 'use': uses}
    df = pd.DataFrame(data)
    
    driver.quit()
    
    return df

In [78]:
extracted_1_1001 = pesticideinfo_get(1, 1001)

1 does not exist
2 does not exist
3 does not exist
4 does not exist
5 does not exist
6 does not exist
7 does not exist
8 does not exist
9 does not exist
28 does not exist
51 does not exist
53 does not exist
64 does not exist
79 does not exist
84 does not exist
116 does not exist
117 does not exist
122 does not exist
149 does not exist
153 does not exist
168 does not exist
169 does not exist
170 does not exist
183 does not exist
201 does not exist
209 does not exist
210 does not exist
220 does not exist
223 does not exist
224 does not exist
235 does not exist
244 does not exist
271 does not exist
284 does not exist
289 does not exist
330 does not exist
386 does not exist
427 does not exist
463 does not exist
491 does not exist
504 does not exist
514 does not exist
528 does not exist
533 does not exist
547 does not exist
589 does not exist
598 does not exist
612 does not exist
613 does not exist
614 does not exist
623 does not exist
632 does not exist
638 does not exist
642 does not exis

In [79]:
extracted_1_1001.tail(10)

Unnamed: 0,name,pri,casrn,class,mw,use
913,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",991,92129-28-7,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
914,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",992,68989-02-6,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
915,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",993,"68568-47-8, 68989-02-6",Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
916,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",994,68989-02-6,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
917,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",995,68989-02-6,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
918,"Alkyl* dimethyl 3,4-dichlorobenzyl ammonium ch...",996,Not Listed,"Iodine Compound, Quaternary Ammonium Compound",0,Microbiocide
919,"Alkyl* dimethyl ammonium bromide *(50% C12, 30...",997,Not Listed,Quaternary Ammonium Compound,0,Microbiocide
920,"Alkyl* dimethyl ammonium bromide *(67% C16, 30...",998,Not Listed,Quaternary Ammonium Compound,0,Microbiocide
921,Alkyl* dimethyl benzyl ammonium bentonite *(as...,999,71011-24-0,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"
922,Alkyl* dimethyl benzyl ammonium chloride *(100...,1000,122-18-9,Quaternary Ammonium Compound,0,"Algaecide,Microbiocide"


In [86]:
extracted_1001_2001 = pesticideinfo_get(1001, 2001)

1099 does not exist
1101 does not exist
1107 does not exist
1108 does not exist
1112 does not exist
1114 does not exist
1115 does not exist
1118 does not exist
1119 does not exist
1120 does not exist
1122 does not exist
1124 does not exist
1133 does not exist
1135 does not exist
1136 does not exist
1138 does not exist
1140 does not exist
1142 does not exist
1145 does not exist
1187 does not exist
1188 does not exist
1189 does not exist
1208 does not exist
1239 does not exist
1240 does not exist
1250 does not exist
1300 does not exist
1316 does not exist
1342 does not exist
1345 does not exist
1350 does not exist
1351 does not exist
1377 does not exist
1378 does not exist
1381 does not exist
1384 does not exist
1395 does not exist
1396 does not exist
1408 does not exist
1409 does not exist
1411 does not exist
1412 does not exist
1416 does not exist
1420 does not exist
1443 does not exist
1444 does not exist
1451 does not exist
1452 does not exist
1453 does not exist
1454 does not exist


In [89]:
extracted_1001_2001.tail(10)

Unnamed: 0,name,pri,casrn,class,mw,use
892,Chlorflurazole,1990,3615-21-2,Benzimidazole,0.0,Not Listed
893,Chlorfluren,1991,24539-66-0,Not Listed,0.0,Not Listed
894,Chlorflurenol,1992,2464-37-1,Not Listed,274.7,Plant Growth Regulator
895,"Chlorflurenol, methyl ester",1993,2536-31-4,Not Listed,288.72,"Herbicide,Plant Growth Regulator,Pruning Aid"
896,Chlorhexidine,1994,55-56-1,Not Listed,505.48,Microbiocide
897,Chlorhexidine diacetate,1995,56-95-1,Not Listed,0.0,Microbiocide
898,Chlorhexidine digluconate,1996,18472-51-0,Not Listed,0.0,Microbiocide
899,Chlorhexidine dihydrochloride,1997,3697-42-5,Not Listed,0.0,Microbiocide
900,Chloric acid,1999,7790-93-4,Not Listed,0.0,Not Listed
901,Chloridazon,2000,1698-60-8,Pyridazinone,221.64,Herbicide


In [87]:

pest_info_db_all = pd.concat([extracted_1_1001, extracted_1001_2001], axis=0)

In [88]:
pest_info_db_all.tail()

Unnamed: 0,name,pri,casrn,class,mw,use
897,Chlorhexidine diacetate,1995,56-95-1,Not Listed,0.0,Microbiocide
898,Chlorhexidine digluconate,1996,18472-51-0,Not Listed,0.0,Microbiocide
899,Chlorhexidine dihydrochloride,1997,3697-42-5,Not Listed,0.0,Microbiocide
900,Chloric acid,1999,7790-93-4,Not Listed,0.0,Not Listed
901,Chloridazon,2000,1698-60-8,Pyridazinone,221.64,Herbicide


In [90]:
pest_info_db_all.to_csv("pest_info_db_all.csv")