# Create Input Data

This script takes a variety of chemical identifiers for both insecticidal and non-insecticidal compounds, cleans the resulting list of chemicals, converts the identifiers to SMILES, and then converts the SMILES to graphs.

Written by Tobias D. Muellers

## Import Chemicals, Remove Non-Neutral Organics, Convert to SMILES 

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import time
from selenium import webdriver

In [2]:
from selenium.webdriver.chrome.options import Options
# Configure Chrome to run headless (no visible browser window).
options = Options()
options.add_argument("--headless")
# NOTE(review): pesticideinfo_get() creates and quits its own local driver;
# this module-level instance is not used by the visible code and is never
# quit here - confirm it is still needed.
driver = webdriver.Chrome(options=options)

In [3]:
# function to get content from https://www.pesticideinfo.org/
# based on https://realpython.com/beautiful-soup-web-scraper-python/
def _pesticideinfo_field(cell):
    """Return the value part of one 'data-table-key-value' element.

    Takes the markup that follows the first closing ``</div>`` and drops its
    first five characters (presumably the opening ``<div>`` tag of the value
    element - confirm against the page markup if the site changes).
    """
    return str(cell).split('</div>')[1][5:]


def pesticideinfo_get(
        PRI_start, PRI_end,
        chromedriver_path="C:/Users/tobia/OneDrive/Documents/GitHub/BugBuster/chromedriver-win64/chromedriver-win64/chromedriver.exe"):
    """
    Scrape chemical records from PesticideInfo for a range of PRI ids.

    Each page https://www.pesticideinfo.org/chemical/PRI<id> is loaded in
    Chrome, parsed with BeautifulSoup, and the name, CAS number, chemical
    class, molecular weight and use type are read from its key/value table.
    Pages whose table has too few entries are reported with a
    "<id> does not exist" message and skipped.

    PRI_start = integer, first id to fetch (inclusive)
    PRI_end = integer, end of the range (exclusive, as in np.arange)
    chromedriver_path = string, location of the chromedriver executable;
        defaults to the path originally hard-coded in this notebook

    Returns a pandas DataFrame with the columns
    'name', 'pri', 'casrn', 'class', 'mw', 'use'. All columns except 'pri'
    hold the text exactly as scraped.

    Relies on the module-level ``options`` object (headless Chrome settings).
    """
    # webdriver workaround from https://stackoverflow.com/questions/76928765/attributeerror-str-object-has-no-attribute-capabilities-in-selenium
    # The Service only locates the chromedriver binary; browser options must
    # be handed to webdriver.Chrome itself, otherwise --headless is ignored.
    cservice = webdriver.ChromeService(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=cservice, options=options)
    # driver code based on https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page

    # position of each wanted value within the page's key/value table
    field_positions = {'name': 0, 'casrn': 2, 'class': 4, 'mw': 5, 'use': 6}
    # the table must be long enough for the highest index read below
    min_cells = max(field_positions.values()) + 1

    # set up storage for a dataframe (key order = column order)
    data = {'name': [], 'pri': [], 'casrn': [],
            'class': [], 'mw': [], 'use': []}

    try:
        # make range (PRI_end itself is not visited)
        for pri in np.arange(PRI_start, PRI_end, 1):
            URL = "https://www.pesticideinfo.org/chemical/PRI" + str(pri)
            driver.get(URL)
            time.sleep(1)  # wait for page to load
            soup = bs(driver.page_source, "html.parser")  # parse page
            # get values from table of interest
            table = soup.find_all("div", {"class": "data-table-key-value"})

            if len(table) < min_cells:
                print(f'{pri} does not exist')
                continue

            # now extract desired information
            data['pri'].append(pri)
            for column, position in field_positions.items():
                data[column].append(_pesticideinfo_field(table[position]))
    finally:
        # always close the browser, even if a page fails mid-scrape
        driver.quit()

    return pd.DataFrame(data)

In [7]:
# Scrape PRI ids 1 through 10000; pesticideinfo_get builds its id range with
# np.arange, which excludes the end value (10001).
extracted_1_10001 = pesticideinfo_get(1, 10001)

1 does not exist
2 does not exist
3 does not exist
4 does not exist
5 does not exist
6 does not exist
7 does not exist
8 does not exist
9 does not exist
28 does not exist
51 does not exist
53 does not exist
64 does not exist
79 does not exist
84 does not exist
116 does not exist
117 does not exist
122 does not exist
149 does not exist
153 does not exist
168 does not exist
169 does not exist
170 does not exist
183 does not exist
201 does not exist
209 does not exist
210 does not exist
220 does not exist
223 does not exist
224 does not exist
235 does not exist
244 does not exist
271 does not exist
284 does not exist
289 does not exist
330 does not exist
386 does not exist
427 does not exist
463 does not exist
491 does not exist
504 does not exist
514 does not exist
528 does not exist
533 does not exist
547 does not exist
589 does not exist
598 does not exist
612 does not exist
613 does not exist
614 does not exist
623 does not exist
632 does not exist
638 does not exist
642 does not exis

In [8]:
# Show the last 10 rows of the scraped table.
extracted_1_10001.tail(10)

Unnamed: 0,name,pri,casrn,class,mw,use
6251,Ziram,6665,137-30-4,"Dithiocarbamate, Inorganic-Zinc",305.8,"Dog and Cat Repellent,Fungicide,Microbiocide"
6252,"Ziram, cyclohexylamine complex",6666,16509-79-8,"Dithiocarbamate, Inorganic-Zinc",0.0,"Dog and Cat Repellent,Fungicide"
6253,Zirconium acetate,6667,7585-20-8,Inorganic,0.0,Not Listed
6254,Zirconium neodecanoate,6668,39049-04-2,Inorganic,0.0,Not Listed
6255,Zirconyl ammonium carbonate,6669,32535-84-5,Not Listed,0.0,Not Listed
6256,Zn Versazate,6670,Not Listed,Not Listed,0.0,Wood Preservative
6257,Zoalene,6671,148-01-6,Not Listed,0.0,Not Listed
6258,Zoxamide,6672,156052-68-5,Not Listed,336.64,Fungicide
6259,Zucchini yellow mosaic virus coat protein as p...,6673,Not Listed,GE Crop,0.0,Viruscide
6260,Zulate,6675,Not Listed,Not Listed,0.0,Not Listed


In [87]:
#pest_info_db_all = pd.concat([extracted_1_1001, extracted_1001_2001], axis=0)

In [9]:
# Copy the scrape result under the name used for the full database.
pest_info_db_all = extracted_1_10001.copy()

In [11]:
# Save the database to CSV in the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, which shows up
# as an "Unnamed: 0" column when the file is read back - confirm downstream
# readers expect that before changing it.
pest_info_db_all.to_csv("pest_info_db_1_10001.csv")