# Create Input Data

This script takes a variety of chemical identifiers for both insecticidal and non-insecticidal compounds, cleans these chemicals, converts them to SMILES, and then converts SMILES to graphs.

Written by Tobias D. Muellers

## Load Dependencies

In [87]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import time
from selenium import webdriver
import cirpy
from tqdm import tqdm
import urllib.request
from urllib.error import HTTPError
from urllib.request import urlopen
import rdkit

# Scrape for Chemicals

In [106]:
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

In [107]:
# function to get content from https://www.pesticideinfo.org/
# based on https://realpython.com/beautiful-soup-web-scraper-python/
def pesticideinfo_get(PRI_start, PRI_end):
    """
    this function takes starting and ending ids for this database and constructs a range.
    Based on the range of PRIs, it extracts data from each PesticideInfo page. 
    PRI_start = integer
    PRI_end = integer
    """
    # webdriver workaround from https://stackoverflow.com/questions/76928765/attributeerror-str-object-has-no-attribute-capabilities-in-selenium
    cservice = webdriver.ChromeService(
        executable_path="C:/Users/tobia/OneDrive/Documents/GitHub/BugBuster/chromedriver-win64/chromedriver-win64/chromedriver.exe",
        chrome_options=options)
    driver = webdriver.Chrome(service = cservice)
    # driver code based on https://stackoverflow.com/questions/52687372/beautifulsoup-not-returning-complete-html-of-the-page
    
    # make range
    PRIs = np.arange(PRI_start, PRI_end, 1)

    # set up storage for a dataframe
    pris = []
    names = []
    casrns = []
    classes = []
    mws = []
    uses =[]

    for pri in PRIs:
        URL = "https://www.pesticideinfo.org/chemical/PRI"+str(pri)
        driver.get(URL)
        time.sleep(0.25) # wait for page to load
        page = driver.page_source
        soup = bs(page, "html.parser") # parse page
        table = soup.find_all("div", {"class": "data-table-key-value"}) #get values from table of interest
        
        if len(table) < 5:
            print(f'{pri} does not exist')
        else:
            # now extract desired information 
            pris.append(pri)
            name = str(table[0]).split('</div>')[1][5:]
            names.append(name)
            casrn = str(table[2]).split('</div>')[1][5:]
            casrns.append(casrn)
            chem_class = str(table[4]).split('</div>')[1][5:]
            classes.append(chem_class)
            mw = str(table[5]).split('</div>')[1][5:]
            mws.append(mw)
            use = str(table[6]).split('</div>')[1][5:]
            uses.append(use)

    data = {'name': names, 'pri': pris, 'casrn': casrns, 
            'class': classes, 'mw': mws, 'use': uses}
    df = pd.DataFrame(data)
    
    driver.quit()
    
    return df

In [115]:
#extracted_1_10001 = pesticideinfo_get(1, 10001)
#extracted_10001_20001 = pesticideinfo_get(10001, 20001)
extracted_20001_30001 = pesticideinfo_get(20001, 30001)
#extracted_1_10001.tail(10)
#extracted_10001_20001.head(1)
#pest_info_db_all = pd.concat([extracted_1_1001, extracted_1001_2001], axis=0)
#pest_info_db_all = extracted_1_10001.copy()


20001 does not exist
20002 does not exist
20003 does not exist
20004 does not exist
20005 does not exist
20006 does not exist
20007 does not exist
20008 does not exist
20009 does not exist
20010 does not exist
20011 does not exist
20012 does not exist
20013 does not exist
20014 does not exist
20015 does not exist
20016 does not exist
20017 does not exist
20018 does not exist
20019 does not exist
20020 does not exist
20021 does not exist
20022 does not exist
20023 does not exist
20024 does not exist
20025 does not exist
20026 does not exist
20027 does not exist
20028 does not exist
20029 does not exist
20030 does not exist
20031 does not exist
20032 does not exist
20033 does not exist
20034 does not exist
20035 does not exist
20036 does not exist
20037 does not exist
20038 does not exist
20039 does not exist
20040 does not exist
20041 does not exist
20042 does not exist
20043 does not exist
20044 does not exist
20045 does not exist
20046 does not exist
20047 does not exist
20048 does no

In [117]:
extracted_20001_30001.tail()
extracted_20001_30001.shape
extracted_20001_30001.to_csv("pest_info_db_20001_30001.csv")
#pest_info_db_all.to_csv("pest_info_db_1_10001.csv")

# Get SMILES

In [118]:
# load extracted data
raw_pestinfo_1_10001 = pd.read_csv('pest_info_db_1_10001.csv')
print(raw_pestinfo_1_10001.shape)
raw_pestinfo_10001_20001 = pd.read_csv('pest_info_db_10001_20001.csv')
print(raw_pestinfo_10001_20001.shape)
raw_pestinfo_20001_30001 = pd.read_csv('pest_info_db_20001_30001.csv')
print(raw_pestinfo_20001_30001.shape)
raw_pestinfo = pd.concat([raw_pestinfo_1_10001, raw_pestinfo_10001_20001, raw_pestinfo_20001_30001])
print(raw_pestinfo.shape)
raw_pestinfo.head(1)

(6261, 7)
(339, 7)
(1726, 7)
(8326, 7)


Unnamed: 0.1,Unnamed: 0,name,pri,casrn,class,mw,use
0,0,1-(3-chlorophthalimido)cyclohexanecarboxamide,10,51971-67-6,Carboxamide,0.0,Not Listed


In [133]:
# add smiles based on casrn
# use cirpy 
# add cactus as needed
def add_smiles(df, cas_col, name_col):
    temp_df = df.copy() # avoid overwrite
    casrns = temp_df[cas_col]
    names = temp_df[name_col]
    smiles_cirpy_cas_storage = []
    smiles_cirpy_name_storage = []

    for cas in tqdm(casrns):
        if cas == "Not Listed":
            smiles_cirpy_casrn = float('NaN')
        else:
            smiles_cirpy_casrn = cirpy.resolve(cas, 'smiles')
        smiles_cirpy_cas_storage.append(smiles_cirpy_casrn)

    for name in tqdm(names):
        smiles_cirpy_name = cirpy.resolve(name, 'smiles')
        smiles_cirpy_name_storage.append(smiles_cirpy_name)

    temp_df['smiles_cirpy_casrn'] = smiles_cirpy_cas_storage
    temp_df['smiles_cirpy_name'] = smiles_cirpy_name_storage

    return temp_df

In [135]:
#test = add_smiles(raw_pestinfo.head(10), 'casrn', 'name')
#test

In [136]:
smiles_pestinfo = add_smiles(raw_pestinfo, 'casrn', 'name')

100%|██████████| 6261/6261 [30:57<00:00,  3.37it/s]
100%|██████████| 6261/6261 [57:49<00:00,  1.80it/s]  


In [138]:
# save intermediate output
smiles_pestinfo.to_csv("pest_info_db_smiles.csv")

## Trim SMILES

In [2]:
# load smiles
smiles_pestinfo = pd.read_csv('pest_info_db_smiles.csv')
#smiles_pestinfo.head(10)

In [3]:
# remove no smiles
def merge_and_remove_smiles(df, cirpy_casrn_smiles, cirpy_name_smiles):
    temp = df.copy()
    idx = range(0, temp.shape[0])
    cirpy_casrn_smiles = temp[cirpy_casrn_smiles]
    cirpy_name_smiles = temp[cirpy_name_smiles]

    smiles_storage = []
    for i in idx:
        if pd.isnull(cirpy_casrn_smiles[i]) == False:
            smiles = cirpy_casrn_smiles[i]
        else:
            if pd.isnull(cirpy_name_smiles[i]) == False:
                smiles = cirpy_name_smiles[i]
            else:
                smiles = float('NaN')
        smiles_storage.append(smiles)

    temp['smiles'] = smiles_storage
    
    return temp

In [4]:
smiles_pestinfo_cleaned = merge_and_remove_smiles(smiles_pestinfo, "smiles_cirpy_casrn", "smiles_cirpy_name")

In [5]:
smiles_pestinfo_cleaned = smiles_pestinfo_cleaned.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'smiles_cirpy_casrn', 'smiles_cirpy_name'])

In [6]:
smiles_pestinfo_cleaned = smiles_pestinfo_cleaned[pd.isnull(smiles_pestinfo_cleaned['smiles']) == False]

In [11]:
smiles_pestinfo_cleaned.head(5)

Unnamed: 0,name,casrn,class,use,smiles
0,1-(3-chlorophthalimido)cyclohexanecarboxamide,51971-67-6,Carboxamide,Not Listed,NC(=O)C1(CCCCC1)N2C(=O)c3cccc(Cl)c3C2=O
2,"1-(6-Isopropyl-1,1,4-trimethyl-5-indanyl)-1-pr...",6682-77-5,Not Listed,Not Listed,CCC(=O)c1c(C)c2CCC(C)(C)c2cc1C(C)C
4,"1-(8-Methoxy-4,8-dimethylnonyl)-4-(1-methyleth...",53905-38-7,Not Listed,Not Listed,COC(C)(C)CCCC(C)CCCc1ccc(cc1)C(C)C
14,1-(Bromoacetoxy)-2-propanol,4189-47-3,Not Listed,Not Listed,CC(O)COC(=O)CBr
15,1-(Dodecylbenzyl)pyridinium chloride,30901-67-8,Not Listed,Not Listed,[Cl-].CCCCCCCCCCCCC(c1ccccc1)[n+]2ccccc2


In [10]:
# drop unneeded columns
smiles_pestinfo_cleaned = smiles_pestinfo_cleaned.drop(columns=['pri','mw'])

# Add other pre-made datasets

In [88]:
# PESTICIDES|NORMAN: Natural Product Insecticides
# A list of naturally occurring insecticides curated and provided to the NORMAN Suspect List Exchange 
# (https://www.norman-network.com/nds/SLE/) by Reza Aalizadeh (University of Athens). DOI: https://doi.org/10.5281/zenodo.3544742
norman_insecticides = pd.read_csv('Chemical List NPINSECT-2025-04-03.csv')
print(norman_insecticides.shape)
norman_insecticides = norman_insecticides.drop(columns=['DTXSID', 'INCHIKEY','INCHI STRING',
                                                       'MOLECULAR FORMULA', 'AVERAGE MASS', 'MONOISOTOPIC MASS',
                                                       'QC Level', '# ToxCast Active', 'Total Assays', '% ToxCast Active'])

norman_insecticides['use'] = 'Insecticide'
norman_insecticides.head(1)

(84, 14)


Unnamed: 0,PREFERRED NAME,CASRN,IUPAC NAME,SMILES,use
0,3'-Methoxyrocaglamide,189322-69-8,"(1R,2R,3S,3aR,8bS)-3a-(3,4-Dimethoxyphenyl)-1,...",COC1=CC2=C(C(OC)=C1)[C@]1(O)[C@H](O)[C@@H]([C@...,Insecticide


In [89]:
# CATEGORY: Pyrethroids
# A pyrethroid is an organic compound similar to the natural pyrethrins produced by the flowers of pyrethrums 
# (Chrysanthemum cinerariaefolium and C. coccineum). 
# Pyrethroids constitute the majority of commercial household insecticides.
pyrethroid_insecticides = pd.read_csv('Chemical List PYRETHROIDS-2025-04-18.csv')
print(pyrethroid_insecticides.shape)
pyrethroid_insecticides = pyrethroid_insecticides.drop(columns=['DTXSID', 'INCHIKEY','INCHI STRING',
                                                       'MOLECULAR FORMULA', 'AVERAGE MASS', 'MONOISOTOPIC MASS',
                                                       'QC Level', '# ToxCast Active', 'Total Assays', '% ToxCast Active'])

pyrethroid_insecticides['use'] = 'Insecticide'
pyrethroid_insecticides.head(1)

(25, 14)


Unnamed: 0,PREFERRED NAME,CASRN,IUPAC NAME,SMILES,use
0,Flucythrinate,70124-77-5,Cyano(3-phenoxyphenyl)methyl 2-[4-(difluoromet...,CC(C)C(C(=O)OC(C#N)C1=CC=CC(OC2=CC=CC=C2)=C1)C...,Insecticide


In [105]:
# ECHA Active substances
# A list of naturally occurring insecticides curated and provided to the NORMAN Suspect List Exchange 
# (https://www.norman-network.com/nds/SLE/) by Reza Aalizadeh (University of Athens). DOI: https://doi.org/10.5281/zenodo.3544742
# key from ECOTOX INDEX VALUE CATALOG, global 2000
# AC-Acarizide; AL-Algicide; BA-Bactericide; BI-Biocide; FU-Fungicide;
# HB-Herbicide; IN-Insekticide; MO-Molluscicide; MI-Miticide; NE-Nematicide; 
# PA-Plant activator; PG-Plant growth regulator; RE-Repellent; RO-Rodenticide; VI-Viricide
# also from https://doi.org/10.3390/agrochemicals2010008
ECHA_actives = pd.read_csv("ActiveSubstanceExport_16-04-2025.csv")
print(ECHA_actives.shape)
keep_ECHA_columns = ['Substance', 'CAS Number','Functions']
ECHA_actives = ECHA_actives[keep_ECHA_columns]
ECHA_actives = ECHA_actives.rename(columns={'Substance' : 'PREFERRED NAME', 'CAS Number':'CASRN','Functions':'use'})
# remove NaN
ECHA_actives = ECHA_actives.dropna(subset=['use'])
print(ECHA_actives.shape)
ECHA_actives['use'] = ECHA_actives['use'].replace('IN', 'Insecticide')
# take only insecticides for now
ECHA_actives = ECHA_actives[ECHA_actives['use'].str.contains('Insecticide')]
print(ECHA_actives.shape)
ECHA_actives.head(5)

(1465, 33)
(946, 3)
(160, 3)


Unnamed: 0,PREFERRED NAME,CASRN,use
109,8-methyl-2-decanol propionate,No CAS allocated,Insecticide
124,Adoxophyes orana GV strain BV-0001,No CAS allocated,Insecticide
130,Alanycarb,83130-01-2,Insecticide
175,Aramite,140-57-8,Insecticide
215,Bacillus thuringiensis strain RTI545,No CAS allocated,Insecticide


# Filters

## Filter for neutral organics

In [23]:
# drop inorganic
print(smiles_pestinfo_cleaned.shape)
organics = smiles_pestinfo_cleaned[smiles_pestinfo_cleaned['class'] != 'Inorganic']
print(organics.shape)

(3822, 5)
(3558, 5)


In [24]:
# drop inorganics based on classes
#organics["class"].unique() # see all classes
print(organics.shape)
organics = pd.DataFrame(organics[~organics['class'].str.contains('Inorganic')]) # this removes where class is inorganic
print(organics.shape)

(3558, 5)
(3388, 5)


In [25]:
from rdkit import Chem
from rdkit.Chem import SaltRemover

# remove salts, https://www.rdkit.org/docs/source/rdkit.Chem.SaltRemover.html
def no_salts(df, smiles_col):
    temp = df.copy()
    smiles = temp[smiles_col]
    cleaned = []
    for smi in smiles:

        # for chlorine
        ions = ['Cl-', 'Br-', 'NH4+', 'Na+', 'Ca+', 'F-', 'Li+', 'K+', 'Mg++', 'Ca++']
        if any(x in smi for x in ions): # https://stackoverflow.com/questions/3389574/check-if-multiple-strings-exist-in-another-string
            remover = SaltRemover.SaltRemover(defnData="[Cl-]", defnFormat='smiles')
            res = remover.StripMol(Chem.MolFromSmiles(smi), dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Br-]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[NH4+]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Na+]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Ca+]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[F-]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Li+]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[K+]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Mg++]", defnFormat='smiles')
            res = remover.StripMol(res, dontRemoveEverything=True)
            remover = SaltRemover.SaltRemover(defnData="[Ca++]", defnFormat='smiles')
            res = Chem.MolToSmiles(remover.StripMol(res, dontRemoveEverything=True))
            cleaned.append(res)
        else:
            cleaned.append(smi)
                
    temp['no_salt'] = cleaned
    return temp

In [30]:
print(organics.shape)
desalted = no_salts(organics, 'smiles')
print(desalted.shape)
#test = organics[organics['smiles'].str.contains('\\[' + 'NH4' + '\\+' + '\\]')]
#test = no_salts(test, 'smiles')
#test.head(10)

(3388, 5)




(3388, 6)




In [50]:
#desalted.tail(10)

In [51]:
# neutralize molecules? https://www.rdkit.org/docs/Cookbook.html

In [31]:
# rough filter for now
# remove all not bonded things
import re
print(desalted.shape)
desalted_neut_org = desalted[~desalted['no_salt'].str.contains('\\.')]
print(desalted_neut_org.shape)

(3388, 6)
(3006, 6)


In [33]:
desalted_neut_org = desalted_neut_org.drop(columns=['smiles'])
desalted_neut_org.head(5)

Unnamed: 0,name,casrn,class,use,no_salt
0,1-(3-chlorophthalimido)cyclohexanecarboxamide,51971-67-6,Carboxamide,Not Listed,NC(=O)C1(CCCCC1)N2C(=O)c3cccc(Cl)c3C2=O
2,"1-(6-Isopropyl-1,1,4-trimethyl-5-indanyl)-1-pr...",6682-77-5,Not Listed,Not Listed,CCC(=O)c1c(C)c2CCC(C)(C)c2cc1C(C)C
4,"1-(8-Methoxy-4,8-dimethylnonyl)-4-(1-methyleth...",53905-38-7,Not Listed,Not Listed,COC(C)(C)CCCC(C)CCCc1ccc(cc1)C(C)C
14,1-(Bromoacetoxy)-2-propanol,4189-47-3,Not Listed,Not Listed,CC(O)COC(=O)CBr
15,1-(Dodecylbenzyl)pyridinium chloride,30901-67-8,Not Listed,Not Listed,CCCCCCCCCCCCC(c1ccccc1)[n+]1ccccc1


# Dataset Merge

In [52]:
merge_pyr = pyrethroid_insecticides.drop(columns=['IUPAC NAME'])
print(merge_pyr.shape)
merge_pyr.head(1)

(25, 4)


Unnamed: 0,PREFERRED NAME,CASRN,SMILES,use
0,Flucythrinate,70124-77-5,CC(C)C(C(=O)OC(C#N)C1=CC=CC(OC2=CC=CC=C2)=C1)C...,Insecticide


In [53]:
merge_norm = norman_insecticides.drop(columns=['IUPAC NAME'])
print(merge_norm.shape)
merge_norm.head(1)

(84, 4)


Unnamed: 0,PREFERRED NAME,CASRN,SMILES,use
0,3'-Methoxyrocaglamide,189322-69-8,COC1=CC2=C(C(OC)=C1)[C@]1(O)[C@H](O)[C@@H]([C@...,Insecticide


In [54]:
merge_pestdb = desalted_neut_org.drop(columns=['class'])
merge_pestdb = merge_pestdb.rename(columns={'name' : 'PREFERRED NAME', 'casrn':'CASRN','no_salt':'SMILES'})
merge_pestdb = merge_pestdb.iloc[:, [0,1,3,2]]
print(merge_pestdb.shape)
merge_pestdb.head(1)

(3006, 4)


Unnamed: 0,PREFERRED NAME,CASRN,SMILES,use
0,1-(3-chlorophthalimido)cyclohexanecarboxamide,51971-67-6,NC(=O)C1(CCCCC1)N2C(=O)c3cccc(Cl)c3C2=O,Not Listed


In [56]:
merged_all = pd.concat([merge_pestdb, merge_pyr, merge_norm])
print(merged_all.shape)
merged_all = merged_all.drop_duplicates(subset=['CASRN'])
print(merged_all.shape)

(3115, 4)
(2989, 4)


# Use labels

In [57]:
all_uses = merged_all.copy()

In [65]:
# iterative removal, because this fails the later graph creation at the moment
print(all_uses.shape)
Sn = all_uses['SMILES'].str.contains('Sn')
no_failures = all_uses[~Sn]
print(no_failures.shape)
Hg = no_failures['SMILES'].str.contains('Hg')
no_failures = no_failures[~Hg]
print(no_failures.shape)
Al = no_failures['SMILES'].str.contains('Al')
no_failures = no_failures[~Al]
print(no_failures.shape)
Bi = no_failures['SMILES'].str.contains('Bi')
no_failures = no_failures[~Bi]
print(no_failures.shape)
Pb = no_failures['SMILES'].str.contains('Pb')
no_failures = no_failures[~Pb]
print(no_failures.shape)

(2989, 4)
(2962, 4)
(2929, 4)
(2928, 4)
(2926, 4)
(2925, 4)


In [66]:
pesticides = no_failures[no_failures['use'].str.contains('|'.join(['Insecticide','Fungicide','Microbiocide',
                                                             'Herbicide','Algaecide','Nematicide','Rodenticide',
                                                            'Molluscicide','Avicide', 'Piscicide']))]
print(pesticides.shape)
insecticides = pesticides[pesticides['use'].str.contains('Insecticide')]
print(insecticides.shape)
#nonactives = 

(1673, 4)
(580, 4)


# Data Augmentation

In [274]:
#insecticides.head(10)

In [67]:
# idea 1
def aliphatic_string(df, col):
    '''
    For each carbon string of 3 or more, create a new molecule with one less carbon and one with one more
    '''
    df_temp = df.copy().reset_index()

    augmented_addC = []
    #augmented_minusC = []
    
    for index, row in df_temp.iterrows():
        smi = df_temp[col][index]
        if 'CCC' in smi:
            original = df_temp.iloc[index,:] 
            
            # get the first CCC.. string
            # make a longer version
            plus_C = smi.replace('CCC', 'CCCC', 1) # only do one replacement
            #print(plus_C) # for testing
            plus_C_row = original
            #plus_C_row[0] = 'aug_addC'
            #plus_C_row[1] = 'aug_addC'
            #plus_C_row[2] = 'aug_addC'
            #plus_C_row[3] = 'aug_addC'
            #plus_C_row[5] = 'aug_addC'
            plus_C_row[8] = plus_C
            augmented_addC.append(plus_C_row)
            
            #print(smi) # for testing
            # make a shorter version
            #minus_C = smi.replace('CCC', 'CC', 1)
            #print(minus_C) # for testing
            #minus_C_row = original
            #minus_C_row[0] = 'aug_minusC'
            #minus_C_row[1] = 'aug_minusC'
            #minus_C_row[2] = 'aug_minusC'
            #minus_C_row[3] = 'aug_minusC'
            #minus_C_row[5] = 'aug_minusC'
            #minus_C_row[8] = minus_C
            #augmented_minusC.append(minus_C_row)
            
    addC = pd.DataFrame(augmented_addC)
    #minusC = pd.DataFrame(augmented_minusC)

    #out_df = pd.concat([addC, minusC], axis=0, ignore_index=True, verify_integrity=True)
    #print(out_df.shape[0])
    out_df = pd.concat([df_temp, addC], ignore_index=True)
    
    return out_df

In [69]:
print(pesticides.shape)
print(insecticides.shape)
aug_insecticides = aliphatic_string(insecticides, 'SMILES')
aug_pesticides = aliphatic_string(pesticides, 'SMILES')
print(aug_pesticides.shape)
print(aug_insecticides.shape)

(1673, 4)
(580, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plus_C_row[8] = plus_C


IndexError: index 8 is out of bounds for axis 0 with size 5

In [277]:
aug_insecticides.head(2)

Unnamed: 0,index,name,pri,casrn,class,mw,use,smiles,no_salt
0,35,1-Chloro-2-nitrobenzene,46,88-73-3,Unclassified,157.56,Insecticide,[O-][N+](=O)c1ccccc1Cl,[O-][N+](=O)c1ccccc1Cl
1,38,1-Chloro-3-nitrobenzene,49,121-73-3,Unclassified,157.56,Insecticide,[O-][N+](=O)c1cccc(Cl)c1,[O-][N+](=O)c1cccc(Cl)c1


# Predict Log P

In [278]:
from rdkit.Chem.QED import properties

In [279]:
def alogp(df, col):
    '''
    calculates logP using QED ALogP
    https://www.rdkit.org/docs/source/rdkit.Chem.QED.html
    ''' 
    logp_storage = []

    smiles = df[col]
    
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        properties = rdkit.Chem.QED.properties(mol)
        logp = properties[1]
        logp_storage.append(logp)

    df['alogp'] = logp_storage

    return df

In [282]:
insecticides_logp = alogp(aug_insecticides, 'no_salt')
pesticides_logp = alogp(aug_pesticides, 'no_salt')

In [283]:
print(insecticides_logp.shape)
print(pesticides_logp.shape)

(575, 10)
(2015, 10)


# SMILES to Graphs
from https://www.blopig.com/blog/2022/02/how-to-turn-a-smiles-string-into-a-molecular-graph-for-pytorch-geometric/

In [2]:
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
# Pytorch and Pytorch Geometric
import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader

### Atom featurization

In [285]:
def one_hot_encoding(x, permitted_list):
    """
    Maps input elements x which are not in the permitted list to the last element
    of the permitted list.
    """
    if x not in permitted_list:
        x = permitted_list[-1]
    binary_encoding = [int(boolean_value) for boolean_value in list(map(lambda s: x == s, permitted_list))]
    return binary_encoding

In [286]:
def get_atom_features(atom, 
                      use_chirality = True, 
                      hydrogens_implicit = True):
    """
    Takes an RDKit atom object as input and gives a 1d-numpy array of atom features as output.
    """
    # define list of permitted atoms
    
    permitted_list_of_atoms =  ['C','N','O','S','F','Si','P','Cl','Br','Mg','Na','Ca','Fe','As','Al','I', 'B','V','K','Tl','Yb','Sb','Sn','Ag','Pd','Co','Se','Ti','Zn', 'Li','Ge','Cu','Au','Ni','Cd','In','Mn','Zr','Cr','Pt','Hg','Pb','Unknown']
    
    if hydrogens_implicit == False:
        permitted_list_of_atoms = ['H'] + permitted_list_of_atoms
    
    # compute atom features
    
    atom_type_enc = one_hot_encoding(str(atom.GetSymbol()), permitted_list_of_atoms)
    
    n_heavy_neighbors_enc = one_hot_encoding(int(atom.GetDegree()), [0, 1, 2, 3, 4, "MoreThanFour"])
    
    formal_charge_enc = one_hot_encoding(int(atom.GetFormalCharge()), [-3, -2, -1, 0, 1, 2, 3, "Extreme"])
    
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2", "OTHER"])
    
    is_in_a_ring_enc = [int(atom.IsInRing())]
    
    is_aromatic_enc = [int(atom.GetIsAromatic())]
    
    atomic_mass_scaled = [float((atom.GetMass() - 10.812)/116.092)]
    
    vdw_radius_scaled = [float((Chem.GetPeriodicTable().GetRvdw(atom.GetAtomicNum()) - 1.5)/0.6)]
    
    covalent_radius_scaled = [float((Chem.GetPeriodicTable().GetRcovalent(atom.GetAtomicNum()) - 0.64)/0.76)]
    atom_feature_vector = atom_type_enc + n_heavy_neighbors_enc + formal_charge_enc + hybridisation_type_enc + is_in_a_ring_enc + is_aromatic_enc + atomic_mass_scaled + vdw_radius_scaled + covalent_radius_scaled
                                    
    if use_chirality == True:
        chirality_type_enc = one_hot_encoding(str(atom.GetChiralTag()), ["CHI_UNSPECIFIED", "CHI_TETRAHEDRAL_CW", "CHI_TETRAHEDRAL_CCW", "CHI_OTHER"])
        atom_feature_vector += chirality_type_enc
    
    if hydrogens_implicit == True:
        n_hydrogens_enc = one_hot_encoding(int(atom.GetTotalNumHs()), [0, 1, 2, 3, 4, "MoreThanFour"])
        atom_feature_vector += n_hydrogens_enc
    return np.array(atom_feature_vector)

### Bond featurization

In [287]:
def get_bond_features(bond, 
                      use_stereochemistry = True):
    """
    Takes an RDKit bond object as input and gives a 1d-numpy array of bond features as output.
    """
    permitted_list_of_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    bond_type_enc = one_hot_encoding(bond.GetBondType(), permitted_list_of_bond_types)
    
    bond_is_conj_enc = [int(bond.GetIsConjugated())]
    
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    
    bond_feature_vector = bond_type_enc + bond_is_conj_enc + bond_is_in_ring_enc
    
    if use_stereochemistry == True:
        stereo_type_enc = one_hot_encoding(str(bond.GetStereo()), ["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
        bond_feature_vector += stereo_type_enc
    return np.array(bond_feature_vector)

### Function to integrate and create dataset

In [288]:
def create_pytorch_geometric_graph_data_list_from_smiles_and_labels(x_smiles, y):
    """
    Inputs:
    
    x_smiles = [smiles_1, smiles_2, ....] ... a list of SMILES strings
    y = [y_1, y_2, ...] ... a list of numerial labels for the SMILES strings (such as associated pKi values)
    
    Outputs:
    
    data_list = [G_1, G_2, ...] ... a list of torch_geometric.data.Data objects which represent labeled molecular graphs that can readily be used for machine learning
    
    """
    
    data_list = []
    
    for (smiles, y_val) in zip(x_smiles, y):
        
        # convert SMILES to RDKit mol object
        mol = Chem.MolFromSmiles(smiles)
        # get feature dimensions
        n_nodes = mol.GetNumAtoms()
        n_edges = 2*mol.GetNumBonds()
        unrelated_smiles = "O=O"
        unrelated_mol = Chem.MolFromSmiles(unrelated_smiles)
        n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
        n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0,1)))
        # construct node feature matrix X of shape (n_nodes, n_node_features)
        X = np.zeros((n_nodes, n_node_features))
        for atom in mol.GetAtoms():
            X[atom.GetIdx(), :] = get_atom_features(atom)
            
        X = torch.tensor(X, dtype = torch.float)
        
        # construct edge index array E of shape (2, n_edges)
        (rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))
        torch_rows = torch.from_numpy(rows.astype(np.int64)).to(torch.long)
        torch_cols = torch.from_numpy(cols.astype(np.int64)).to(torch.long)
        E = torch.stack([torch_rows, torch_cols], dim = 0)
        
        # construct edge feature array EF of shape (n_edges, n_edge_features)
        EF = np.zeros((n_edges, n_edge_features))
        
        for (k, (i,j)) in enumerate(zip(rows, cols)):
            
            EF[k] = get_bond_features(mol.GetBondBetweenAtoms(int(i),int(j)))
        
        EF = torch.tensor(EF, dtype = torch.float)
        
        # construct label tensor
        y_tensor = torch.tensor(np.array([y_val]), dtype = torch.float)
        
        # construct Pytorch Geometric data object and append to data list
        data_list.append(Data(x = X, edge_index = E, edge_attr = EF, y = y_tensor))
    return data_list

### Create datasets

In [289]:
pesticides_graph = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(pesticides_logp['no_salt'], pesticides_logp['alogp'])
insecticides_graph = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(insecticides_logp['no_salt'], insecticides_logp['alogp'])

In [290]:
torch.save(pesticides_graph, "pesticides_graphs.pt")
torch.save(insecticides_graph, "insecticides_graphs.pt")