In [87]:
from rdkit import Chem
import pandas as pd
import numpy as np
import rdkit.Chem.rdMolDescriptors as d
from sklearn.model_selection import train_test_split
from rdkit.Chem import AllChem
import rdkit.Chem.Fragments as f
from rdkit.Chem import Lipinski as l
import inspect


In [21]:
# Load the raw training and test tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("training_smiles.csv", "test_smiles.csv")
)

In [27]:
# Quick look at the training table: column names first, then the first rows.
print(train_df.columns, train_df.head(), sep="\n")

Index(['INDEX', 'SMILES', 'ACTIVE'], dtype='object')
   INDEX                                             SMILES  ACTIVE
0      1                        CC#CCCCC(=O)Nc1ccccc1C(=O)O     0.0
1      2  [O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2[nH]c[n+](...     0.0
2      3   CCOC(=O)CSc1nnc(NC(=O)c2cccc([N+](=O)[O-])c2C)s1     0.0
3      4  O=C(CN1CCN(S(=O)(=O)c2ccccc2)CC1)Nc1ccc(Cl)c(C...     0.0
4      5         Cc1cc(NN/C=C2\C=CC(=O)C=C2O)nc(N2CCOCC2)n1     0.0


In [23]:
# Sanity check: parse the SMILES of the first training row with RDKit
# and print how many atoms the resulting molecule has.
ex = train_df.iloc[0]
m_ex = Chem.MolFromSmiles(ex["SMILES"])
print(m_ex.GetNumAtoms())


18


In [106]:
# Collect (name, function) pairs for every Python-level function exposed by
# the Lipinski (l), rdMolDescriptors (d) and Fragments (f) modules.
lip_features, desc_features, frag_features = (
    inspect.getmembers(rdkit_module, inspect.isfunction)
    for rdkit_module in (l, d, f)
)


def add_features(dataset, n_bits=124, radius=2):
    """Build a feature table with one row per molecule in ``dataset``.

    For every row the SMILES string is parsed with RDKit and the following
    columns are produced, in this order:

    * ``INDEX``, ``ACTIVE`` (only if ``dataset`` has that column), ``SMILES``
      copied from the input row;
    * ``num_atoms`` and ``mol_wt`` (``GetNumAtoms`` / ``CalcExactMolWt``);
    * ``mf_0`` .. ``mf_{n_bits-1}``: bits of the Morgan fingerprint;
    * one column per function in the module-level ``frag_features`` and
      ``lip_features`` lists whose name does not start with ``_``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``INDEX`` and ``SMILES`` columns; ``ACTIVE`` is optional.
    n_bits : int, default 124
        Length of the Morgan fingerprint bit vector.
    radius : int, default 2
        Radius passed to the Morgan fingerprint generator.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the per-row records; ``dataset`` is not modified.

    Raises
    ------
    ValueError
        If RDKit cannot parse a row's SMILES string.
    """
    # Unlabelled data (e.g. a test split) may not carry the target column,
    # so the label is only copied when it is actually present.
    has_label = 'ACTIVE' in dataset.columns

    # Resolve the public descriptor functions once instead of re-filtering
    # the same lists for every molecule. Fragment functions come first so
    # the resulting column order matches the per-row insertion order.
    descriptor_funcs = [
        (name, func)
        for name, func in list(frag_features) + list(lip_features)
        if not name.startswith('_')
    ]

    records = []
    for pos in range(len(dataset)):
        # Read from the frame that was passed in, not from a notebook global,
        # so the function works for any split.
        row = dataset.iloc[pos, :]

        record = {'INDEX': row['INDEX']}
        if has_label:
            record['ACTIVE'] = row['ACTIVE']
        record['SMILES'] = row['SMILES']

        mol = Chem.MolFromSmiles(row['SMILES'])
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # with the offending row instead of an opaque AttributeError.
            raise ValueError(
                f"Could not parse SMILES {row['SMILES']!r} (INDEX {row['INDEX']})"
            )

        record['num_atoms'] = mol.GetNumAtoms()
        record['mol_wt'] = d.CalcExactMolWt(mol)

        # Morgan fingerprint, expanded into one column per bit.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        fp_bits = np.array(fp)
        for bit in range(n_bits):
            record['mf_' + str(bit)] = fp_bits[bit]

        # Fragment and Lipinski descriptors: apply every function to the molecule.
        for name, func in descriptor_funcs:
            record[name] = func(mol)

        records.append(record)

    return pd.DataFrame(records)

In [107]:
# For each training row, extract the molecular features with add_features.
# Note: this rebinds train_df to the frame returned by add_features, so the
# raw table loaded from CSV is no longer available under this name afterwards.
train_df = add_features(train_df)

In [105]:
# Print the training frame as it stands at this point in the notebook.
print(train_df)

INDEX  ACTIVE                                             SMILES  \
0      1     0.0                        CC#CCCCC(=O)Nc1ccccc1C(=O)O   
1      2     0.0  [O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2[nH]c[n+](...   
2      3     0.0   CCOC(=O)CSc1nnc(NC(=O)c2cccc([N+](=O)[O-])c2C)s1   

   num_atoms      mol_wt  mf_0  mf_1  mf_2  mf_3  mf_4  ...  \
0         18  245.105193     0     0     0     0     0  ...   
1         26  376.082599     0     0     1     1     0  ...   
2         25  382.040562     0     0     0     0     1  ...   

   NumAromaticHeterocycles  NumAromaticRings  NumHAcceptors  NumHDonors  \
0                        0                 1              2           2   
1                        1                 3              5           1   
2                        1                 2              9           1   

   NumHeteroatoms  NumRotatableBonds  NumSaturatedCarbocycles  \
0               4                  5                        0   
1               8                 

In [53]:
# Same featurization for the test set.
# Note: this rebinds test_df to the frame returned by add_features.
test_df = add_features(test_df)

In [52]:
# Print the test frame as it stands at this point in the notebook.
print(test_df)

INDEX                                             SMILES  num_atoms
0      121375  Cc1ccc(-c2csc(NC(=O)C3=NN(c4ccccc4)C(=O)CC3)n2...         28
1      121376  O=C(Nc1ccccc1)N1CC[C@@]2(CCCN(C(=O)c3cccc(F)c3...         28
2      121377        CC(=O)N1C(=O)N(C(C)=O)C2C1N(C)C(=O)N2C(C)=O         20
3      121378  CCOC(=O)Cn1/c(=N/C(=O)c2ccc([N+](=O)[O-])s2)sc...         26
4      121379  Cc1ccc(S(=O)(=O)N2CCC(C(=O)Nc3nnc(C45CC6CC(CC(...         34
...       ...                                                ...        ...
40453  161828            O=C(CSc1nnc(-c2ccncc2)o1)N1CCc2ccccc2C1         25
40454  161829    N=c1sccn1CC(=O)Nc1ccc(Cl)c(S(=O)(=O)N2CCOCC2)c1         26
40455  161830  CC(/C=C/c1ccc2c(c1)OCO2)=N\NC(=O)c1cccc([N+](=...         26
40456  161831        CC(C)Cn1c(=O)c(C(=O)Nc2cnccn2)c(O)c2ccccc21         25
40457  161832  O=C(CSc1nc2c(c(C(F)(F)F)n1)CCc1ccccc1-2)NCc1cc...         31

[40458 rows x 3 columns]


In [51]:
# Persist both frames as CSV files under the Datasets/ directory
# (the directory is not created here, so it has to exist already).
train_df.to_csv(path_or_buf="Datasets/train_complete.csv")
test_df.to_csv(path_or_buf="Datasets/test_complete.csv")