In [42]:
import inspect
import os
import sys

import numpy as np
import pandas as pd
import rdkit.Chem.Fragments as f
import rdkit.Chem.rdMolDescriptors as d
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Lipinski as l
from sklearn.model_selection import train_test_split

# Raise numpy's summarisation threshold to the largest possible value so that
# printed arrays (e.g. the full list of feature column names) are never
# abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)


In [35]:
# Load the raw training and test tables from the working directory.
start_train_df, start_test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("training_smiles.csv", "test_smiles.csv")
)

In [36]:
# Quick look at the raw training table: column names first, then the first rows.
print(start_train_df.columns, start_train_df.head(), sep='\n')

Index(['INDEX', 'SMILES', 'ACTIVE'], dtype='object')
   INDEX                                             SMILES  ACTIVE
0      1                        CC#CCCCC(=O)Nc1ccccc1C(=O)O     0.0
1      2  [O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2[nH]c[n+](...     0.0
2      3   CCOC(=O)CSc1nnc(NC(=O)c2cccc([N+](=O)[O-])c2C)s1     0.0
3      4  O=C(CN1CCN(S(=O)(=O)c2ccccc2)CC1)Nc1ccc(Cl)c(C...     0.0
4      5         Cc1cc(NN/C=C2\C=CC(=O)C=C2O)nc(N2CCOCC2)n1     0.0


In [37]:
# Sanity check: parse the first training molecule with RDKit and report
# how many atoms the parsed molecule contains.
ex = start_train_df.iloc[0]
m_ex = Chem.MolFromSmiles(ex.loc['SMILES'])

print(m_ex.GetNumAtoms())


18


In [45]:
# (name, function) pairs for every plain Python function found in the RDKit
# Fragments (f) and Lipinski (l) modules. add_features() applies each one whose
# name does not start with an underscore to every molecule.
# (rdMolDescriptors, imported as d, is deliberately not enumerated here.)
frag_features = inspect.getmembers(f, predicate=inspect.isfunction)
lip_features = inspect.getmembers(l, predicate=inspect.isfunction)


def add_features(dataset, type='train', n_bits=124, radius=2):
    """Build a per-molecule feature table from a frame of SMILES strings.

    Each row of ``dataset`` is parsed with RDKit and turned into one record.
    The resulting columns are, in order:

    * ``INDEX``, ``ACTIVE`` (left out when ``type == 'test'``) and ``SMILES``,
      copied from the input row;
    * ``num_atoms`` (``GetNumAtoms``) and ``mol_wt`` (``CalcExactMolWt``);
    * ``mf_0`` .. ``mf_<n_bits - 1>``: the bits of a Morgan fingerprint;
    * one column per public function in the module-level ``frag_features``
      list, named after the function;
    * one ``lp_<name>`` column per public function in the module-level
      ``lip_features`` list;
    * raw substring counts taken on the SMILES text (``C``, ``B``, ``S``,
      ``P``, ``N``, ``I``, ``O``, ``Cl``, ``Br``, ``OH``) and
      ``smile_length``.

    The substring counts are plain, case-sensitive ``str.count`` results, so
    for example ``C`` also counts the "C" of every "Cl" and lowercase
    characters are not counted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide ``INDEX`` and ``SMILES`` columns, plus ``ACTIVE`` unless
        ``type == 'test'``.
    type : str, default 'train'
        Pass ``'test'`` for unlabeled data; any other value copies ``ACTIVE``.
    n_bits : int, default 124
        Length of the Morgan fingerprint bit vector.
    radius : int, default 2
        Radius of the Morgan fingerprint.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in the same order.

    Raises
    ------
    ValueError
        If a SMILES value is not a string or RDKit cannot parse it.

    Notes
    -----
    Prints the row position every 1000 rows as a progress indicator.
    """
    include_label = type != 'test'

    # Loop-invariant: keep only the public (non-underscore) descriptor
    # functions and precompute their output column names once.
    public_frag = [(name, func) for name, func in frag_features
                   if not name.startswith('_')]
    public_lip = [('lp_' + str(name), func) for name, func in lip_features
                  if not name.startswith('_')]

    single_chars = ['C', 'B', 'S', 'P', 'N', 'I', 'O']
    multi_chars = ['Cl', 'Br', 'OH']

    records = []
    for i in range(len(dataset)):
        row = dataset.iloc[i]
        smiles = row['SMILES']

        # MolFromSmiles returns None for strings it cannot parse; fail with a
        # message that identifies the offending row instead of an opaque
        # AttributeError further down.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            raise ValueError(
                "Row {} (INDEX={!r}): cannot parse SMILES {!r}".format(
                    i, row['INDEX'], smiles))

        record = {'INDEX': row['INDEX']}
        if include_label:
            record['ACTIVE'] = row['ACTIVE']
        record['SMILES'] = smiles

        record['num_atoms'] = mol.GetNumAtoms()
        record['mol_wt'] = d.CalcExactMolWt(mol)

        # Morgan fingerprint, one column per bit.
        bits = np.array(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
        for k in range(n_bits):
            record['mf_' + str(k)] = bits[k]

        # Fragment descriptors keep the function's own name as column name.
        for name, func in public_frag:
            record[name] = func(mol)

        # Lipinski descriptors are stored under their 'lp_'-prefixed name.
        for column, func in public_lip:
            record[column] = func(mol)

        # Raw text statistics of the SMILES string.
        for token in single_chars:
            record[token] = smiles.count(token)
        for token in multi_chars:
            record[token] = smiles.count(token)
        record['smile_length'] = len(smiles)

        records.append(record)
        if i % 1000 == 0:
            print(i)

    return pd.DataFrame(records)

In [47]:
# Compute the feature table for the training set: one output row per input
# molecule. add_features() prints the row position every 1000 rows.
train_df = add_features(start_train_df)
assert len(train_df) == len(start_train_df), "expected one feature row per input molecule"

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000


In [48]:
# Show the (truncated) training feature table followed by the complete list of
# its column names.
print(train_df, np.array(train_df.columns), sep='\n')

INDEX  ACTIVE                                             SMILES  \
0            1     0.0                        CC#CCCCC(=O)Nc1ccccc1C(=O)O   
1            2     0.0  [O-][Cl+3]([O-])([O-])[O-].c1ccc(-c2[nH]c[n+](...   
2            3     0.0   CCOC(=O)CSc1nnc(NC(=O)c2cccc([N+](=O)[O-])c2C)s1   
3            4     0.0  O=C(CN1CCN(S(=O)(=O)c2ccccc2)CC1)Nc1ccc(Cl)c(C...   
4            5     0.0         Cc1cc(NN/C=C2\C=CC(=O)C=C2O)nc(N2CCOCC2)n1   
...        ...     ...                                                ...   
121369  121370     0.0        O=C(NCc1cccs1)C1CCCN(S(=O)(=O)c2cnc[nH]2)C1   
121370  121371     0.0                         COc1ccc(Cn2nc(C)cc2C)cc1OC   
121371  121372     0.0  Cc1ccc(-c2nn(-c3cc(Cl)ccc3[N+](=O)[O-])c(=O)c3...   
121372  121373     0.0        O=C(OCCN1C(=O)c2ccccc2C1=O)c1cccc(OC(F)F)c1   
121373  121374     0.0  Cc1cc(C)c(C)c(S(=O)(=O)N2CCC(C(=O)Nc3ccccc3N3C...   

        num_atoms      mol_wt  mf_0  mf_1  mf_2  mf_3  mf_4  ...  B  S  P  N  \
0   

In [49]:
# Compute the same feature table for the test set. type='test' leaves out the
# ACTIVE column, which is not copied for test data.
test_df = add_features(start_test_df, type='test')
assert len(test_df) == len(start_test_df), "expected one feature row per input molecule"

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000


In [50]:
# Show the test feature table (pandas abbreviates long frames when printing).
print(test_df)

INDEX                                             SMILES  num_atoms  \
0      121375  Cc1ccc(-c2csc(NC(=O)C3=NN(c4ccccc4)C(=O)CC3)n2...         28   
1      121376  O=C(Nc1ccccc1)N1CC[C@@]2(CCCN(C(=O)c3cccc(F)c3...         28   
2      121377        CC(=O)N1C(=O)N(C(C)=O)C2C1N(C)C(=O)N2C(C)=O         20   
3      121378  CCOC(=O)Cn1/c(=N/C(=O)c2ccc([N+](=O)[O-])s2)sc...         26   
4      121379  Cc1ccc(S(=O)(=O)N2CCC(C(=O)Nc3nnc(C45CC6CC(CC(...         34   
...       ...                                                ...        ...   
40453  161828            O=C(CSc1nnc(-c2ccncc2)o1)N1CCc2ccccc2C1         25   
40454  161829    N=c1sccn1CC(=O)Nc1ccc(Cl)c(S(=O)(=O)N2CCOCC2)c1         26   
40455  161830  CC(/C=C/c1ccc2c(c1)OCO2)=N\NC(=O)c1cccc([N+](=...         26   
40456  161831        CC(C)Cn1c(=O)c(C(=O)Nc2cnccn2)c(O)c2ccccc21         25   
40457  161832  O=C(CSc1nc2c(c(C(F)(F)F)n1)CCc1ccccc1-2)NCc1cc...         31   

           mol_wt  mf_0  mf_1  mf_2  mf_3  mf_4  mf_5  ... 

In [51]:
# Save the two feature tables as CSV files.
# Create the output directory first: to_csv does not create missing folders
# and would otherwise fail on a fresh checkout.
os.makedirs("Datasets", exist_ok=True)

train_df.to_csv("Datasets/train_complete.csv")
test_df.to_csv("Datasets/test_complete.csv")