### Set up RDKit


In [0]:
# Download the Miniconda installer (-c resumes a partially downloaded file).
# NOTE(review): "latest" is unpinned, while a later cell appends a python3.7
# site-packages path -- confirm that the installed Python version still matches.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Run the installer in batch mode (-b) with /usr/local as the install prefix (-p);
# -f lets it proceed even though that prefix already exists.
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel, quietly (-q) and without prompting (-y).
!time conda install -q -y -c conda-forge rdkit

--2020-02-22 20:00:09--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71785000 (68M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2020-02-22 20:00:10 (162 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [71785000/71785000]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.2.0=py37_0
    - ca-certificates==2019.10.16=0
    - certifi==2019.9.11=py37_0
    - cffi==1.13.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0
    - conda=

Append the conda `site-packages` directory (where RDKit was installed) to the current Python system path.

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
# Add the site-packages directory under /usr/local to the import search path of
# this kernel.
# NOTE(review): the path hard-codes python3.7; presumably it has to match the
# Python version that the conda installation placed under /usr/local -- confirm.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; the Kaggle configuration directory set
# below lives on that drive.
drive.mount('/content/drive')
import os
# Tell the Kaggle CLI where to look for its configuration.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/Colab Notebooks/.kaggle/"
# Download the dataset archive.
# NOTE(review): the next cell unzips /content/smiles-toxicity.zip, so this
# presumably runs with /content as the working directory -- confirm.
!kaggle datasets download -d fanconic/smiles-toxicity

In [0]:
!unzip /content/smiles-toxicity.zip

Archive:  /content/smiles-toxicity.zip
replace data/NR-ER-test/names_labels.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [0]:
import numpy as np
import pandas as pd
 
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole

# Record the RDKit version in the cell output (a bare expression in the middle
# of a cell is not displayed).
print("RDKit version:", rdBase.rdkitVersion)


def read_split(split_dir):
  """Read one dataset split from `split_dir`.

  Both `names_smiles.csv` and `names_labels.csv` are read with explicit column
  names (i.e. as files without a header row) and joined on the molecule name.

  Args:
    split_dir: directory that contains the two CSV files.

  Returns:
    A tuple `(smiles_frame, labels_frame, merged_frame)`; `merged_frame` has the
    columns "name", "smile" and "label".
  """
  smiles_frame = pd.read_csv(split_dir + '/names_smiles.csv', names=["name","smile"])
  labels_frame = pd.read_csv(split_dir + '/names_labels.csv', names=["name","label"])
  return smiles_frame, labels_frame, pd.merge(smiles_frame, labels_frame, on=["name"])


# The former `[x for x in column if x != None]` comprehensions never dropped a
# row: pandas stores missing CSV values as NaN, and `NaN != None` is True. A
# plain list conversion yields the same lists without suggesting a filter.

#Read and transform training set
trainsdf_smiles, trainsdf_labels, trainsdf = read_split('/content/data/NR-ER-train')
train_names = trainsdf["name"].tolist()
train_smiles = trainsdf["smile"].tolist()
train_labels = trainsdf["label"].tolist()
trainsdf = trainsdf.drop(columns="name")

#Read and transform testing set
testdf_smiles, testdf_labels, testdf = read_split('/content/data/NR-ER-test')
test_names = testdf["name"].tolist()
test_smiles = testdf["smile"].tolist()
test_labels = testdf["label"].tolist()
testdf = testdf.drop(columns="name")

print(len(train_names), len(test_names))



7697 265


In [0]:
def mol2arr(mol):
  """Return the radius-2 Morgan bit-vector fingerprint of `mol` as a NumPy array.

  Args:
    mol: RDKit molecule to fingerprint.

  Returns:
    The NumPy float array that the fingerprint was converted into.
  """
  fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol=mol, radius=2)
  # ConvertToNumpyArray writes into the array it is handed, so a placeholder
  # array is created first and returned afterwards.
  bits = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(fingerprint, bits)
  return bits

In [0]:
from rdkit import Chem
import tensorflow as tf
import tensorflow.keras as keras
from keras import Model
from keras.layers import Activation, Dense, Dropout, Input
from keras.utils import np_utils
from rdkit.Chem import DataStructs

def smiles_to_fingerprints(smiles_list, label_list, radius=2, n_bits=2048):
  """Convert SMILES strings into hashed Morgan count fingerprints.

  Molecules that RDKit cannot parse (`Chem.MolFromSmiles` returns None) and
  entries that are not strings are skipped together with their label, so the
  returned features and labels always stay aligned. This replaces hand-kept
  lists of "bad" row indices, which are only valid for one particular dataset
  and RDKit version.

  Args:
    smiles_list: sequence of SMILES strings.
    label_list: sequence of labels, parallel to `smiles_list`.
    radius: Morgan fingerprint radius.
    n_bits: length of the hashed fingerprint.

  Returns:
    A tuple `(features, labels, skipped)`: `features` is a list of int8 NumPy
    arrays, `labels` a NumPy array with one entry per kept molecule and
    `skipped` the list of positions in `smiles_list` that were dropped.
  """
  features, labels, skipped = [], [], []
  for position, (smiles, label) in enumerate(zip(smiles_list, label_list)):
    m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if m is None:
      skipped.append(position)
      continue
    finger = AllChem.GetHashedMorganFingerprint(m, radius, nBits=n_bits)
    # ConvertToNumpyArray fills the array it is handed.
    counts = np.zeros((0, ), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(finger, counts)
    features.append(counts)
    labels.append(label)
  return features, np.array(labels), skipped


trainX, trainY, train_skipped = smiles_to_fingerprints(train_smiles, train_labels)
print(str(len(trainX))+" sanitized molecules in the training set")
assert(len(trainY)==len(trainX))

testX, testY, test_skipped = smiles_to_fingerprints(test_smiles, test_labels)
print(str(len(testX))+" sanitized molecules in the testing set")
assert(len(testY)==len(testX))



Using TensorFlow backend.


7693 sanitized molecules in the training set
264 sanitized molecules in the testing set




In [0]:
# Sanity check: show the indices of the non-zero entries of one training fingerprint.
print(trainX[10].nonzero())

(array([ 496,  647,  875,  888, 1013, 1039, 1055, 1057, 1088, 1171, 1380,
       1457, 1722, 1750, 1823, 1873, 1957]),)


## Training the neural model


In [0]:


'''
Train a simple classifier to predict each chemical's toxicity based on its structural information
(Binary Classification)
'''
import keras
import sklearn
from sklearn.model_selection import StratifiedKFold
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from keras.layers import Dropout, LSTM, Dense, TimeDistributed, Activation, Input , BatchNormalization
from keras.optimizers import SGD, Adam
from keras.callbacks import ModelCheckpoint


def Dense_NN():
    """Build and compile the dense binary classifier.

    Layers, in order: Dense(100, relu) on a 2048-dimensional input,
    BatchNormalization, Dropout(0.5), Dense(50, relu), BatchNormalization,
    Dropout(0.5) and a single sigmoid output unit. Every Dense layer uses the
    'he_normal' initializer. The model is compiled with Adam (lr=0.0001) and
    binary cross-entropy, and its summary is printed as a side effect.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential()

    # First hidden block; this layer also fixes the input dimension.
    network.add(Dense(100, input_dim=2048, kernel_initializer='he_normal', activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.5))

    # Second hidden block.
    network.add(Dense(50, kernel_initializer='he_normal', activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.5))

    # Single sigmoid unit for the binary output.
    network.add(Dense(1, kernel_initializer='he_normal', activation='sigmoid'))

    network.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001))
    network.summary()
    return network


data = np.array(trainX)
print(data.shape)
labels = np.array(trainY)

n_splits = 10
epochs = 25
skf = StratifiedKFold(n_splits=n_splits)

for train_index, test_index in skf.split(data, labels):
  # Start every fold from freshly initialised weights. Re-using one model
  # instance would keep training it on samples that serve as held-out data in
  # other folds, so the folds would not be independent.
  non_linear_model = Dense_NN()

  x_train, x_test = data[train_index], data[test_index]
  y_train, y_test = labels[train_index], labels[test_index]

  # Keras expects `class_weight` as a {class_index: weight} mapping; the bare
  # array of unique labels passed before is not such a mapping and never
  # balanced the classes. Weights follow the usual "balanced" heuristic
  # n_samples / (n_classes * class_count).
  # Assumes integer class labels (0/1 for this binary task).
  classes, counts = np.unique(y_train, return_counts=True)
  class_weight = {int(cls): float(len(y_train)) / (len(classes) * int(count))
                  for cls, count in zip(classes, counts)}

  # Validate on the fold's own held-out split. It used to be computed but left
  # unused, while the external test set was fed in as validation data.
  hist = non_linear_model.fit(x_train, y_train,
                              epochs=epochs, batch_size=16,
                              callbacks=[ModelCheckpoint('classification.h5')],
                              class_weight=class_weight,
                              validation_data=(x_test, y_test))
  plt.plot(range(epochs), hist.history['val_loss'])
  plt.plot(range(epochs), hist.history['loss'])

plt.xlabel('epoch')
plt.ylabel('binary cross-entropy')
plt.title('Validation and training loss per fold')




In [0]:
from rdkit.Chem import Draw
# Draw.MolToFile needs an RDKit Mol object. testX holds NumPy fingerprint arrays,
# which cannot be drawn, so the molecule is rebuilt from its SMILES string.
# NOTE(review): this assumes test_smiles[123] is the molecule behind testX[123],
# which holds as long as no test molecule before index 123 was dropped during
# fingerprinting -- confirm.
random_mol = Chem.MolFromSmiles(test_smiles[123])
Draw.MolToFile(random_mol,'mol.png')

## Color Representation


In [0]:

from rdkit import Chem
import tensorflow as tf
import tensorflow.keras as keras
from keras import Model
from keras.layers import Activation, Dense, Dropout, Input
from keras.utils import np_utils
from rdkit.Chem import DataStructs

# Write every parsable training molecule, with 2D coordinates, to an SD file.
# trainX_3D stays empty, as before: no positions are collected for the training set.
trainX_3D = []
w = Chem.SDWriter('data/molecules2D.sdf')
try:
  for smiles in train_smiles:
    # Skip entries RDKit cannot parse instead of relying on a hard-coded list
    # of row indices.
    m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if m is None:
      continue
    mol2 = Chem.RemoveHs(m)
    # The earlier AllChem.EmbedMolecule call is dropped: its result was never
    # used, and Compute2DCoords (default clearConfs=True) discarded the embedded
    # conformer straight away.
    AllChem.Compute2DCoords(mol2)
    w.write(mol2)
finally:
  # Close the writer so that all records are flushed to disk.
  w.close()

# Collect the 2D atom positions of every parsable test molecule.
testX_3D = []
for smiles in test_smiles:
  m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
  if m is None:
    continue
  AllChem.Compute2DCoords(m)
  positions = m.GetConformer().GetPositions()
  testX_3D.append(positions)




In [0]:
!conda install -c openbabel openbabel

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
Solving environment: | / - \ | / - \ | done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - openbabel


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.1.1   |                0         125 KB
    certifi-2019.11.28         |           py37_0         153 KB
    openbabel-2.4.1            |           py37_6         5.1 MB  openbabel
    openssl-1.1.1d             |       h7b6447c_4         2.5 MB
    ------------------------------------------------------------
                                           Total:         7.8 MB

The following NEW packages will be INSTALLED:

  openbabel          openbabel/linux-64::openbabel-2.4.1-py37_6

The following packages

In [0]:

 babel -isdf molecule.sdf -opdb molecule.pdb --gen3D









KeyboardInterrupt: ignored

In [0]:
def parse_pdb_models(lines, verbose=True):
    """Collect the atom coordinates of every model found in PDB-formatted lines.

    A line starting with 'MODEL' opens a new molecule. Every following line that
    starts with 'ATOM' or 'HETATM' contributes one `[x, y, z]` entry, read from
    the character columns 30-38, 38-46 and 46-54 of the line. Atom records that
    appear before any 'MODEL' line are collected into a molecule of their own
    rather than being dropped.

    Args:
        lines: iterable of text lines, e.g. an open file object.
        verbose: when True, print one progress line per 'MODEL' record.

    Returns:
        A list with one entry per molecule; each entry is a list of
        `[x, y, z]` float triples.

    Raises:
        ValueError: if a coordinate field cannot be converted to float.
    """
    molecules = []
    current = None
    model_num = 1
    for line in lines:
        if line[:5] == 'MODEL':
            if verbose:
                print('Parsing  Molecule '+str(model_num))
            model_num = model_num + 1
            current = []
            molecules.append(current)
        elif line[:4] == 'ATOM' or line[:6] == "HETATM":
            if current is None:
                current = []
                molecules.append(current)
            # float() ignores the padding blanks of the fixed-width fields.
            current.append([float(line[30:38]),
                            float(line[38:46]),
                            float(line[46:54])])
    return molecules


# The coordinates used to be parsed and then thrown away; they are now stored
# per molecule in Xtrain_coords.
with open('molecule.pdb') as pdbfile:
    Xtrain_coords = parse_pdb_models(pdbfile)

Parsing  Molecule 1
Parsing  Molecule 2
Parsing  Molecule 3
Parsing  Molecule 4
Parsing  Molecule 5
Parsing  Molecule 6
Parsing  Molecule 7
Parsing  Molecule 8
Parsing  Molecule 9
Parsing  Molecule 10
Parsing  Molecule 11
Parsing  Molecule 12
Parsing  Molecule 13
Parsing  Molecule 14
Parsing  Molecule 15
Parsing  Molecule 16
Parsing  Molecule 17
Parsing  Molecule 18
Parsing  Molecule 19
Parsing  Molecule 20
Parsing  Molecule 21
Parsing  Molecule 22
Parsing  Molecule 23
Parsing  Molecule 24
Parsing  Molecule 25
Parsing  Molecule 26
Parsing  Molecule 27
Parsing  Molecule 28
Parsing  Molecule 29
Parsing  Molecule 30
Parsing  Molecule 31
Parsing  Molecule 32
Parsing  Molecule 33
Parsing  Molecule 34
Parsing  Molecule 35
Parsing  Molecule 36
Parsing  Molecule 37
Parsing  Molecule 38
Parsing  Molecule 39
Parsing  Molecule 40
Parsing  Molecule 41
Parsing  Molecule 42
Parsing  Molecule 43
Parsing  Molecule 44
Parsing  Molecule 45
Parsing  Molecule 46
Parsing  Molecule 47
Parsing  Molecule 48
P