# MODEL CODE - UNIDIRECTIONAL RNN WITH LSTM CELLS

In [96]:
import numpy as np
from collections import OrderedDict
from itertools import chain
from rdkit import Chem
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM

In [60]:
# Toy training set of five SMILES strings.
# Kept as a NumPy array of strings so that array attributes such as `.shape`
# are available on it and the preview below shows an `array([...])`.
train_data = np.array([
    'CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1',
    'COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O',
    'CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2',
    'Clc1ccccc1-c1nc(-c2ccncc2)no1',
    'CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1',
])

In [61]:
# Preview the first five SMILES strings of the training set.
train_data[:5]

array(['CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1',
       'COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O',
       'CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2',
       'Clc1ccccc1-c1nc(-c2ccncc2)no1',
       'CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1'], dtype='<U43')

In [62]:
# Parse the first SMILES string with RDKit; the resulting Mol object is the cell output.
Chem.MolFromSmiles(train_data[0])

<rdkit.Chem.rdchem.Mol at 0x29734fa5300>

In [73]:
# Collect every distinct character that occurs in the training SMILES strings,
# in sorted order. This list is the basis for the char <-> int vocabulary.
unique_chars = sorted({char for smiles in train_data for char in smiles})
unique_chars

['(', ')', '-', '1', '2', '3', '=', 'C', 'F', 'N', 'O', 'c', 'l', 'n', 'o']

In [85]:
# Build the vocabulary lookups: one index per unique character.
char_to_int = {char: index for index, char in enumerate(unique_chars)}  # character -> int
int_to_char = {index: char for index, char in enumerate(unique_chars)}  # int -> character

# Append the extra "E" token (used for padding) as the last index of both lookups.
char_to_int["E"] = len(char_to_int)
int_to_char[len(int_to_char)] = "E"
char_to_int

{'(': 0,
 ')': 1,
 '-': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '=': 6,
 'C': 7,
 'F': 8,
 'N': 9,
 'O': 10,
 'c': 11,
 'l': 12,
 'n': 13,
 'o': 14,
 'E': 15}

In [86]:
def gen_data(data, int_to_char, char_to_int, embed):
    """One-hot encode SMILES strings into next-character training pairs.

    Every string is written into a row of ``embed + 1`` time steps. All
    positions after the last character of a string are filled with the
    ``"E"`` token.

    Args:
        data: Sequence of SMILES strings. Any sized iterable works (list,
            tuple or 1-D NumPy array of strings).
        int_to_char: Index-to-character mapping. Not used by this function;
            kept so the call signature stays unchanged.
        char_to_int: Character-to-index mapping. Must contain ``"E"`` and
            every character that occurs in ``data`` (otherwise ``KeyError``).
        embed: Number of time steps in each returned array. A string longer
            than ``embed + 1`` characters raises ``IndexError``.

    Returns:
        Tuple ``(X, Y)`` of int8 arrays, each of shape
        ``(len(data), embed, len(char_to_int))``. ``X`` holds time steps
        ``0 .. embed-1`` and ``Y`` holds time steps ``1 .. embed``, i.e. ``Y``
        is the same encoding shifted one step ahead.
    """
    # len() instead of data.shape[0] so plain Python lists are accepted too.
    one_hot = np.zeros((len(data), embed + 1, len(char_to_int)), dtype=np.int8)
    for i, smile in enumerate(data):
        # Encode the characters of the string.
        for j, c in enumerate(smile):
            one_hot[i, j, char_to_int[c]] = 1
        # Fill every remaining time step with the "E" token.
        one_hot[i, len(smile):, char_to_int["E"]] = 1
    # Return two views of the same encoding: model input and shifted target.
    return one_hot[:, 0:-1, :], one_hot[:, 1:, :]

In [79]:
# Length of the longest SMILES string, so we know how far shorter ones must be padded.
embed = max(len(smiles) for smiles in train_data)
embed

43

In [87]:
# Encode the SMILES strings into one-hot input (X) and target (Y) arrays.
X, Y = gen_data(train_data, int_to_char, char_to_int, embed)
# Shuffle X and Y with the same permutation. The fixed seed keeps the sample
# order identical between runs so results are reproducible.
X, Y = shuffle(X, Y, random_state=42)

In [97]:
# Create the model (simple 2 layer LSTM)
model = Sequential()

# First layer declares the input: variable-length sequences of one-hot vectors
# whose width is the vocabulary size (last axis of X).
model.add(BatchNormalization(input_shape=(None, X.shape[2])))

# No input_shape here: this is not the first layer, so its input shape is
# taken from the preceding layer's output.
model.add(LSTM(512, return_sequences = True))
model.add(Dropout(0.25))

model.add(LSTM(256, return_sequences = True))
model.add(Dropout(0.25))

model.add(BatchNormalization())
# One softmax distribution over the vocabulary per time step.
model.add(Dense(Y.shape[-1], activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_6 (Batch (None, None, 16)          64        
_________________________________________________________________
lstm_12 (LSTM)               (None, None, 512)         1083392   
_________________________________________________________________
dropout_12 (Dropout)         (None, None, 512)         0         
_________________________________________________________________
lstm_13 (LSTM)               (None, None, 256)         787456    
_________________________________________________________________
dropout_13 (Dropout)         (None, None, 256)         0         
_________________________________________________________________
batch_normalization_7 (Batch (None, None, 256)         1024      
_________________________________________________________________
dense_6 (Dense)              (None, None, 16)         

In [104]:
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam

# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported as an extra metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=categorical_crossentropy,
    metrics=['accuracy'],
)

In [106]:
# Import from tensorflow.keras, like every other Keras import used to build
# this model, instead of mixing in the standalone `keras` package.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Stop training when the *training* loss has not improved for 3 consecutive
# epochs and restore the best weights seen. No validation data is passed to
# fit() below, so the callback monitors 'loss' rather than a validation loss.
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

history = model.fit(X, Y, epochs = 10, batch_size = 256, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# NOTE(review): this cell has never been executed (`In [None]`).
# NOTE(review): `Adam` is already imported from `tensorflow.keras.optimizers`
# in an earlier cell; this import rebinds the name to the standalone `keras`
# version. `SGD` is not used in any visible cell.
from keras.optimizers import Adam, SGD



# NOTE(review): `datagenerator`, `X_train`, `y_train`, `patch_size`,
# `patches_per_im`, `batch_size`, `val_images`, `val_segmentations`,
# `steps_per_epoch` and `epochs` are not defined in any cell visible here, so
# this call would raise NameError on a fresh Restart & Run All unless they are
# defined elsewhere — TODO confirm whether this cell belongs in this notebook.
# NOTE(review): `fit_generator` appears to be deprecated in recent
# TensorFlow/Keras releases in favour of `fit` — confirm against the installed version.
# Only `model` and `early_stopping` come from earlier cells of this notebook.
history = model.fit_generator(datagenerator(X_train, y_train, patch_size, patches_per_im, batch_size),
                              validation_data=(val_images, val_segmentations),
                              steps_per_epoch=steps_per_epoch, epochs=epochs, verbose=2,
                              callbacks=[early_stopping])