# Train the pretrained model (Initial_Training.ipynb) with additional data

### Import core libraries

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
import random as rn
import matplotlib.pyplot as plt
%matplotlib inline

#%tensorflow_version 2.x
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.3.1


In [2]:
# load Keras libraries
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from tensorflow.keras.utils import Sequence

In [3]:
# Seed every random number generator the notebook uses so that repeated
# "Restart & Run All" executions vary as little as possible.
seed = 777
np.random.seed(seed)      # NumPy: latent-space noise and temperature sampling
rn.seed(seed)             # Python's built-in `random` module
tf.random.set_seed(seed)  # TensorFlow global seed (was previously left unseeded)
# sample_with_temp takes np.log of predicted probabilities; presumably this
# silences the divide-by-zero warning raised when a probability is exactly 0.
np.seterr(divide='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

### load the model

In [4]:
# Location of the previously trained model file (HDF5 format).
model_file = '../input/data-zinc/LSTM_model.h5'
# Restore the saved Keras model and print its layer summary for inspection.
model = load_model(model_file)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 45)]    0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 309248      input_1[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 512)          0           lstm[0][1]                       
                                                                 lstm[0][2]                       
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          65664       concatenate[0][0]            

### create data load functions

In [5]:

def load_data(data):
    """Read a SMILES file into a NumPy array with one string per molecule.

    Parameters
    ----------
    data : str
        Path to a text file holding one SMILES string per line.

    Returns
    -------
    numpy.ndarray
        1-D array of SMILES strings with trailing whitespace (including the
        newline) removed.  Blank or whitespace-only lines are skipped so that
        an empty string is never treated as a molecule.
    """
    with open(data, 'r') as f:
        # Keep rstrip() (not strip()) so leading characters are left untouched.
        smiles = [line.rstrip() for line in f if line.strip()]
    return np.array(smiles)

def load_dictionaries(input_dict):
    """Parse the JSON file at ``input_dict`` and return the decoded object.

    Parameters
    ----------
    input_dict : str
        Path to a JSON file (here: a character/integer mapping).

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file, e.g. a ``dict``.
    """
    with open(input_dict, 'r') as handle:
        return json.load(handle)

### Additional Training Data

In [6]:
# File with the additional SMILES strings used to continue training.
smifile = '../input/data-zinc/nextgen_smiles.smi'
# Read the file into a 1-D array, one SMILES string per element.
data = load_data(smifile)
print("Training dataset shape:", data.shape)
# Print the entry at index 2 as a quick sanity check of the loaded data.
print("Sample training smile: \n", data[2])

Training dataset shape: (100,)
Sample training smile: 
 NC1CC(O)C23CCC1C2Cc1c2cc[n+](nc(cnc1C=O)C3)CC1=CC=C(C=C2)N(N)O1


### Load the Python dictionaries that map characters-to-integers and integers-to-characters

In [7]:
# JSON files holding the two vocabulary mappings.
d1 = '../input/data-zinc/char_to_int.json'
d2 = '../input/data-zinc/int_to_char.json'
# char_to_int maps a character to its one-hot index.
char_to_int = load_dictionaries(d1)
# int_to_char is the reverse mapping; as loaded from JSON its keys are strings
# (e.g. '0'), which is why lookups elsewhere use str(index).
int_to_char = load_dictionaries(d2)
# Vocabulary size = number of distinct characters; used as the one-hot width.
n_vocab = len(char_to_int)
print("Character set vocabulary length:", n_vocab)
print("Dictionary mapping characters-to-integers:\n", char_to_int)
print("Dictionary mapping integers-to-characters:\n", int_to_char)

Character set vocabulary length: 45
Dictionary mapping characters-to-integers:
 {'n': 0, '[': 1, '\\': 2, 'E': 3, 'H': 4, ')': 5, 'B': 6, '9': 7, '2': 8, ']': 9, '7': 10, '!': 11, 't': 12, 's': 13, 'o': 14, 'c': 15, 'K': 16, '-': 17, '/': 18, 'l': 19, 'A': 20, 'r': 21, '@': 22, 'C': 23, '=': 24, '6': 25, 'N': 26, 'L': 27, 'a': 28, '5': 29, 'S': 30, 'T': 31, '#': 32, '+': 33, 'P': 34, 'i': 35, '(': 36, '8': 37, '1': 38, 'I': 39, 'e': 40, 'O': 41, '3': 42, 'F': 43, '4': 44}
Dictionary mapping integers-to-characters:
 {'0': 'n', '1': '[', '2': '\\', '3': 'E', '4': 'H', '5': ')', '6': 'B', '7': '9', '8': '2', '9': ']', '10': '7', '11': '!', '12': 't', '13': 's', '14': 'o', '15': 'c', '16': 'K', '17': '-', '18': '/', '19': 'l', '20': 'A', '21': 'r', '22': '@', '23': 'C', '24': '=', '25': '6', '26': 'N', '27': 'L', '28': 'a', '29': '5', '30': 'S', '31': 'T', '32': '#', '33': '+', '34': 'P', '35': 'i', '36': '(', '37': '8', '38': '1', '39': 'I', '40': 'e', '41': 'O', '42': '3', '43': 'F', '44

### Create a function to turn the dataset into a supervised problem, add the beginning and ending character markers, add padding for constant sequence length, and turn the sequence into a sequence of one-hot vectors

In [8]:
def vectorize(smiles, embed, n_vocab):
    """One-hot encode SMILES strings into shifted input/target tensors.

    Each string is framed as ``"!" + smiles + "E" * padding`` so that every
    encoded sequence has exactly ``embed`` positions.  The character indices
    come from the module-level ``char_to_int`` dictionary.

    Parameters
    ----------
    smiles : numpy.ndarray
        1-D array of SMILES strings.
    embed : int
        Total sequence length including the start marker and end padding.
    n_vocab : int
        Size of the character vocabulary (width of each one-hot vector).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``, both ``int8`` of shape ``(len(smiles), embed - 1, n_vocab)``.
        ``X`` drops the last position and ``y`` drops the first, so ``y[t]`` is
        the character that follows ``X[t]``.

    Raises
    ------
    ValueError
        If a SMILES string has more than ``embed - 1`` characters and therefore
        cannot be placed after the start marker.
    KeyError
        If a character is missing from ``char_to_int``.
    """
    start_idx = char_to_int["!"]
    end_idx = char_to_int["E"]
    one_hot = np.zeros((smiles.shape[0], embed, n_vocab), dtype=np.int8)
    for i, smile in enumerate(smiles):
        # Fail early with a clear message instead of an opaque IndexError from
        # the per-character assignment below.  A string of exactly embed - 1
        # characters still fits, but leaves no room for an end marker.
        if len(smile) > embed - 1:
            raise ValueError(
                f"SMILES at index {i} has {len(smile)} characters; "
                f"at most {embed - 1} fit when embed={embed}"
            )
        # encode the start marker
        one_hot[i, 0, start_idx] = 1
        # encode the SMILES characters, shifted right by one for the marker
        for j, c in enumerate(smile):
            one_hot[i, j + 1, char_to_int[c]] = 1
        # pad every remaining position with the end marker
        one_hot[i, len(smile) + 1:, end_idx] = 1
    # input = all but the last step, target = all but the first step
    return one_hot[:, 0:-1, :], one_hot[:, 1:, :]

### Create our X & y dataset

In [9]:
# Encoded sequence length before the input/target shift; vectorize returns
# arrays that are one step shorter (embed - 1 positions).
embed = 101
# X_train is the model input, y_train the same sequences shifted by one step.
X_train, y_train = vectorize(data, embed, n_vocab)
print("Training input shape:", X_train.shape)
print("Training output shape:", y_train.shape)

Training input shape: (100, 100, 45)
Training output shape: (100, 100, 45)


### Train the model on additional dataset

In [10]:
# Fine-tuning hyper-parameters; 225 // 4 evaluates to 56 epochs.
batch_size = 16
nb_epochs = 225 // 4
# The same one-hot tensor is passed for both model inputs; the target is the
# sequence shifted by one character, as produced by vectorize.
model.fit([X_train, X_train], y_train, epochs=nb_epochs, batch_size=batch_size)

Epoch 1/56
Epoch 2/56
Epoch 3/56
Epoch 4/56
Epoch 5/56
Epoch 6/56
Epoch 7/56
Epoch 8/56
Epoch 9/56
Epoch 10/56
Epoch 11/56
Epoch 12/56
Epoch 13/56
Epoch 14/56
Epoch 15/56
Epoch 16/56
Epoch 17/56
Epoch 18/56
Epoch 19/56
Epoch 20/56
Epoch 21/56
Epoch 22/56
Epoch 23/56
Epoch 24/56
Epoch 25/56
Epoch 26/56
Epoch 27/56
Epoch 28/56
Epoch 29/56
Epoch 30/56
Epoch 31/56
Epoch 32/56
Epoch 33/56
Epoch 34/56
Epoch 35/56
Epoch 36/56
Epoch 37/56
Epoch 38/56
Epoch 39/56
Epoch 40/56
Epoch 41/56
Epoch 42/56
Epoch 43/56
Epoch 44/56
Epoch 45/56
Epoch 46/56
Epoch 47/56
Epoch 48/56
Epoch 49/56
Epoch 50/56
Epoch 51/56
Epoch 52/56
Epoch 53/56
Epoch 54/56
Epoch 55/56
Epoch 56/56


<tensorflow.python.keras.callbacks.History at 0x7fc76820d050>

### Save our better learned model & weights

In [11]:
# Write the further-trained model to a new file name in the working directory,
# separate from the file it was loaded from.
mod_file = 'Better_LSTM_model.h5'
model.save(mod_file)
print("Model Saved")

Model Saved


### Create the encoder model from the previously trained model

In [12]:
# Build the encoder from the trained layers: it runs from the input of layer 0
# to the output of layer 3 (the 128-unit "dense" layer in the summary), so the
# weights are shared with `model`.
encoder_model = Model(inputs=model.layers[0].input, outputs=model.layers[3].output)
encoder_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 45)]    0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 309248      input_1[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 512)          0           lstm[0][1]                       
                                                                 lstm[0][2]                       
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          65664       concatenate[0][0]     

Next, we need to create an intermediary model, one that can decode the latent space into the states that need to be set as inputs to the decoder LSTM cells. A new input, matching the latent space is defined. The model layers from before can be reused to get the h and c states; that way we are able to inherit the weights from the trained model.


### Create a model for mapping from the latent space to the input states of the decoder LSTM model

In [13]:
# New input whose width (128) matches the encoder's latent output.
latent_input = Input(shape=(128, ))
# Apply the trained layers at indices 5 and 6 to the latent vector; their
# outputs are used as the h and c states that initialise the decoder LSTM.
state_h = model.layers[5](latent_input)
state_c = model.layers[6](latent_input)
# Model mapping a latent vector to the pair [state_h, state_c].
latent_to_states_model = Model(latent_input, [state_h, state_c])
latent_to_states_model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          33024       input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 256)          33024       input_1[0][0]                    
Total params: 66,048
Trainable params: 66,048
Non-trainable params: 0
__________________________________________________________________________________________________


Now it's time to create the decoder model. The base model was trained in a stateless batch mode; however, we will set up the decoder model to be stateful so that it can predict one character at a time. The layers are defined exactly as before, except with a new batch_shape.

### Define the stateful decoder model

In [14]:
# batch_shape=(1, 1, 45): one sequence, one time step, a 45-wide one-hot input.
# NOTE(review): 45 and 256 are hard-coded; they need to match n_vocab and the
# LSTM width of the trained model for the weight transfer to work - confirm if
# either ever changes.
decoder_inputs = Input(batch_shape=(1, 1, 45))
# stateful=True keeps the LSTM state between predict() calls, so the sequence
# can be generated one character per call.
decoder_lstm = LSTM(256, return_sequences=True, stateful=True)(decoder_inputs)
# Softmax over the vocabulary gives the next-character probabilities.
decoder_outputs = Dense(45, activation='softmax')(decoder_lstm)
gen_model = Model(decoder_inputs, decoder_outputs)

### Transfer the weights from the model to our generative model

In [15]:
# Copy weights into gen_model layers 1 and 2 (its LSTM and Dense layers) from
# layers 7 and 8 of the trained model.
# NOTE(review): presumably layers 7 and 8 are the trained decoder LSTM and its
# output Dense layer - the model summary shown above is truncated; confirm.
for i in range(1,3):
    gen_model.layers[i].set_weights(model.layers[i+6].get_weights())
# Persist the single-step generator and show its structure.
gen_model.save("gen_model.h5")
gen_model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(1, 1, 45)]              0         
_________________________________________________________________
lstm (LSTM)                  (1, 1, 256)               309248    
_________________________________________________________________
dense (Dense)                (1, 1, 45)                11565     
Total params: 320,813
Trainable params: 320,813
Non-trainable params: 0
_________________________________________________________________


We can play with the temperature of the Softmax activation function during sampling. Decreasing the temperature from 1 to some lower number (e.g. 0.5) makes the RNN more confident, but also more conservative in its samples. Conversely, higher temperatures provide more diversity, but at the cost of more invalid SMILES strings.

### Create our Softmax sampling function

In [16]:
 def sample_with_temp(preds, sampling_temp):
    """Draw a character index from ``preds`` after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / sampling_temp)``
    and one index is drawn from the result with ``np.random.choice``.

    Parameters
    ----------
    preds : array-like
        1-D vector of next-character probabilities.
    sampling_temp : float
        Softmax temperature; must be strictly positive.

    Returns
    -------
    int
        Index of the sampled entry of ``preds``.

    Raises
    ------
    ValueError
        If ``sampling_temp`` is not strictly positive.
    """
    if sampling_temp <= 0:
        raise ValueError("sampling_temp must be strictly positive")
    # Work in float64 and tolerate exact zeros (log(0) = -inf, which maps back
    # to a probability of 0 after the exponential).
    with np.errstate(divide="ignore"):
        scaled = np.log(np.asarray(preds, dtype=np.float64)) / sampling_temp
    # Subtract the maximum before exponentiating: mathematically a no-op for a
    # softmax, but it stops every term underflowing to 0 (0/0 = NaN) at low
    # temperatures.
    scaled = scaled - np.max(scaled)
    weights = np.exp(scaled)
    probs = weights / np.sum(weights)
    return np.random.choice(np.arange(len(probs)), p=probs)

To generate samples from the latent space, we need to compute the h and c states using the latent_to_states_model and then set the initial states of the LSTM decoder model. The generative model will be fed one input character at a time and iteratively sample the next character until the end character “E” is encountered.


### Create a function to generate new smiles from the latent space

In [17]:
def sample_smiles(latent, n_vocab, sampling_temp):
    """Decode one latent vector into a SMILES string, character by character.

    The latent vector is turned into two LSTM states by
    ``latent_to_states_model``; these are loaded into layer 1 of ``gen_model``.
    Starting from the "!" marker, the generator is then fed its own previous
    output until it emits the "E" marker or 101 steps have been taken.

    Parameters
    ----------
    latent : numpy.ndarray
        Latent vector passed to ``latent_to_states_model.predict``.
    n_vocab : int
        Vocabulary size, i.e. the width of the one-hot input vector.
    sampling_temp : float
        When exactly ``1.0`` the most probable character is taken (argmax);
        any other value draws the character via ``sample_with_temp``.

    Returns
    -------
    str
        The generated characters, without the start and end markers.
    """
    # Decode the latent vector and load the result as the LSTM's initial state.
    decoded = latent_to_states_model.predict(latent)
    gen_model.layers[1].reset_states(states=[decoded[0], decoded[1]])

    # The first input is the one-hot encoded start marker.
    current = np.zeros((1, 1, n_vocab))
    current[0, 0, char_to_int["!"]] = 1

    greedy = sampling_temp == 1.0
    chars = []
    for _ in range(101):
        probs = gen_model.predict(current)[0][-1]
        if greedy:
            next_idx = np.argmax(probs)
        else:
            next_idx = sample_with_temp(probs, sampling_temp)
        next_char = int_to_char[str(next_idx)]
        if next_char == "E":
            # End marker reached: the molecule is complete.
            break
        chars.append(next_char)
        # Feed the chosen character back in as the next one-hot input.
        current = np.zeros((1, 1, n_vocab))
        current[0, 0, next_idx] = 1
    return "".join(chars)

Now, we create our latent space using our encoder model.

In [18]:
# create the latent space
latent_space = encoder_model.predict(X_train)
print(f'Latent space shape: {latent_space.shape}')

Latent space shape: (100, 128)


Now we create a function to generate smiles molecules around a vector in our latent space.

### Function to generate smiles around a latent vector

In [19]:
def generate(latent_seed, sampling_temp, scale, quant):
    """Generate SMILES strings from random perturbations of a latent vector.

    For each sample, Gaussian noise drawn with ``np.random.randn`` and
    multiplied by ``scale`` is added to ``latent_seed``; the perturbed vector
    is then decoded with ``sample_smiles`` using the module-level ``n_vocab``.

    Parameters
    ----------
    latent_seed : numpy.ndarray
        2-D array whose second axis is the latent dimension (the code reads
        ``latent_seed.shape[1]``), e.g. ``latent_space[50:51]``.
    sampling_temp : float
        Temperature forwarded to ``sample_smiles``.
    scale : float
        Multiplier applied to the standard-normal noise.
    quant : int
        Number of strings to generate.

    Returns
    -------
    list of str
        The ``quant`` generated strings, in generation order.
    """
    latent_dim = latent_seed.shape[1]  # loop-invariant, so read it once
    samples = []
    for _ in range(quant):
        noise = scale * np.random.randn(latent_dim)
        samples.append(sample_smiles(latent_seed + noise, n_vocab, sampling_temp))
    return samples

### Generate our smiles molecules

In [20]:
# Use the latent vector of training molecule 50 as the seed; the 50:51 slice
# keeps a leading axis, which generate() needs because it reads shape[1].
latent_seed = latent_space[50:51]
# Temperature handed to the sampler (any value other than 1.0 samples randomly).
sampling_temp = 0.75
# Multiplier on the Gaussian noise added to the seed vector.
scale = 0.5
# Number of SMILES strings to generate.
quantity = 20
t_smiles = generate(latent_seed, sampling_temp, scale, quantity)

In [21]:
# Show the first three generated SMILES strings, one per line.
for generated in t_smiles[:3]:
    print(generated)

CN1C(C(SC(=Cc2ccccc2)c2cccc(CN=C(=O)Nc3ccc(F)cc3)c2oc3nonc2NCCCC(F)F)N3)CCC1
CNNC(=SC(C)(Cc1ccccc2)c1ccccc1C(=O)NC(O)(c1ccccn1)C(=O)O)N2CCCCF)C(=O)OC
NC1(C(SC(=CCn2ccccc2Oc2cccc(N3CCOC(=O)c3ccccc3F)N2CCCCCl)C2CC(=O)O)N2CCS12N
