In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import pickle
import time

from rdkit import Chem
from rdkit.Chem import Draw


# Fingerprint settings. fp_bits feeds the encoder vocabulary size further down
# (input_vocab_size = fp_bits + 3); fp_radius is not referenced in the cells
# visible here.
# NOTE(review): the names suggest a circular-fingerprint radius and bit-vector
# length matching how Data/train.csv was generated -- confirm.
fp_radius = 2
fp_bits = 2048

import os
# Restrict the CUDA devices visible to this process to device index 2.
# NOTE(review): this assignment runs after `import tensorflow` at the top of the
# cell. It only takes effect if the CUDA runtime has not been initialised yet;
# consider moving it above the TensorFlow import, and confirm the hard-coded
# index is valid on the machine running the notebook.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [2]:
from Transformer.Transformer import Transformer, create_look_ahead_mask, create_padding_mask, create_masks, CustomSchedule, evaluate, loss_function, BeamSearch 
from ChemUtils.PreProcessing import read_file, smiles_to_fp_array, CHAR_LEN, array_to_smiles

# SMILES and fingerprint (FP) encoding

In [3]:
# Load the training CSV. read_file's result is unpacked into two array-like
# objects; later cells read smiles_array.shape[0], smiles_array.shape[1] and
# fp_array.shape[1], so both have at least two axes.
# NOTE(review): presumably both are already integer token ids (train_step
# declares int32 inputs) -- confirm in ChemUtils.PreProcessing.read_file.
smiles_array, fp_array = read_file("Data/train.csv")

# Split train and test set

In [4]:
# Disabled one-off train/test split. The whole cell is a single triple-quoted
# string, so none of the code inside it runs; the notebook simply echoes the
# string back as the cell output.
# NOTE(review): if this is ever re-enabled, `train_rate` is assigned but the
# threshold is hard-coded as 0.98, and the random mask is drawn without a seed,
# so the split would not be reproducible.
'''
train_rate = 0.98

r = np.random.uniform(size=[smiles_array.shape[0]])
r = r<0.98

train_smiles = smiles_array[r]
train_fp = fp_array[r]

test_smiles = smiles_array[~r]
test_fp = fp_array[~r]

with open("Data/train.pkl", "wb") as pf:
    pickle.dump([train_smiles,train_fp], pf)
    
with open("Data/test.pkl", "wb") as pf:
    pickle.dump([test_smiles,test_fp], pf)
'''

'\ntrain_rate = 0.98\n\nr = np.random.uniform(size=[smiles_array.shape[0]])\nr = r<0.98\n\ntrain_smiles = smiles_array[r]\ntrain_fp = fp_array[r]\n\ntest_smiles = smiles_array[~r]\ntest_fp = fp_array[~r]\n\nwith open("Data/train.pkl", "wb") as pf:\n    pickle.dump([train_smiles,train_fp], pf)\n    \nwith open("Data/test.pkl", "wb") as pf:\n    pickle.dump([test_smiles,test_fp], pf)\n'

# Define training data generator

In [5]:
#with open("Data/train.pkl",  'rb') as pf:
#    trains = pickle.load(pf)

# Input-pipeline settings.
BATCH_SIZE = 300
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Slice the two arrays row-by-row into (smiles, fingerprint) pairs, shuffle with
# a buffer as large as the dataset, then batch and prefetch.
gen = (
    tf.data.Dataset.from_tensor_slices((smiles_array, fp_array))
    .shuffle(buffer_size=smiles_array.shape[0])
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Define transformer model

In [6]:
# Values handed positionally to the project's Transformer constructor.
num_layers = 4
d_model = 256
num_heads = 8
dff = 2048
rate = 0.1

# Vocabulary sizes derived from the fingerprint width and the SMILES alphabet.
# NOTE(review): the "+ 3" presumably reserves ids for special tokens
# (e.g. start / end / padding) -- confirm against ChemUtils.PreProcessing.
input_vocab_size = fp_bits + 3
target_vocab_size = CHAR_LEN + 3

# Sequence lengths taken from the second axis of each data array.
pe_input = fp_array.shape[1]
pe_target = smiles_array.shape[1]

# Number of passes over the dataset made by the training loop.
EPOCHS = 35

transformer = Transformer(
    num_layers, d_model, num_heads, dff,
    input_vocab_size, target_vocab_size,
    pe_input, pe_target, rate,
)

In [7]:
#transformer.load_weights("Models/FPsToSmiles")

# Metrics

In [8]:
# Running metrics; the training loop resets both at the start of every epoch.
train_loss = tf.keras.metrics.Mean(
    name='train_loss',
)
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy',
)

# Training

In [9]:
# Adam optimizer whose learning rate follows the project's CustomSchedule,
# constructed from d_model.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [10]:
checkpoint_path = "./checkpoints/FPsToSMILES/train"

# Track the model together with the optimizer, keeping at most five checkpoints
# on disk.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    checkpoint_path,
    max_to_keep=5,
)

# Resume from the most recent checkpoint when the manager reports one.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [16]:
# Two rank-2 int32 tensors with unconstrained batch and sequence dimensions, so
# tf.function traces train_step once instead of once per distinct batch shape.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    for _ in range(2)
]


@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one gradient update on a single batch.

    Args:
        inp: rank-2 int32 tensor passed to the model as its first input.
        tar: rank-2 int32 tensor of target tokens. All columns but the last are
            fed to the model; all columns but the first are the labels.

    Returns:
        None. The step updates the model weights through ``optimizer`` and
        accumulates into the ``train_loss`` and ``train_accuracy`` metrics.
    """
    # Shift the target by one position: the model sees tokens [0, n-1) and is
    # scored against tokens [1, n).
    decoder_input = tar[:, :-1]
    expected_output = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, decoder_input)

    with tf.GradientTape() as tape:
        # Third positional argument is True -- presumably the training flag;
        # confirm against Transformer.call.
        predictions, _ = transformer(
            inp, decoder_input, True,
            enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(expected_output, predictions)

    variables = transformer.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(loss)
    train_accuracy(expected_output, predictions)

In [17]:
# Main training loop: EPOCHS passes over `gen`, printing progress every 100
# batches and writing a checkpoint after every 5th epoch.
# NOTE(review): the epoch counter always starts from 1, even when the checkpoint
# cell restored earlier training state, so the printed epoch numbers are
# relative to this run only.
for epoch in range(EPOCHS):
    start = time.time()

    # Clear the running metrics so each epoch reports its own averages.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # `gen` yields (smiles, fingerprint) batches, so the tuple is unpacked as
    # tar -> SMILES tokens (target) and inp -> fingerprint tokens (model input).
    for (batch, (tar, inp)) in enumerate(gen):
        train_step(inp, tar)

        # Metrics are cumulative over the epoch so far, not per-batch values.
        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    # Checkpoint every 5th epoch (1-based numbering).
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.2029 Accuracy 0.0026
Epoch 1 Batch 100 Loss 1.4521 Accuracy 0.1065
Epoch 1 Batch 200 Loss 1.2109 Accuracy 0.1625
Epoch 1 Batch 300 Loss 1.0516 Accuracy 0.2008
Epoch 1 Batch 400 Loss 0.9387 Accuracy 0.2299
Epoch 1 Batch 500 Loss 0.8533 Accuracy 0.2524
Epoch 1 Batch 600 Loss 0.7867 Accuracy 0.2705
Epoch 1 Batch 700 Loss 0.7326 Accuracy 0.2856
Epoch 1 Loss 0.7167 Accuracy 0.2900
Time taken for 1 epoch: 367.8356695175171 secs

Epoch 2 Batch 0 Loss 0.3691 Accuracy 0.3856
Epoch 2 Batch 100 Loss 0.3583 Accuracy 0.3919
Epoch 2 Batch 200 Loss 0.3435 Accuracy 0.3966
Epoch 2 Batch 300 Loss 0.3305 Accuracy 0.4009
Epoch 2 Batch 400 Loss 0.3179 Accuracy 0.4050
Epoch 2 Batch 500 Loss 0.3061 Accuracy 0.4091
Epoch 2 Batch 600 Loss 0.2950 Accuracy 0.4131
Epoch 2 Batch 700 Loss 0.2839 Accuracy 0.4170
Epoch 2 Loss 0.2803 Accuracy 0.4182
Time taken for 1 epoch: 360.5914988517761 secs

Epoch 3 Batch 0 Loss 0.2022 Accuracy 0.4528
Epoch 3 Batch 100 Loss 0.1921 Accuracy 0.4489
Epoch 3 Ba