In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import pickle
import time

from rdkit import Chem
from rdkit.Chem import Draw


# Fingerprint settings; fp_bits is forwarded to read_file() when the data is loaded.
fp_radius, fp_bits = 2, 4096

import os

# Restrict CUDA to the device with index 2.
os.environ.update(CUDA_VISIBLE_DEVICES="2")

In [3]:
from Transformer.Transformer import Transformer, create_look_ahead_mask, create_padding_mask, create_masks, CustomSchedule, evaluate, loss_function, BeamSearch 
from ChemUtils.PreProcessing import read_file, smiles_to_fp_array, CHAR_LEN, array_to_smiles

# SMILES and fingerprint (FP) encoding

In [4]:
# Read the training CSV into two arrays (SMILES side and fingerprint side);
# both are indexed by row (.shape[0]) and sequence position (.shape[1]) further down.
smiles_array, fp_array = read_file(
    "Data/train.csv",
    fp_bits=fp_bits,
    useCount=True,
)

# Split into training and test sets (currently disabled)

In [5]:
# Disabled train/test split: the code is kept inside a bare string literal, so
# nothing in it runs and the cell merely echoes the string.
# If enabled it would draw one uniform random number per row, keep rows with
# r < 0.98 as the training subset and the rest as the test subset, and pickle
# both [smiles, fp] pairs under Data/.
# NOTE(review): `train_rate` is assigned but never used; the threshold is
# hard-coded as 0.98 on the following line. No random seed is set either.
'''
train_rate = 0.98

r = np.random.uniform(size=[smiles_array.shape[0]])
r = r<0.98

train_smiles = smiles_array[r]
train_fp = fp_array[r]

test_smiles = smiles_array[~r]
test_fp = fp_array[~r]

with open("Data/train_dupli_count.pkl", "wb") as pf:
    pickle.dump([train_smiles,train_fp], pf)
    
with open("Data/test_dupli_count.pkl", "wb") as pf:
    pickle.dump([test_smiles,test_fp], pf)
'''

'\ntrain_rate = 0.98\n\nr = np.random.uniform(size=[smiles_array.shape[0]])\nr = r<0.98\n\ntrain_smiles = smiles_array[r]\ntrain_fp = fp_array[r]\n\ntest_smiles = smiles_array[~r]\ntest_fp = fp_array[~r]\n\nwith open("Data/train_dupli_count.pkl", "wb") as pf:\n    pickle.dump([train_smiles,train_fp], pf)\n    \nwith open("Data/test_dupli_count.pkl", "wb") as pf:\n    pickle.dump([test_smiles,test_fp], pf)\n'

# Define training data generator

In [6]:
# The pickled split (Data/train_dupli_count.pkl) is not loaded here; the
# pipeline is built from the full arrays returned by read_file().
BATCH_SIZE = 200
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Each element is a (SMILES row, fingerprint row) pair. The shuffle buffer
# covers the whole dataset, then rows are batched and prefetched.
gen = (
    tf.data.Dataset.from_tensor_slices((smiles_array, fp_array))
    .shuffle(buffer_size=smiles_array.shape[0])
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Define transformer model

In [7]:
# --- Architecture hyper-parameters ---
num_layers = 6
d_model = 256
num_heads = 8
dff = 2048
rate = 0.1  # presumably the dropout rate -- TODO confirm against Transformer

# --- Vocabulary sizes ---
# Three ids are added on top of the fingerprint bits / SMILES characters
# (presumably reserved for special tokens -- TODO confirm in PreProcessing).
input_vocab_size = fp_bits + 3
target_vocab_size = CHAR_LEN + 3

# --- Sequence lengths, taken from the second dimension of the encoded arrays ---
pe_input = fp_array.shape[1]
pe_target = smiles_array.shape[1]

# --- Training length ---
EPOCHS = 35

transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    target_vocab_size,
    pe_input,
    pe_target,
    rate,
)

In [8]:
# Disabled: would load saved weights from "Models/FPsToSmiles" into the model.
#transformer.load_weights("Models/FPsToSmiles")

# Metrics

In [9]:
# Running metrics; train_step() feeds them and the epoch loop resets them.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")
train_loss = tf.keras.metrics.Mean(name="train_loss")

# Training

In [10]:
# Learning-rate schedule parameterised by the model width.
learning_rate = CustomSchedule(d_model)

# Adam with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [11]:
checkpoint_path = "./checkpoints/FPsToSMILES(DupliCount)/train"

# Track model and optimizer state together; only the 5 newest checkpoints are kept.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

# Resume from the most recent checkpoint when one exists in checkpoint_path.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [None]:
# Both arguments are declared as rank-2 int32 tensors with unspecified
# dimensions, so the traced graph is not tied to one batch size or length.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32) for _ in range(2)
]


@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        inp: rank-2 int32 tensor passed to the model as its first input.
        tar: rank-2 int32 tensor of target sequences. Dropping the last
            position gives the decoder input; dropping the first gives the
            labels the predictions are scored against.

    Returns:
        None. Side effects: applies one gradient update to the transformer's
        trainable variables and accumulates into ``train_loss`` and
        ``train_accuracy``.
    """
    # Shift the target by one position (teacher forcing).
    decoder_input, expected = tar[:, :-1], tar[:, 1:]
    enc_mask, combined, dec_mask = create_masks(inp, decoder_input)

    with tf.GradientTape() as tape:
        # Third positional argument is True -- presumably the training flag.
        predictions, _ = transformer(
            inp, decoder_input, True, enc_mask, combined, dec_mask
        )
        loss = loss_function(expected, predictions)

    weights = transformer.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss(loss)
    train_accuracy(expected, predictions)

In [12]:
# Number of epochs between periodic checkpoint saves.
CHECKPOINT_EVERY = 5

for epoch in range(EPOCHS):
    start = time.time()

    # The metrics accumulate across batches, so clear them for each epoch.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # `gen` yields (SMILES, fingerprint) batches: the fingerprint array is the
    # model input (inp) and the SMILES array is the sequence to predict (tar).
    for (batch, (tar, inp)) in enumerate(gen):
        train_step(inp, tar)

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    # Save periodically, and always after the final epoch so the last epochs
    # are not lost when EPOCHS is not a multiple of CHECKPOINT_EVERY.
    is_last_epoch = (epoch + 1 == EPOCHS)
    if (epoch + 1) % CHECKPOINT_EVERY == 0 or is_last_epoch:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                            ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
                                                        train_loss.result(),
                                                        train_accuracy.result()))

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.3096 Accuracy 0.0125
Epoch 1 Batch 100 Loss 1.4638 Accuracy 0.1106
Epoch 1 Batch 200 Loss 1.2606 Accuracy 0.1539
Epoch 1 Batch 300 Loss 1.0910 Accuracy 0.1935
Epoch 1 Batch 400 Loss 0.9699 Accuracy 0.2244
Epoch 1 Batch 500 Loss 0.8794 Accuracy 0.2480
Epoch 1 Batch 600 Loss 0.8089 Accuracy 0.2670
Epoch 1 Batch 700 Loss 0.7517 Accuracy 0.2824
Epoch 1 Batch 800 Loss 0.7046 Accuracy 0.2954
Epoch 1 Batch 900 Loss 0.6650 Accuracy 0.3064
Epoch 1 Batch 1000 Loss 0.6313 Accuracy 0.3159
Epoch 1 Batch 1100 Loss 0.6017 Accuracy 0.3242
Epoch 1 Loss 0.6017 Accuracy 0.3242
Time taken for 1 epoch: 533.8319201469421 secs

Epoch 2 Batch 0 Loss 0.3180 Accuracy 0.4036
Epoch 2 Batch 100 Loss 0.2906 Accuracy 0.4144
Epoch 2 Batch 200 Loss 0.2826 Accuracy 0.4163
Epoch 2 Batch 300 Loss 0.2754 Accuracy 0.4188
Epoch 2 Batch 400 Loss 0.2687 Accuracy 0.4211
Epoch 2 Batch 500 Loss 0.2623 Accuracy 0.4235
Epoch 2 Batch 600 Loss 0.2568 Accuracy 0.4254
Epoch 2 Batch 700 Loss 0.2514 Accuracy 0.427