# Molecule Generation

In this tutorial, we will go through how to train a sequence VAE model for generating molecules in the format of SMILES sequences. In particular, we will demonstrate how to train a VAE model and how to sample generated molecules from a pre-trained model.

## Sequence VAE

![title](seq_VAE.png)

## Part I: Train a seq-VAE

### Load the data

In [8]:
# Extend the module search path so that modules living outside the notebook
# directory can be imported.
import sys
import os
# Location of the seq_VAE application, relative to the working directory.
seq_VAE_path = '../apps/molecular_generation/seq_VAE/'
# Put the parent of the current working directory at the front of the search path.
sys.path.insert(0, os.getcwd() + "/..")
# Also search the seq_VAE application directory (appended, so it is consulted last).
sys.path.append(seq_VAE_path)
# NOTE(review): later cells use load_zinc_dataset, OneHotVocab, StringDataset,
# paddle and np without importing them explicitly, so they presumably come
# into scope through this star import -- confirm against utils.
from utils import *

In [9]:
# Location of the training CSV inside the seq_VAE application directory.
data_path = seq_VAE_path + 'data/zinc_moses/train.csv'
# Load the dataset and keep only the first 1000 entries as toy data.
train_data = load_zinc_dataset(data_path)[:1000]

In [10]:
# Show how many entries the toy training set contains.
len(train_data)

1000

In [11]:
# Preview the first ten entries of the training set.
train_data[:10]

['CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1',
 'CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1',
 'Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO',
 'Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C',
 'CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O',
 'CCOC(=O)c1cncn1C1CCCc2ccccc21',
 'COc1ccccc1OC(=O)Oc1ccccc1OC',
 'O=C1Nc2ccc(Cl)cc2C(c2ccccc2Cl)=NC1O',
 'CN1C(=O)C(O)N=C(c2ccccc2Cl)c2cc(Cl)ccc21',
 'CCC(=O)c1ccc(OCC(O)CO)c(OC)c1']

### Define the vocabulary

In [12]:
# Define the vocabulary based on the dataset.
vocab = OneHotVocab.from_data(train_data)

### Model Configuration Settings

The network is set up according to `model_config`.

In [2]:
# Configuration dictionary describing the network.
model_config = {
    # max length of sequence
    "max_length": 80,

    # encoder RNN cell
    "q_cell": "gru",
    # whether the encoder is bidirectional
    "q_bidir": 1,
    # hidden size of encoder
    "q_d_h": 256,
    # number of layers of encoder RNN
    "q_n_layers": 1,
    # encoder dropout rate
    "q_dropout": 0.5,

    # decoder RNN cell
    "d_cell": "gru",
    # number of decoder layers
    "d_n_layers": 3,
    # decoder dropout rate
    "d_dropout": 0.2,
    # latent space size
    "d_z": 128,
    # hidden size of decoder
    "d_d_h": 512,

    # whether to freeze the embeddings
    "freeze_embeddings": 0,
}

### Define the model

In [14]:
# VAE is the sequence VAE class provided by the pahelix model zoo.
from pahelix.model_zoo.seq_vae_model  import VAE
# Build the model from the vocabulary and the configuration dictionary.
model = VAE(vocab, model_config)

[GLOBAL] POLAR_ANGLE_NUM:10


### Train the model

In [15]:
# Training hyper-parameters.
batch_size = 64
learning_rate = 0.001
n_epoch = 2
# Multiplier applied to the KL term when the total loss is formed.
kl_weight = 0.1

# Adam optimizer over all parameters of the model.
optimizer = paddle.optimizer.Adam(
    learning_rate=learning_rate,
    parameters=model.parameters(),
)

# Wrap the training strings in a dataset and a shuffling data loader.
max_length = model_config["max_length"]
train_dataset = StringDataset(vocab, train_data, max_length)
train_dataloader = paddle.io.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

In [16]:
# Training loop: n_epoch passes over the data loader, printing the running
# mean of the losses after every batch and a summary after every epoch.
for epoch in range(n_epoch):
    print('#######################')
    # Per-batch loss values collected during the current epoch.
    kl_loss_values, recon_loss_values, loss_values = [], [], []

    for batch_id, data in enumerate(train_dataloader()):
        # read batch data
        data_batch = data

        # forward: the model returns the KL loss and the reconstruction loss
        kl_loss, recon_loss = model(data_batch)
        loss = kl_weight * kl_loss + recon_loss

        # backward, parameter update, then reset the gradients
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # record this batch's values
        kl_loss_values.append(kl_loss.numpy())
        recon_loss_values.append(recon_loss.numpy())
        loss_values.append(loss.numpy())

        # running means over the batches seen so far in this epoch
        running_kl = float(np.mean(kl_loss_values))
        running_recon = float(np.mean(recon_loss_values))
        print('batch:%s, kl_loss:%f, recon_loss:%f' % (batch_id, running_kl, running_recon))

    # epoch summary: mean total, KL and reconstruction loss over all batches
    epoch_summary = (
        epoch,
        float(np.mean(loss_values)),
        float(np.mean(kl_loss_values)),
        float(np.mean(recon_loss_values)),
    )
    print('epoch:%d loss:%f kl_loss:%f recon_loss:%f' % epoch_summary, flush=True)

  

#######################
batch:0, kl_loss:0.334582, recon_loss:3.377764
batch:1, kl_loss:0.235228, recon_loss:3.264235
batch:2, kl_loss:0.195186, recon_loss:3.121852
batch:3, kl_loss:0.193437, recon_loss:3.026339
batch:4, kl_loss:0.200334, recon_loss:2.934668
batch:5, kl_loss:0.212199, recon_loss:2.871092
batch:6, kl_loss:0.221612, recon_loss:2.817064
batch:7, kl_loss:0.227276, recon_loss:2.761942
batch:8, kl_loss:0.229593, recon_loss:2.721181
batch:9, kl_loss:0.227553, recon_loss:2.692791
batch:10, kl_loss:0.222633, recon_loss:2.666499
batch:11, kl_loss:0.215547, recon_loss:2.642660
batch:12, kl_loss:0.206956, recon_loss:2.621526
batch:13, kl_loss:0.197874, recon_loss:2.602398
batch:14, kl_loss:0.188871, recon_loss:2.583047
batch:15, kl_loss:0.180314, recon_loss:2.564440
epoch:0 loss:2.582471 kl_loss:0.180314 recon_loss:2.564440
#######################
batch:0, kl_loss:0.045176, recon_loss:2.305443
batch:1, kl_loss:0.041685, recon_loss:2.304310
batch:2, kl_loss:0.038983, recon_loss:2.3

## Part II: Sample from prior

In [None]:
# get_all_metrics is used below without an explicit import, so it presumably
# comes from this star import -- confirm.
from pahelix.utils.metrics.molecular_generation.metrics_ import *
N_samples = 1000  # number of samples 
max_len = 80      # maximum length of samples
# Draw N_samples samples, each limited to max_len, from the model.
current_samples = model.sample(N_samples,max_len)  # get the samples from pre-trained model

# Evaluate the generated samples.
# NOTE(review): the meaning of k=[100] is not visible here -- confirm against get_all_metrics.
metrics = get_all_metrics(gen=current_samples,k=[100])  # get the evaluation from samples