<a href="https://colab.research.google.com/github/RobMarx/Supramolecular_VAE/blob/master/SmVAE_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's .py files take effect without a restart.
%load_ext autoreload
%autoreload 2

In [1]:
# Fetch the project repository; the next cell changes into it before importing `vaemof`.
!git clone https://github.com/RobMarx/Supramolecular_VAE.git

Cloning into 'Supramolecular_VAE'...
remote: Enumerating objects: 538, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 538 (delta 12), reused 0 (delta 0), pack-reused 510[K
Receiving objects: 100% (538/538), 179.78 MiB | 22.01 MiB/s, done.
Resolving deltas: 100% (265/265), done.


In [2]:
# Work from the repository root so the relative paths used later (data/, results/) resolve.
%cd Supramolecular_VAE/
# Install the extra dependencies into the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install nujson
%pip install rdkit
%pip install selfies

/content/Supramolecular_VAE


In [3]:
import vaemof
from vaemof import experiments
from vaemof import utils
from vaemof.vocabs import SELFIESVocab, MOFVocab, PropVocab
from vaemof import modules
from vaemof import training
from vaemof.model import VAEMOF
from vaemof import configs
from vaemof.utils import header_str
# Project helper; presumably applies shared matplotlib/seaborn styling — confirm in vaemof.experiments.
vaemof.experiments.plot_settings()

Restored variables from data/scscore_1024uint8_model.ckpt-10654.as_numpy.json.gz


In [4]:
import os
import random
from tqdm.auto import tqdm
import numpy as np
import torch
import rdkit
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.decomposition import PCA
from itertools import product
from more_itertools import chunked
from collections import OrderedDict

# Record library versions and GPU availability next to the results.
print(f'rdkit : {rdkit.__version__}')
print(f'torch : {torch.__version__}')
print(f'cuda? {torch.cuda.is_available()}')
# Register tqdm's pandas integration (adds the progress_apply family of methods).
tqdm.pandas()
# Project helper; by its name it silences RDKit log output — confirm in vaemof.utils.
utils.disable_rdkit_log()

rdkit : 2023.03.3
torch : 2.1.0+cu118
cuda? True


## Hparams

In [5]:
# Choose the hyperparameter preset and the working directory for this run
# (config.json is read back from WORK_DIR further down).
preset = 'full'
WORK_DIR = 'results/best/'
hparams = configs.get_model_config(WORK_DIR, preset, rand=False)
print(utils.header_str(preset))
# Override the configured device with the hardware that is actually available.
hparams['train_device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

# `testing` is forwarded to the data loaders below.
# NOTE(review): presumably a flag/config for a reduced test run — confirm in vaemof.configs.
testing = configs.testing_config(hparams)
configs.print_config(hparams)
# Seed from the config, then build the torch device the model is moved to later.
utils.set_seed(hparams['train_seed'])
device = torch.device(hparams['train_device'])

== train == :
        train_device:                cuda
          train_seed:                  42
    train_batch_size:                1536
        train_epochs:                 120
            train_lr:        0.0005705023
     train_clip_grad:                  20
== vae == :
      vae_latent_dim:                 288
           vae_y_dec:                True
     vae_selfies_dec:                True
         vae_mof_enc:                True
         vae_mof_dec:                True
vae_duplicate_smiles:                True
== mof == :
        mof_encoding:                cats
   mof_weighted_loss:                True
         mof_w_start:                 0.0
           mof_w_end:                 0.1
           mof_start:                   0
    mof_const_length:                  10
== y == :
            y_labels:['lcd', 'pld', 'density', 'agsa', 'co2n2_co2_mol_kg', 'co2n2_n2_mol_kg', 'co2ch4_co2_mol_kg', 'co2ch4_ch4_mol_kg']
           y_weights:[1, 1, 1, 1, 1, 1, 1, 1]
           y_w

## Load MOF data

Assemble SMILES formula: [organic_core][metal_node][topology][branch_smiles]

In [6]:
# Column names reused by the loading, splitting and encoding cells below.
smiles_column = 'branch_smiles'
testtrain_column = 'train/test'
# Load the generator dataset from the CSV named in the config.
df = experiments.get_generator_df(csv_file=hparams['files_data'],
                                  smiles_column=smiles_column,
                                  use_duplicates=hparams['vae_duplicate_smiles'],
                                  testing=testing)
# Build the id <-> MOF lookups and the list of MOF columns for the configured encoding.
# NOTE(review): exact structure of these three return values is defined in vaemof.experiments.
ids2mofs, mof2ids, mof_columns = experiments.get_mofdict(
    df, hparams['mof_encoding'])
df.head()

df shape: (2049964, 8)
df columns: ['branch_smiles', 'id2mof', 'metal_node', 'mof_index', 'organic_core', 'randomized', 'topology', 'train/test']
Found 713 unique mofs


Unnamed: 0,branch_smiles,id2mof,metal_node,mof_index,organic_core,randomized,topology,train/test
0,CC(c1cc(O)cc(O)c1)(C(Sc1cc(O)cc(O)c1)c1cc(O)cc...,0,sym_7_mc_4,4.0,,False,acs,1
1,Oc1ccc(-c2cc(/C=C/c3c(-c4ccc(O)cc4)cc([Lr])c(-...,0,sym_7_mc_4,10.0,,False,acs,1
2,O=S(=O)(O)Cc1nn([Lr])c(CS(=O)(=O)O)c1-c1c(CS(=...,0,sym_7_mc_4,18.0,,False,acs,1
3,[Lr]c1ccc(-c2c(-c3ccccc3)cc([Lr])c(-c3ccccc3)c...,0,sym_7_mc_4,23.0,,False,acs,1
4,[Lr]c1ccc(N2C=C(c3ccncc3)N(c3c(-c4ccncc4)cc([L...,0,sym_7_mc_4,29.0,,False,acs,1


## Load property data

In [14]:
!gzip -k "/content/Supramolecular_VAE/data/MOF_properties_train.csv"

In [15]:
# Load the property dataset for the target columns listed in hparams['y_labels'].
# `mof2ids` comes from the generator data loaded above.
# NOTE(review): compute_scscore=True presumably adds the `scscore` column seen in
# the output below — confirm in vaemof.experiments.get_prop_df.
prop_df = experiments.get_prop_df(csv_file=hparams['files_prop'],
                                  targets=hparams['y_labels'],
                                  mof2ids=mof2ids,
                                  testing=testing,
                                  smiles_column=smiles_column,
                                  compute_scscore=True)
prop_df.head()

Restored variables from data/scscore_1024uint8_model.ckpt-10654.as_numpy.json.gz


  0%|          | 0/45880 [00:00<?, ?it/s]

Removed 2922 datapoints due to mask.
Removed 1819 datapoints due non-valid mof (mof2ids).


Unnamed: 0,organic_core,metal_node,topology,branch_smiles,lcd,pld,density,agsa,co2n2_co2_mol_kg,co2n2_n2_mol_kg,co2ch4_co2_mol_kg,co2ch4_ch4_mol_kg,selfies_safe,mask,train/test,scscore,id2mof
0,,sym_16_mc_6,bcs,[Lr]C#Cc1cn([Lr])nc1,6.8791,4.83575,0.798317,1654.4,5.664943,0.539723,6.021093,4.367285,True,True,1,2.119697,4
1,,sym_16_mc_6,bcs,[Lr]c1[nH]c2c(n1)cc1c(c2)[nH]c(n1)[Lr],7.36406,5.83655,0.788965,2548.75,4.964082,0.550367,5.837664,4.69857,True,True,1,3.500213,4
2,sym_4_vae_153,sym_13_mc_12,ith,[Lr]c(c1)cc2c(=O)oc3cc([Lr])cc4c3c2c1oc4=O,7.34426,4.31764,0.994306,1596.65,4.646848,0.258974,5.562671,2.091815,True,True,0,3.14529,193
3,sym_4_on_6,sym_13_mc_12,ith,[Lr]c(c1)cc2c(=O)oc3cc([Lr])cc4c3c2c1oc4=O,6.82782,4.09729,1.05576,1302.24,4.941839,0.184885,5.484785,1.537702,True,True,1,3.14529,189
4,sym_4_vae_645,sym_13_mc_12,ith,[Lr]c(c1)cc2c(=O)oc3cc([Lr])cc4c3c2c1oc4=O,7.47386,4.30549,1.02037,1599.44,4.355739,0.263613,5.373193,2.122211,True,True,1,3.14529,195


## Train/test splits and hparams

In [16]:
# Split flag per row in each frame: 1 marks training rows, 0 marks test rows.
gen_flag = df[testtrain_column]
prop_flag = prop_df[testtrain_column]

# Index labels of each split as NumPy arrays, used later with .loc.
train_index = np.array(df.index[gen_flag == 1].tolist())
test_index = np.array(df.index[gen_flag == 0].tolist())
prop_train_index = np.array(prop_df.index[prop_flag == 1].tolist())
prop_test_index = np.array(prop_df.index[prop_flag == 0].tolist())

# Report split sizes: generator frame first, property frame second.
print(f'Train sizes: {len(train_index):7d} and {len(prop_train_index):7d}')
print(f'Test  sizes: {len(test_index):7d} and {len(prop_test_index):7d}')

Train sizes: 1894967 and   37008
Test  sizes:  154997 and    4131


# Vocabulary and preprocessors

In [18]:
# The SELFIES vocabulary is built from every SMILES string in both frames.
smiles_list = df[smiles_column].tolist() + prop_df[smiles_column].tolist()
vocab = SELFIESVocab.from_data(smiles_list)

# The MOF vocabulary is built from both frames stacked row-wise.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# sort=False keeps the column order, exactly as the append call did.
vocab_mof = MOFVocab.from_data(
    pd.concat([df, prop_df], sort=False),
    mof_columns,
    weighting=hparams['mof_weighted_loss'])

# The property vocabulary is built from the target columns, their weights and the configured scaler.
vocab_y = PropVocab.from_data(
    prop_df, hparams['y_labels'], hparams['y_weights'], hparams['scaler_type'])
vocab, vocab_mof, vocab_y

  0%|          | 0/2091103 [00:00<?, ?it/s]

Alphabet size is 59
Max seq length is 109 with 5 extra padding


  vocab_mof = MOFVocab.from_data(df.append(


Used columns =['metal_node', 'organic_core', 'topology'] with frequency weighting=True
metal_node   has 15 classes
organic_core has 52 classes
topology     has 41 classes


(<vaemof.vocabs.SELFIESVocab at 0x7f842eda93c0>,
 <vaemof.vocabs.MOFVocab at 0x7f843b1311e0>,
 <vaemof.vocabs.PropVocab at 0x7f842edaa6e0>)

# Instantiate Model

## Careful! Running the next cell saves the new model and overwrites any previously saved model

In [19]:
# Release a model left over from a previous run of this cell before building a new one.
# NOTE(review): clear_torch presumably frees the model / GPU cache — confirm in vaemof.utils.
try:
    utils.clear_torch(model)
except NameError:
    # First run on a fresh kernel: `model` is not defined yet. Only this case
    # falls back, so real failures inside clear_torch are no longer hidden.
    utils.clear_torch(model=None)

# Build the model from the config and the three vocabularies, then move it to the device.
model = VAEMOF(hparams, vocab, vocab_mof, vocab_y).to(device)
# WARNING: this saves immediately and overwrites any previously saved model.
model.save()
modules.model_summary(model, include_children=False)

Unnamed: 0,Name,Module,Extra,submodule,trainable,n_params,trainable_params
0,z_mu,Linear,"in_features=576, out_features=288, bias=True",False,True,166176,166176
1,z_logvar,Linear,"in_features=576, out_features=288, bias=True",False,True,166176,166176
2,enc_x,CharEncoder,,False,True,2354020,2354020
3,dec_x,CharDecoder,,False,True,2469534,2469534
4,enc_mof,MOFEncoder,,False,True,280800,280800
5,dec_mof,MOFDecoder,,False,True,114552,114444
6,dec_y,PropDecoder,,False,True,2320,2312


Trainable params: 5550098 out of 5550214 total (100.00%)


## Prepare train/test

In [20]:
# Encode the generator frame with the model's preprocessors: training rows, then test rows.
train_mof, test_mof = (
    model.df_to_tuples(df.loc[rows], smiles_column)
    for rows in (train_index, test_index)
)
# Same encoding for the property frame, in the same train-then-test order.
prop_train, prop_test = (
    model.df_to_tuples(prop_df.loc[rows], smiles_column)
    for rows in (prop_train_index, prop_test_index)
)
# Each dataset is the generator examples followed by the property examples.
train_data = train_mof + prop_train
test_data = test_mof + prop_test

SMILES:   0%|          | 0/19 [00:00<?, ?it/s]

MOF:   0%|          | 0/190 [00:00<?, ?it/s]

MOF:   0%|          | 0/16 [00:00<?, ?it/s]



## Train

In [None]:
# Build the trainer from the hyperparameters and fit the model; test_data is passed
# alongside train_data (the stats cells below read test_* columns from the training log).
trainer = training.Trainer(hparams)
trainer.train(model, train_data, test_data)

Epochs:   0%|          | 0/120 [00:00<?, ?it/s]

Train:   0%|          | 0/1257 [00:00<?, ?it/s]

In [None]:
# Reload the hyperparameters stored as config.json in WORK_DIR and rebuild the model from them.
# This replaces the in-memory `hparams` and `model`.
hparams_file = os.path.join(WORK_DIR,'config.json')
hparams = configs.AttributeDict.from_jsonfile(hparams_file)
configs.print_config(hparams)
# NOTE(review): no explicit .to(device) here — confirm VAEMOF.load places the model on the right device.
model = VAEMOF.load(hparams)
# Re-encode the held-out rows with the reloaded model's preprocessors.
test_mof = model.df_to_tuples(df.loc[test_index], smiles_column)
prop_test = model.df_to_tuples(prop_df.loc[prop_test_index], smiles_column)
len(test_mof), len(prop_test)

## Training stats

In [None]:
# Resolve the training-log path from the config and read it.
log_path = configs.at_results_dir(hparams, 'files_log')
log_df = pd.read_csv(log_path)
# Shape on the first line, column index on the second, then a preview of the rows.
print(log_df.shape, log_df.columns, sep='\n')
log_df.head()

In [None]:
# Loss components tracked by the trainer, and the x-axis shared by every plot below.
COMPONENTS = training.COMPONENTS
epochs = log_df['epoch'].values

display(log_df.head(2))

# Learning-rate schedule.
plt.plot(epochs, log_df['lr'])
plt.title('Learning Rate')
plt.show()

# `mof_acc` column of the log, on a fixed 0-100 scale.
plt.plot(epochs, log_df['mof_acc'])
plt.title('Valid MOFs on validation set')
plt.ylim([0,100])
plt.show()

# Mean r^2 over the property targets.
plt.plot(epochs, log_df['mean_r2'],c='g',label='valid')
plt.ylabel('mean r^2')
plt.show()

# Long-form table with one row per (epoch, property) and that property's r^2.
# It is stored as `r2_df`, not `prop_df`, so the property DataFrame loaded earlier
# is not overwritten and cells that use prop_df.loc[...] can still be re-run.
results=[]
for label in hparams['y_labels']:
    metric = f'{label}-r2'
    results.extend([{'epoch':epoch,'r2':value,"prop":label} for epoch, value in log_df[['epoch',metric]].values ]  )
r2_df = pd.DataFrame(results)
sns.lineplot(x='epoch',y='r2', data=r2_df)
plt.show()

# The `λ_<component>` column of each loss component, all on one figure.
for label in COMPONENTS:
    plt.plot(epochs, log_df[f'λ_{label}'], label =label)
plt.legend()
plt.show()

# Train vs. test loss, one log-scale figure per component.
for label in COMPONENTS:
    plt.plot(epochs, log_df[f'train_{label}'], label='Train')
    plt.plot(epochs, log_df[f'test_{label}'], label='Test')
    plt.title(f'Loss {label}')
    plt.yscale("log")
    plt.legend()
    plt.show()

# The `<prefix>_<component>_ratio` columns, one figure per split.
for prefix in ['train','test']:
    for label in COMPONENTS:
        plt.plot(epochs, log_df[f'{prefix}_{label}_ratio'], label=label)
    plt.title(f'Loss Ratios ({prefix})')
    plt.legend()
    plt.show()