In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the dimenet modules imported below) are reloaded before each cell runs,
# which picks up source edits without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import os
import logging
import string
import random
import yaml
from datetime import datetime
from tqdm import tqdm

from dimenet.model.dimenet_pp import DimeNetPP
from dimenet.model.activations import swish
from dimenet.training.trainer import Trainer
from dimenet.training.metrics import Metrics
from dimenet.training.data_container import DataContainer
from dimenet.training.data_provider import DataProvider

In [3]:
# Configure the root logger with exactly one stream handler. Any handlers that
# are already attached are dropped first, so re-running this cell does not
# duplicate every log line.
logger = logging.getLogger()
logger.handlers = []

formatter = logging.Formatter(
    fmt='%(asctime)s (%(levelname)s): %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
ch = logging.StreamHandler()
ch.setFormatter(formatter)

logger.addHandler(ch)
logger.setLevel('INFO')

# Reduce TensorFlow's own log output (C++ backend, Python logger, autograph).
# NOTE(review): TensorFlow is already imported by the time this runs; confirm
# that TF_CPP_MIN_LOG_LEVEL still takes effect when it is set after the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
tf_logger = tf.get_logger()
tf_logger.setLevel('WARN')
tf.autograph.set_verbosity(2)

In [14]:
# Read all hyperparameters from the YAML configuration file.
# Use config.yaml for DimeNet and config_pp.yaml for DimeNet++.
with open('config_pp.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

model_name = config['model_name']

# Architecture-specific sizes; any other model name is rejected immediately.
if model_name == "dimenet++":
    out_emb_size, int_emb_size, basis_emb_size = (
        config['out_emb_size'],
        config['int_emb_size'],
        config['basis_emb_size'],
    )
elif model_name == "dimenet":
    num_bilinear = config['num_bilinear']
else:
    raise ValueError(f"Unknown model name: '{model_name}'")

# Settings read for either architecture
emb_size, num_blocks = config['emb_size'], config['num_blocks']
num_spherical, num_radial = config['num_spherical'], config['num_radial']
output_init = config['output_init']
cutoff = config['cutoff']
num_before_skip, num_after_skip, num_dense_output = (
    config['num_before_skip'],
    config['num_after_skip'],
    config['num_dense_output'],
)

# Dataset and batching settings
num_train, num_valid = config['num_train'], config['num_valid']
data_seed, dataset_path = config['data_seed'], config['dataset']
batch_size = config['batch_size']
dist, ablation = config['dist'], config['ablation']

# Prediction targets. To predict a different target (e.g. ['U0']), change the
# 'targets' entry in the config -- and remember to adjust output_init as well.
targets = config['targets']

# Load the test split. The path is assembled with os.path.join rather than
# string concatenation so dataset_path works with or without a trailing
# separator.
test_file = os.path.join(dataset_path, "test.jsonl.gz")
data_container = DataContainer(test_file, target_keys=targets, dist=dist, subset=False, ablation=ablation)

# Wrap the container in a DataProvider, which serves it in batches of batch_size.
# NOTE(review): only a 'test' container is registered here; confirm how
# DataProvider uses num_train / num_valid / data_seed in that case.
data_provider = DataProvider({'test': data_container}, num_train, num_valid, batch_size,
                             seed=data_seed, dist=dist)

# Build the test dataset with prefetching, and create the iterator that the
# prediction loop consumes.
dataset = data_provider.get_dataset('test').prefetch(tf.data.experimental.AUTOTUNE)
dataset_iter = iter(dataset)

Use ablation: const_angle


  1%|          | 55/10000 [00:00<00:18, 542.04it/s]

Processing molecules


  6%|▌         | 601/10000 [00:01<00:17, 533.60it/s]RDKit ERROR: [15:27:35] Explicit valence for atom # 1 N, 4, is greater than permitted
 18%|█▊        | 1795/10000 [00:03<00:15, 531.41it/s]RDKit ERROR: [15:27:38] Explicit valence for atom # 6 N, 4, is greater than permitted
 25%|██▌       | 2505/10000 [00:04<00:13, 536.18it/s]RDKit ERROR: [15:27:39] Explicit valence for atom # 1 N, 4, is greater than permitted
 33%|███▎      | 3270/10000 [00:06<00:12, 537.16it/s]RDKit ERROR: [15:27:40] Explicit valence for atom # 6 N, 4, is greater than permitted
 90%|████████▉ | 8979/10000 [00:16<00:01, 534.38it/s]RDKit ERROR: [15:27:51] Explicit valence for atom # 1 N, 4, is greater than permitted
 90%|█████████ | 9033/10000 [00:16<00:01, 535.63it/s]RDKit ERROR: [15:27:51] Explicit valence for atom # 6 N, 4, is greater than permitted
100%|██████████| 10000/10000 [00:18<00:00, 535.48it/s]


Skipped: 6


In [15]:
# Build the network and its trainer. Only DimeNetPP is imported in this
# notebook, so every model name other than "dimenet++" is rejected here.
if model_name != "dimenet++":
    raise ValueError(f"Unknown model name: '{model_name}'")

# Constructor arguments, all taken from the values read out of the config
model_kwargs = dict(
    emb_size=emb_size,
    out_emb_size=out_emb_size,
    int_emb_size=int_emb_size,
    basis_emb_size=basis_emb_size,
    num_blocks=num_blocks,
    num_spherical=num_spherical,
    num_radial=num_radial,
    cutoff=cutoff,
    num_before_skip=num_before_skip,
    num_after_skip=num_after_skip,
    num_dense_output=num_dense_output,
    num_targets=len(targets),
    activation=swish,
    output_init=output_init,
    dist=dist,
)
model = DimeNetPP(**model_kwargs)

trainer = Trainer(model)

In [16]:
#####################################################################
# Load the trained model from your own training run
# (the checkpoint prefix is expected at <directory>/best/ckpt; adjust
# `directory` to point at your own log folder)
directory = "logs/20210518_141556_qxRCa89m_qm9_e128_oe256_ie64_be8_sbf12_rbf12_b4_nbs1_nas2_no3_cut2.5_lr1.00e-03_dec4.00e+06_U0_DimeNet++"
best_ckpt_file = os.path.join(directory, 'best', 'ckpt')
#####################################################################
# Uncomment this if you want to use a pretrained model
# directory = f"pretrained/dimenet_pp/{targets[0]}"
# best_ckpt_file = os.path.join(directory, 'ckpt')
#####################################################################

# Restore the weights into `model`. Because the call is the cell's last
# expression, the returned load-status object is displayed as the cell output.
model.load_weights(best_ckpt_file)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f191df404d0>

In [17]:
# Run the model over the whole test split, one batch at a time.
num_test = data_provider.nsamples['test']
steps_per_epoch = int(np.ceil(num_test / batch_size))

# Metric accumulator handed to the trainer, plus a buffer holding one
# prediction row per test sample and one column per target
metrics = Metrics('val', targets)
preds_total = np.zeros([num_test, len(targets)], dtype=np.float32)

for step in tqdm(range(steps_per_epoch)):
    preds = trainer.predict_on_batch(dataset_iter, metrics)

    # Rows of the buffer covered by this batch; the end is clamped to the
    # number of samples so a shorter final batch still fits.
    batch_start = step * batch_size
    batch_end = min(batch_start + batch_size, num_test)
    preds_total[batch_start:batch_end] = preds.numpy()

100%|██████████| 313/313 [00:11<00:00, 27.52it/s]


In [18]:
# Report the aggregated errors, labelled with the comma-joined target names.
target_label = ','.join(targets)
print(f"{target_label} MAE: {metrics.mean_mae}")
print(f"{target_label} logMAE: {metrics.mean_log_mae}")

U0 MAE: 0.032134927809238434
U0 logMAE: -3.437811851501465
