In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import chemprop

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_lo_metrics

def chemprop_prepare_df(original_data):
    result = pd.DataFrame({
        'smiles': original_data['smiles'],
        'targets': original_data['value'].astype(float)
    })
    return result

def chemprop_process_folder(input_path, output_path):
    files = ['train_1.csv', 'train_2.csv', 'train_3.csv', 'test_1.csv', 'test_2.csv', 'test_3.csv']
    for file in files:
        input_data = pd.read_csv(input_path + file)
        output_data = chemprop_prepare_df(input_data)
        output_data.to_csv(output_path + file, index=False)

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
train = pd.read_csv('../../../../data/lo/kdr/train_1.csv')
test = pd.read_csv('../../../../data/lo/kdr/test_1.csv')

train

Unnamed: 0.1,Unnamed: 0,smiles,value,cluster
0,0,C/C(=N\OC(C)C)c1ccc2c(c1)c1c3c(c4c(c1n2CC(C)C)...,7.897940,0
1,1,C/C(=N\OCC(C)C)c1ccc2[nH]c3c4c(c5c(c3c2c1)CNC5...,8.129819,0
2,2,C=CC(=O)Nc1cc2c(Nc3c(F)cc(Br)cc3F)ncnc2cc1OCC1...,6.826814,0
3,3,C=CC(=O)Nc1cc2c(Nc3cc(Cl)c(Br)cc3F)ncnc2cc1OCC...,6.376751,0
4,4,C=CC(=O)Nc1cc2c(Nc3cc(Cl)c(Cl)cc3Cl)ncnc2cc1OC...,6.102373,0
...,...,...,...,...
495,495,c1ccc(-c2ccc(Nc3nnc(-c4cccnc4CCc4ccncc4)o3)cc2...,5.579879,0
496,496,c1ccc(Nc2ncc3c(n2)-c2ccccc2SC3)cc1,5.086133,0
497,497,c1ccc(Oc2ccc(Nc3ncnc4ccccc34)cc2)cc1,5.565271,0
498,498,c1ccc2c(c1)c(-c1cncc(-c3ccsc3)c1)cn2CCN1CCOCC1,7.214670,0


In [4]:
import os
import os
import shutil

temp_datapath = '/tmp/chemprop/'
if os.path.exists(temp_datapath) and os.path.isdir(temp_datapath):
    shutil.rmtree(temp_datapath)
os.mkdir(temp_datapath)

input_paths = [
    '/home/simon/papers/lohi/data/lo/kdr/'
]

output_paths = [
    temp_datapath
]

for i in range(len(input_paths)):
    chemprop_process_folder(input_paths[i], output_paths[i])

In [7]:
base_arguments = [
    '--dataset_type', 'regression', 
    '--data_path', temp_datapath + 'train_1.csv',
    '--separate_val_path', temp_datapath + 'test_1.csv',
    '--separate_test_path', temp_datapath + 'test_1.csv',
    '--save_preds',
    '--metric', 'mae',
    '--epochs', '50',
    '--gpu', '0',
    '--save_dir', temp_datapath + 'checkpoint',
    '--features_generator', 'rdkit_2d_normalized',
    '--no_features_scaling',
]

In [8]:
param_dict = {
    '--depth': ['3', '4', '5', '6'],
    '--dropout': ['0.0', '0.2', '0.3', '0.5', '0.7'],
    '--ffn_hidden_size': ['600', '1200', '2400', '3600'],
    '--ffn_num_layers': ['1', '2', '3'],
    '--hidden_size': ['600', '1200', '2400', '3600']
}

In [9]:
from copy import copy
from tqdm import tqdm
from sklearn.model_selection import ParameterSampler

hyperparam_list = list(ParameterSampler(param_dict, n_iter=20))

best_score = 0.0
best_hyperparams = None

for i, hyperparam in tqdm(enumerate(hyperparam_list)):
    print('HYPERPARAMETER', i)
    print(hyperparam)
    result_arguments = copy(base_arguments)
    for key, value in hyperparam.items():
        result_arguments.append(key)
        result_arguments.append(value)
    # Train
    args = chemprop.args.TrainArgs().parse_args(result_arguments)
    mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

    test_preds = pd.read_csv(temp_datapath + 'checkpoint/test_preds.csv')
    metrics = get_lo_metrics(test, test_preds['targets'].to_list())

    score = metrics['spearman']
    if score > best_score:
        print('New best spearman:', score)
        best_score = score
        best_hyperparams = hyperparam

0it [00:00, ?it/s]Command line
python /home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/ipykernel_launcher.py --ip=127.0.0.1 --stdin=9013 --control=9011 --hb=9010 --Session.signature_scheme="hmac-sha256" --Session.key=b"77dd52d3-097b-4972-86c5-fa173d3caa02" --shell=9012 --transport="tcp" --iopub=9014 --f=/home/simon/.local/share/jupyter/runtime/kernel-v2-18626mugnYUVx5iHO.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache

HYPERPARAMETER 0
{'--hidden_size': '1200', '--ffn_num_layers': '2', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 369672.48it/s]
100%|██████████| 500/500 [00:12<00:00, 39.52it/s]
100%|██████████| 500/500 [00:00<00:00, 146982.90it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 217272.50it/s]
100%|██████████| 437/437 [00:12<00:00, 35.32it/s]
100%|██████████| 437/437 [00:00<00:00, 253760.33it/s]
437it [00:00, 400811.47it/s]
100%|██████████| 437/437 [00:10<00:00, 41.68it/s]
100%|██████████| 437/437 [00:00<00:00, 245665.57it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

New best spearman: 0.06510988006225447
HYPERPARAMETER 1
{'--hidden_size': '1200', '--ffn_num_layers': '3', '--ffn_hidden_size': '3600', '--dropout': '0.7', '--depth': '5'}


500it [00:00, 279657.55it/s]
100%|██████████| 500/500 [00:17<00:00, 28.19it/s]
100%|██████████| 500/500 [00:00<00:00, 223006.38it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 361663.55it/s]
100%|██████████| 437/437 [00:12<00:00, 35.09it/s]
100%|██████████| 437/437 [00:00<00:00, 241872.64it/s]
437it [00:00, 278769.71it/s]
100%|██████████| 437/437 [00:10<00:00, 40.55it/s]
100%|██████████| 437/437 [00:00<00:00, 252293.30it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.7, inplace

New best spearman: 0.1252402676175033
HYPERPARAMETER 2
{'--hidden_size': '2400', '--ffn_num_layers': '1', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 270181.91it/s]
100%|██████████| 500/500 [00:15<00:00, 32.27it/s]
100%|██████████| 500/500 [00:00<00:00, 236458.68it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 403635.95it/s]
100%|██████████| 437/437 [00:13<00:00, 33.08it/s]
100%|██████████| 437/437 [00:00<00:00, 112635.09it/s]
437it [00:00, 273527.96it/s]
100%|██████████| 437/437 [00:11<00:00, 38.60it/s]
100%|██████████| 437/437 [00:00<00:00, 237177.90it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

HYPERPARAMETER 3
{'--hidden_size': '2400', '--ffn_num_layers': '1', '--ffn_hidden_size': '1200', '--dropout': '0.7', '--depth': '3'}


500it [00:00, 232939.24it/s]
100%|██████████| 500/500 [00:15<00:00, 31.41it/s]
100%|██████████| 500/500 [00:00<00:00, 201726.82it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 268755.26it/s]
100%|██████████| 437/437 [00:13<00:00, 31.42it/s]
100%|██████████| 437/437 [00:00<00:00, 198797.27it/s]
437it [00:00, 167236.39it/s]
100%|██████████| 437/437 [00:14<00:00, 31.18it/s]
100%|██████████| 437/437 [00:00<00:00, 208119.77it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.7, inplace

New best spearman: 0.1277383091015159
HYPERPARAMETER 4
{'--hidden_size': '2400', '--ffn_num_layers': '3', '--ffn_hidden_size': '1200', '--dropout': '0.0', '--depth': '6'}


500it [00:00, 285016.58it/s]
100%|██████████| 500/500 [00:14<00:00, 33.55it/s]
100%|██████████| 500/500 [00:00<00:00, 217051.54it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 364976.27it/s]
100%|██████████| 437/437 [00:14<00:00, 30.18it/s]
100%|██████████| 437/437 [00:00<00:00, 224676.50it/s]
437it [00:00, 367832.80it/s]
100%|██████████| 437/437 [00:11<00:00, 36.88it/s]
100%|██████████| 437/437 [00:00<00:00, 224428.90it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace

HYPERPARAMETER 5
{'--hidden_size': '2400', '--ffn_num_layers': '3', '--ffn_hidden_size': '3600', '--dropout': '0.3', '--depth': '4'}


500it [00:00, 297764.02it/s]
100%|██████████| 500/500 [00:14<00:00, 34.09it/s]
100%|██████████| 500/500 [00:00<00:00, 221803.49it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 304622.05it/s]
100%|██████████| 437/437 [00:11<00:00, 37.50it/s]
100%|██████████| 437/437 [00:00<00:00, 221205.75it/s]
437it [00:00, 368424.29it/s]
100%|██████████| 437/437 [00:11<00:00, 37.66it/s]
100%|██████████| 437/437 [00:00<00:00, 224346.49it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

HYPERPARAMETER 6
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '3600', '--dropout': '0.7', '--depth': '6'}


500it [00:00, 271440.85it/s]
100%|██████████| 500/500 [00:16<00:00, 29.46it/s]
100%|██████████| 500/500 [00:00<00:00, 191520.73it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 312303.77it/s]
100%|██████████| 437/437 [00:12<00:00, 34.30it/s]
100%|██████████| 437/437 [00:00<00:00, 226593.01it/s]
437it [00:00, 350327.00it/s]
100%|██████████| 437/437 [00:11<00:00, 37.81it/s]
100%|██████████| 437/437 [00:00<00:00, 226425.06it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.7, inplace

HYPERPARAMETER 7
{'--hidden_size': '1200', '--ffn_num_layers': '2', '--ffn_hidden_size': '3600', '--dropout': '0.3', '--depth': '4'}


500it [00:00, 304641.49it/s]
100%|██████████| 500/500 [00:15<00:00, 32.96it/s]
100%|██████████| 500/500 [00:00<00:00, 217795.41it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 361592.20it/s]
100%|██████████| 437/437 [00:12<00:00, 36.25it/s]
100%|██████████| 437/437 [00:00<00:00, 194473.30it/s]
437it [00:00, 257070.25it/s]
100%|██████████| 437/437 [00:13<00:00, 31.56it/s]
100%|██████████| 437/437 [00:00<00:00, 206525.17it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

HYPERPARAMETER 8
{'--hidden_size': '2400', '--ffn_num_layers': '3', '--ffn_hidden_size': '1200', '--dropout': '0.5', '--depth': '3'}


500it [00:00, 253156.93it/s]
100%|██████████| 500/500 [00:14<00:00, 34.48it/s]
100%|██████████| 500/500 [00:00<00:00, 216580.81it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 354185.67it/s]
100%|██████████| 437/437 [00:11<00:00, 37.66it/s]
100%|██████████| 437/437 [00:00<00:00, 222656.81it/s]
437it [00:00, 359535.28it/s]
100%|██████████| 437/437 [00:11<00:00, 37.89it/s]
100%|██████████| 437/437 [00:00<00:00, 225200.99it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.5, inplace

HYPERPARAMETER 9
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 251095.79it/s]
100%|██████████| 500/500 [00:16<00:00, 30.46it/s]
100%|██████████| 500/500 [00:00<00:00, 108959.94it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 136580.54it/s]
100%|██████████| 437/437 [00:12<00:00, 34.45it/s]
100%|██████████| 437/437 [00:00<00:00, 222792.13it/s]
437it [00:00, 338988.51it/s]
100%|██████████| 437/437 [00:11<00:00, 36.97it/s]
100%|██████████| 437/437 [00:00<00:00, 236078.16it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

New best spearman: 0.19161837152622288
HYPERPARAMETER 10
{'--hidden_size': '600', '--ffn_num_layers': '3', '--ffn_hidden_size': '1200', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 307951.84it/s]
100%|██████████| 500/500 [00:15<00:00, 33.17it/s]
100%|██████████| 500/500 [00:00<00:00, 221055.34it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 356181.66it/s]
100%|██████████| 437/437 [00:12<00:00, 34.71it/s]
100%|██████████| 437/437 [00:00<00:00, 129864.73it/s]
437it [00:00, 163960.18it/s]
100%|██████████| 437/437 [00:12<00:00, 35.73it/s]
100%|██████████| 437/437 [00:00<00:00, 223744.00it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace=Fals

HYPERPARAMETER 11
{'--hidden_size': '600', '--ffn_num_layers': '3', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 169507.92it/s]
100%|██████████| 500/500 [00:16<00:00, 30.64it/s]
100%|██████████| 500/500 [00:00<00:00, 193839.73it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 321845.63it/s]
100%|██████████| 437/437 [00:13<00:00, 33.06it/s]
100%|██████████| 437/437 [00:00<00:00, 210800.56it/s]
437it [00:00, 345311.01it/s]
100%|██████████| 437/437 [00:12<00:00, 34.61it/s]
100%|██████████| 437/437 [00:00<00:00, 211628.09it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace=Fals

HYPERPARAMETER 12
{'--hidden_size': '3600', '--ffn_num_layers': '2', '--ffn_hidden_size': '2400', '--dropout': '0.0', '--depth': '5'}


500it [00:00, 258939.62it/s]
100%|██████████| 500/500 [00:15<00:00, 32.86it/s]
100%|██████████| 500/500 [00:00<00:00, 211769.36it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 331808.63it/s]
100%|██████████| 437/437 [00:12<00:00, 34.41it/s]
100%|██████████| 437/437 [00:00<00:00, 198646.46it/s]
437it [00:00, 253409.49it/s]
100%|██████████| 437/437 [00:13<00:00, 32.18it/s]
100%|██████████| 437/437 [00:00<00:00, 203679.39it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace

HYPERPARAMETER 13
{'--hidden_size': '1200', '--ffn_num_layers': '2', '--ffn_hidden_size': '3600', '--dropout': '0.0', '--depth': '3'}


500it [00:00, 227038.22it/s]
100%|██████████| 500/500 [00:17<00:00, 27.95it/s]
100%|██████████| 500/500 [00:00<00:00, 172989.52it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 296299.85it/s]
100%|██████████| 437/437 [00:14<00:00, 29.69it/s]
100%|██████████| 437/437 [00:00<00:00, 215763.49it/s]
437it [00:00, 3489.66it/s]
100%|██████████| 437/437 [00:14<00:00, 31.16it/s]
100%|██████████| 437/437 [00:00<00:00, 207155.39it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.0, inplace=F

HYPERPARAMETER 14
{'--hidden_size': '1200', '--ffn_num_layers': '1', '--ffn_hidden_size': '3600', '--dropout': '0.5', '--depth': '4'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 182615.12it/s]
100%|██████████| 500/500 [00:17<00:00, 29.06it/s]
100%|██████████| 500/500 [00:00<00:00, 111219.35it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 291818.32it/s]
100%|██████████| 437/437 [00:15<00:00, 28.29it/s]
100%|██████████| 437/437 [00:00<00:00, 125593.45it/s]
437it [00:00, 211481.58it/s]
100%|██████████| 437/437 [00:15<00:00, 28.44it/s]
100%|██████████| 437/437 [00:00<00:00, 208380.04it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )

HYPERPARAMETER 15
{'--hidden_size': '2400', '--ffn_num_layers': '2', '--ffn_hidden_size': '2400', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 247743.89it/s]
100%|██████████| 500/500 [00:17<00:00, 29.19it/s]
100%|██████████| 500/500 [00:00<00:00, 142896.70it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 177762.67it/s]
100%|██████████| 437/437 [00:13<00:00, 32.99it/s]
100%|██████████| 437/437 [00:00<00:00, 175515.74it/s]
437it [00:00, 289513.64it/s]
100%|██████████| 437/437 [00:13<00:00, 32.82it/s]
100%|██████████| 437/437 [00:00<00:00, 206944.89it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.3, inplace

HYPERPARAMETER 16
{'--hidden_size': '1200', '--ffn_num_layers': '1', '--ffn_hidden_size': '1200', '--dropout': '0.3', '--depth': '6'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 156620.76it/s]
100%|██████████| 500/500 [00:16<00:00, 30.31it/s]
100%|██████████| 500/500 [00:00<00:00, 194198.72it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 167480.89it/s]
100%|██████████| 437/437 [00:13<00:00, 32.75it/s]
100%|██████████| 437/437 [00:00<00:00, 213079.61it/s]
437it [00:00, 265562.28it/s]
100%|██████████| 437/437 [00:12<00:00, 34.74it/s]
100%|██████████| 437/437 [00:00<00:00, 212980.58it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )

HYPERPARAMETER 17
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '1200', '--dropout': '0.5', '--depth': '4'}


500it [00:00, 263660.05it/s]
100%|██████████| 500/500 [00:15<00:00, 32.91it/s]
100%|██████████| 500/500 [00:00<00:00, 207187.51it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 343885.71it/s]
100%|██████████| 437/437 [00:12<00:00, 34.62it/s]
100%|██████████| 437/437 [00:00<00:00, 121755.74it/s]
437it [00:00, 347142.21it/s]
100%|██████████| 437/437 [00:15<00:00, 28.45it/s]
100%|██████████| 437/437 [00:00<00:00, 203837.95it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.5, inplace

HYPERPARAMETER 18
{'--hidden_size': '600', '--ffn_num_layers': '3', '--ffn_hidden_size': '1200', '--dropout': '0.5', '--depth': '5'}


500it [00:00, 253279.23it/s]
100%|██████████| 500/500 [00:15<00:00, 32.86it/s]
100%|██████████| 500/500 [00:00<00:00, 204920.07it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 331029.59it/s]
100%|██████████| 437/437 [00:13<00:00, 33.10it/s]
100%|██████████| 437/437 [00:00<00:00, 192108.88it/s]
437it [00:00, 246590.99it/s]
100%|██████████| 437/437 [00:13<00:00, 33.56it/s]
100%|██████████| 437/437 [00:00<00:00, 208854.93it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.5, inplace=Fals

HYPERPARAMETER 19
{'--hidden_size': '600', '--ffn_num_layers': '1', '--ffn_hidden_size': '600', '--dropout': '0.5', '--depth': '6'}


500it [00:00, 278802.45it/s]
100%|██████████| 500/500 [00:15<00:00, 33.02it/s]
100%|██████████| 500/500 [00:00<00:00, 131846.60it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
437it [00:00, 179538.73it/s]
100%|██████████| 437/437 [00:13<00:00, 33.33it/s]
100%|██████████| 437/437 [00:00<00:00, 126051.22it/s]
437it [00:00, 333014.33it/s]
100%|██████████| 437/437 [00:13<00:00, 33.05it/s]
100%|██████████| 437/437 [00:00<00:00, 205023.58it/s]
Total size = 500 | train size = 500 | val size = 437 | test size = 437
Fitting scaler
Building model 0
MoleculeModel(
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  )
  (readout): Sequential(
    (0): Dropout(p=0.5, inplace=Fals

In [10]:
print(best_hyperparams)

{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '6'}


In [11]:
print(best_score)

0.19161837152622288


In [12]:
from copy import copy

result_arguments = copy(base_arguments)
result_arguments.extend([
    '--hidden_size', '3600',
    '--ffn_num_layers', '1',
    '--ffn_hidden_size', '600',
    '--dropout', '0.3',
    '--depth', '6'
])

args = chemprop.args.TrainArgs().parse_args(result_arguments)
mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)


Command line
python /home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/ipykernel_launcher.py --ip=127.0.0.1 --stdin=9013 --control=9011 --hb=9010 --Session.signature_scheme="hmac-sha256" --Session.key=b"77dd52d3-097b-4972-86c5-fa173d3caa02" --shell=9012 --transport="tcp" --iopub=9014 --f=/home/simon/.local/share/jupyter/runtime/kernel-v2-18626mugnYUVx5iHO.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 

In [11]:
epoch = 45

Command line
python /home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/ipykernel_launcher.py --ip=127.0.0.1 --stdin=9013 --control=9011 --hb=9010 --Session.signature_scheme="hmac-sha256" --Session.key=b"6c604d94-190d-4da2-9c36-788708df5377" --shell=9012 --transport="tcp" --iopub=9014 --f=/home/steshin/.local/share/jupyter/runtime/kernel-v2-2822374BRuswu5zWdXq.json
Args
{'activation': 'ReLU',
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_feature_scaling': True,
 'bond_features_path': None,
 'bond_features_size': 0,
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,
 'config_path': None,
 'crossval_index_di

ValueError: Input contains NaN.

Unnamed: 0.1,Unnamed: 0,smiles,value,preds
0,383,CC(C)Oc1ccccc1N1CCN(Cc2cccc(C(=O)N3CCCCC3)c2)CC1,True,0.148691
1,386,CC(C)Oc1ccccc1N1CCN(Cc2cccc(CN3CCCCC3=O)c2)CC1,True,0.137544
2,389,CC(C)Oc1ccccc1N1CCN(Cc2ccccc2CN2CCCCC2=O)CC1,True,0.049242
3,2695,COc1ccccc1N1CCN(CC2COCC(c3ccccc3)(c3ccccc3)O2)CC1,True,0.029785
4,2995,COc1ccccc1N1CCN(C[C@H]2OCCOC2(c2ccccc2)c2ccccc...,False,0.523535
...,...,...,...,...
1186,5766,O=S(=O)(c1ccccc1)c1cnc2c(N3CCNCC3)cccc2c1,True,0.073102
1187,6025,O[C@H]1[C@@H](NC[C@@H]2COc3ccccc3O2)CC[C@@H]1O...,False,0.044055
1188,6018,O[C@@H]1[C@H](NC[C@@H]2COc3ccccc3O2)CC[C@H]1Oc...,False,0.737260
1189,6019,O[C@@H]1[C@H](NC[C@H]2COc3ccccc3O2)CC[C@H]1Oc1...,False,0.058900


Unnamed: 0.1,Unnamed: 0,smiles,value
0,383,CC(C)Oc1ccccc1N1CCN(Cc2cccc(C(=O)N3CCCCC3)c2)CC1,True
1,386,CC(C)Oc1ccccc1N1CCN(Cc2cccc(CN3CCCCC3=O)c2)CC1,True
2,389,CC(C)Oc1ccccc1N1CCN(Cc2ccccc2CN2CCCCC2=O)CC1,True
3,2695,COc1ccccc1N1CCN(CC2COCC(c3ccccc3)(c3ccccc3)O2)CC1,True
4,2995,COc1ccccc1N1CCN(C[C@H]2OCCOC2(c2ccccc2)c2ccccc...,False
...,...,...,...
1186,5766,O=S(=O)(c1ccccc1)c1cnc2c(N3CCNCC3)cccc2c1,True
1187,6025,O[C@H]1[C@@H](NC[C@@H]2COc3ccccc3O2)CC[C@@H]1O...,False
1188,6018,O[C@@H]1[C@H](NC[C@@H]2COc3ccccc3O2)CC[C@H]1Oc...,False
1189,6019,O[C@@H]1[C@H](NC[C@H]2COc3ccccc3O2)CC[C@H]1Oc1...,False
