In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import chemprop

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_hi_metrics

def chemprop_prepare_df(original_data):
    result = pd.DataFrame({
        'smiles': original_data['smiles'],
        'targets': original_data['value'].astype(float)
    })
    return result

def chemprop_process_folder(input_path, output_path):
    files = ['train_1.csv', 'train_2.csv', 'train_3.csv', 'test_1.csv', 'test_2.csv', 'test_3.csv']
    for file in files:
        input_data = pd.read_csv(input_path + file)
        output_data = chemprop_prepare_df(input_data)
        output_data.to_csv(output_path + file, index=False)

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
train = pd.read_csv('../../../../data/hi/kdr/train_1.csv')
test = pd.read_csv('../../../../data/hi/kdr/test_1.csv')

train

Unnamed: 0.1,Unnamed: 0,smiles,value
0,0,Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,True
1,1064,CCc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,False
2,1065,CCc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1,False
3,3722,COc1ccccc1-c1nc2ccc(Nc3ccnc4ccccc34)cc2[nH]1,False
4,4159,Cc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,False
...,...,...,...
495,454,CC(C)S(=O)(=O)c1ccccc1Nc1nc(Nc2cccc(NC(=O)CN)c...,False
496,1850,COC(=O)c1cn2ncnc(Oc3ccc4[nH]c(C)cc4c3F)c2c1C,True
497,4120,Cc1cc2c(F)c(Oc3ncnn4cc(OCCCNS(C)(=O)=O)c(C)c34...,True
498,2979,COc1cc2c(Oc3ccc(N/C=C4\C(=O)NC(=O)N(c5ccc(C)cc...,True


In [4]:
import os
import os
import shutil

temp_datapath = '/tmp/chemprop/'
if os.path.exists(temp_datapath) and os.path.isdir(temp_datapath):
    shutil.rmtree(temp_datapath)
os.mkdir(temp_datapath)

input_paths = [
    '/home/simon/papers/lohi/data/hi/kdr/'
]

output_paths = [
    temp_datapath
]

for i in range(len(input_paths)):
    chemprop_process_folder(input_paths[i], output_paths[i])

In [5]:
base_arguments = [
    '--dataset_type', 'classification', 
    '--data_path', temp_datapath + 'train_1.csv',
    '--separate_val_path', temp_datapath + 'test_1.csv',
    '--separate_test_path', temp_datapath + 'test_1.csv',
    '--save_preds',
    '--metric', 'prc-auc',
    '--epochs', '100',
    '--gpu', '0',
    '--save_dir', temp_datapath + 'checkpoint',
    '--features_generator', 'rdkit_2d_normalized',
    '--no_features_scaling'
]

In [6]:
param_dict = {
    '--depth': ['3', '4', '5', '6'],
    '--dropout': ['0.0', '0.2', '0.3', '0.5', '0.7'],
    '--ffn_hidden_size': ['600', '1200', '2400', '3600'],
    '--ffn_num_layers': ['1', '2', '3'],
    '--hidden_size': ['600', '1200', '2400', '3600']
}

In [7]:
from copy import copy
from tqdm import tqdm
from sklearn.model_selection import ParameterSampler

hyperparam_list = list(ParameterSampler(param_dict, n_iter=20))

best_score = 0.0
best_hyperparams = None

for i, hyperparam in tqdm(enumerate(hyperparam_list)):
    print('HYPERPARAMETER', i)
    print(hyperparam)
    result_arguments = copy(base_arguments)
    for key, value in hyperparam.items():
        result_arguments.append(key)
        result_arguments.append(value)
    # Train
    args = chemprop.args.TrainArgs().parse_args(result_arguments)
    mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

    test_preds = pd.read_csv(temp_datapath + 'checkpoint/test_preds.csv')
    metrics = get_hi_metrics(test, test_preds['targets'].to_list())

    score = metrics['prc_auc']
    if score > best_score:
        print('New best PRC AUC:', score)
        best_score = score
        best_hyperparams = hyperparam

0it [00:00, ?it/s]Command line
python /home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/ipykernel_launcher.py --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"d5b0fee2-9f68-4872-9f25-d0fd39dd4883" --shell=9002 --transport="tcp" --iopub=9004 --f=/home/simon/.local/share/jupyter/runtime/kernel-v2-48783Ee3ZnGwS7bEj.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache

HYPERPARAMETER 0
{'--hidden_size': '2400', '--ffn_num_layers': '2', '--ffn_hidden_size': '1200', '--dropout': '0.3', '--depth': '5'}


500it [00:00, 417094.67it/s]
100%|██████████| 500/500 [00:12<00:00, 40.78it/s]
100%|██████████| 500/500 [00:00<00:00, 226645.63it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 478471.58it/s]
100%|██████████| 3116/3116 [01:12<00:00, 43.04it/s]
100%|██████████| 3116/3116 [00:00<00:00, 260300.97it/s]
3116it [00:00, 483802.89it/s]
100%|██████████| 3116/3116 [01:12<00:00, 42.78it/s]
100%|██████████| 3116/3116 [00:00<00:00, 252535.14it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
   

New best PRC AUC: 0.702322029260872
HYPERPARAMETER 1
{'--hidden_size': '2400', '--ffn_num_layers': '2', '--ffn_hidden_size': '1200', '--dropout': '0.7', '--depth': '5'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 280367.91it/s]
100%|██████████| 500/500 [00:17<00:00, 28.60it/s]
100%|██████████| 500/500 [00:00<00:00, 146183.74it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 253539.44it/s]
100%|██████████| 3116/3116 [01:22<00:00, 37.93it/s]
100%|██████████| 3116/3116 [00:00<00:00, 266033.98it/s]
3116it [00:00, 471685.12it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.38it/s]
100%|██████████| 3116/3116 [00:00<00:00, 268653.41it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o):

New best PRC AUC: 0.7028840257350342
HYPERPARAMETER 2
{'--hidden_size': '3600', '--ffn_num_layers': '3', '--ffn_hidden_size': '2400', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 310321.40it/s]
100%|██████████| 500/500 [00:11<00:00, 42.86it/s]
100%|██████████| 500/500 [00:00<00:00, 249008.79it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 474924.64it/s]
100%|██████████| 3116/3116 [01:07<00:00, 45.87it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257080.36it/s]
3116it [00:00, 479700.91it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.39it/s]
100%|██████████| 3116/3116 [00:00<00:00, 256837.86it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
   

HYPERPARAMETER 3
{'--hidden_size': '3600', '--ffn_num_layers': '2', '--ffn_hidden_size': '1200', '--dropout': '0.7', '--depth': '3'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 175303.18it/s]
100%|██████████| 500/500 [00:18<00:00, 27.11it/s]
100%|██████████| 500/500 [00:00<00:00, 233327.99it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 449353.66it/s]
100%|██████████| 3116/3116 [01:08<00:00, 45.74it/s]
100%|██████████| 3116/3116 [00:00<00:00, 246989.54it/s]
3116it [00:00, 517745.56it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.43it/s]
100%|██████████| 3116/3116 [00:00<00:00, 262275.52it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o):

New best PRC AUC: 0.704074823348878
HYPERPARAMETER 4
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '2400', '--dropout': '0.7', '--depth': '6'}


500it [00:00, 320959.90it/s]
100%|██████████| 500/500 [00:11<00:00, 41.82it/s]
100%|██████████| 500/500 [00:00<00:00, 234764.58it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 446635.61it/s]
100%|██████████| 3116/3116 [01:09<00:00, 44.81it/s]
100%|██████████| 3116/3116 [00:00<00:00, 260987.11it/s]
3116it [00:00, 504495.15it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.25it/s]
100%|██████████| 3116/3116 [00:00<00:00, 269517.68it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
   

HYPERPARAMETER 5
{'--hidden_size': '2400', '--ffn_num_layers': '1', '--ffn_hidden_size': '3600', '--dropout': '0.7', '--depth': '3'}


500it [00:00, 294667.98it/s]
100%|██████████| 500/500 [00:13<00:00, 36.35it/s]
100%|██████████| 500/500 [00:00<00:00, 167879.60it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 305875.57it/s]
100%|██████████| 3116/3116 [01:14<00:00, 41.89it/s]
100%|██████████| 3116/3116 [00:00<00:00, 258172.20it/s]
3116it [00:00, 464542.95it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.28it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257927.64it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
   

HYPERPARAMETER 6
{'--hidden_size': '3600', '--ffn_num_layers': '2', '--ffn_hidden_size': '600', '--dropout': '0.5', '--depth': '3'}


500it [00:00, 292041.78it/s]
100%|██████████| 500/500 [00:11<00:00, 43.93it/s]
100%|██████████| 500/500 [00:00<00:00, 239510.28it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 15202.27it/s]
100%|██████████| 3116/3116 [01:07<00:00, 46.40it/s]
100%|██████████| 3116/3116 [00:00<00:00, 259974.76it/s]
3116it [00:00, 463176.50it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.38it/s]
100%|██████████| 3116/3116 [00:00<00:00, 262570.59it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    

HYPERPARAMETER 7
{'--hidden_size': '600', '--ffn_num_layers': '1', '--ffn_hidden_size': '1200', '--dropout': '0.2', '--depth': '3'}


500it [00:00, 322440.34it/s]
100%|██████████| 500/500 [00:11<00:00, 42.59it/s]
100%|██████████| 500/500 [00:00<00:00, 239373.59it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 450158.48it/s]
100%|██████████| 3116/3116 [01:08<00:00, 45.57it/s]
100%|██████████| 3116/3116 [00:00<00:00, 260114.46it/s]
3116it [00:00, 500802.82it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.26it/s]
100%|██████████| 3116/3116 [00:00<00:00, 262496.76it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.2, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  

HYPERPARAMETER 8
{'--hidden_size': '3600', '--ffn_num_layers': '2', '--ffn_hidden_size': '2400', '--dropout': '0.3', '--depth': '6'}


500it [00:00, 323086.12it/s]
100%|██████████| 500/500 [00:10<00:00, 46.11it/s]
100%|██████████| 500/500 [00:00<00:00, 243883.24it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 447002.23it/s]
100%|██████████| 3116/3116 [01:06<00:00, 46.89it/s]
100%|██████████| 3116/3116 [00:00<00:00, 261582.60it/s]
3116it [00:00, 509331.69it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.36it/s]
100%|██████████| 3116/3116 [00:00<00:00, 260404.70it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
   

HYPERPARAMETER 9
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '3600', '--dropout': '0.0', '--depth': '6'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 171070.40it/s]
100%|██████████| 500/500 [00:17<00:00, 28.09it/s]
100%|██████████| 500/500 [00:00<00:00, 233068.68it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 445965.03it/s]
100%|██████████| 3116/3116 [01:08<00:00, 45.81it/s]
100%|██████████| 3116/3116 [00:00<00:00, 265089.68it/s]
3116it [00:00, 462373.57it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.31it/s]
100%|██████████| 3116/3116 [00:00<00:00, 269267.80it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o):

HYPERPARAMETER 10
{'--hidden_size': '600', '--ffn_num_layers': '3', '--ffn_hidden_size': '600', '--dropout': '0.7', '--depth': '6'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 167812.43it/s]
100%|██████████| 500/500 [00:15<00:00, 31.54it/s]
100%|██████████| 500/500 [00:00<00:00, 236672.16it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 438793.06it/s]
100%|██████████| 3116/3116 [01:08<00:00, 45.64it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257297.99it/s]
3116it [00:00, 14936.45it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.26it/s]
100%|██████████| 3116/3116 [00:00<00:00, 258760.02it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Lin

HYPERPARAMETER 11
{'--hidden_size': '2400', '--ffn_num_layers': '3', '--ffn_hidden_size': '1200', '--dropout': '0.2', '--depth': '4'}


500it [00:00, 304730.02it/s]
100%|██████████| 500/500 [00:10<00:00, 45.66it/s]
100%|██████████| 500/500 [00:00<00:00, 247218.20it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 489455.89it/s]
100%|██████████| 3116/3116 [01:06<00:00, 46.81it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257571.81it/s]
3116it [00:00, 513030.47it/s]
100%|██████████| 3116/3116 [01:06<00:00, 47.17it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257668.30it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.2, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
   

HYPERPARAMETER 12
{'--hidden_size': '2400', '--ffn_num_layers': '3', '--ffn_hidden_size': '600', '--dropout': '0.0', '--depth': '3'}


500it [00:00, 295665.02it/s]
100%|██████████| 500/500 [00:11<00:00, 41.87it/s]
100%|██████████| 500/500 [00:00<00:00, 228796.86it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 455762.70it/s]
100%|██████████| 3116/3116 [01:07<00:00, 46.12it/s]
100%|██████████| 3116/3116 [00:00<00:00, 256163.29it/s]
3116it [00:00, 477335.69it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.22it/s]
100%|██████████| 3116/3116 [00:00<00:00, 264537.02it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o): Linear(in_features=2533, out_features=2400, bias=True)
      )
   

HYPERPARAMETER 13
{'--hidden_size': '1200', '--ffn_num_layers': '3', '--ffn_hidden_size': '2400', '--dropout': '0.5', '--depth': '3'}


500it [00:00, 295373.52it/s]
100%|██████████| 500/500 [00:11<00:00, 42.95it/s]
100%|██████████| 500/500 [00:00<00:00, 241468.28it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 459787.20it/s]
100%|██████████| 3116/3116 [01:07<00:00, 46.27it/s]
100%|██████████| 3116/3116 [00:00<00:00, 250252.78it/s]
3116it [00:00, 508400.48it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.29it/s]
100%|██████████| 3116/3116 [00:00<00:00, 266767.05it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.5, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o): Linear(in_features=1333, out_features=1200, bias=True)
      )
   

HYPERPARAMETER 14
{'--hidden_size': '3600', '--ffn_num_layers': '1', '--ffn_hidden_size': '2400', '--dropout': '0.0', '--depth': '4'}


500it [00:00, 322837.44it/s]
100%|██████████| 500/500 [00:11<00:00, 45.25it/s]
100%|██████████| 500/500 [00:00<00:00, 245856.04it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 472418.26it/s]
100%|██████████| 3116/3116 [01:06<00:00, 46.73it/s]
100%|██████████| 3116/3116 [00:00<00:00, 251713.17it/s]
3116it [00:00, 14673.60it/s]
100%|██████████| 3116/3116 [01:08<00:00, 45.32it/s]
100%|██████████| 3116/3116 [00:00<00:00, 245527.92it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.0, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
    

HYPERPARAMETER 15
{'--hidden_size': '2400', '--ffn_num_layers': '1', '--ffn_hidden_size': '2400', '--dropout': '0.3', '--depth': '3'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 164212.04it/s]
100%|██████████| 500/500 [00:24<00:00, 20.58it/s]
100%|██████████| 500/500 [00:00<00:00, 144173.79it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 254914.20it/s]
100%|██████████| 3116/3116 [01:45<00:00, 29.47it/s]
100%|██████████| 3116/3116 [00:00<00:00, 248681.41it/s]
3116it [00:00, 383515.80it/s]
100%|██████████| 3116/3116 [01:10<00:00, 44.46it/s]
100%|██████████| 3116/3116 [00:00<00:00, 226393.17it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=2400, bias=False)
        (W_h): Linear(in_features=2400, out_features=2400, bias=False)
        (W_o):

HYPERPARAMETER 16
{'--hidden_size': '600', '--ffn_num_layers': '1', '--ffn_hidden_size': '2400', '--dropout': '0.3', '--depth': '4'}


500it [00:00, 276632.63it/s]
100%|██████████| 500/500 [00:13<00:00, 37.93it/s]
100%|██████████| 500/500 [00:00<00:00, 222745.83it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 385609.16it/s]
100%|██████████| 3116/3116 [01:14<00:00, 41.71it/s]
100%|██████████| 3116/3116 [00:00<00:00, 214200.63it/s]
3116it [00:00, 404764.82it/s]
100%|██████████| 3116/3116 [01:13<00:00, 42.23it/s]
100%|██████████| 3116/3116 [00:00<00:00, 258493.89it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  

HYPERPARAMETER 17
{'--hidden_size': '600', '--ffn_num_layers': '3', '--ffn_hidden_size': '600', '--dropout': '0.7', '--depth': '3'}


500it [00:00, 249898.95it/s]
100%|██████████| 500/500 [00:12<00:00, 39.00it/s]
100%|██████████| 500/500 [00:00<00:00, 225840.19it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 446102.03it/s]
100%|██████████| 3116/3116 [01:16<00:00, 40.49it/s]
100%|██████████| 3116/3116 [00:00<00:00, 250555.03it/s]
3116it [00:00, 363717.23it/s]
100%|██████████| 3116/3116 [01:12<00:00, 42.76it/s]
100%|██████████| 3116/3116 [00:00<00:00, 251031.47it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.7, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=600, bias=False)
        (W_h): Linear(in_features=600, out_features=600, bias=False)
        (W_o): Linear(in_features=733, out_features=600, bias=True)
      )
    )
  

HYPERPARAMETER 18
{'--hidden_size': '3600', '--ffn_num_layers': '3', '--ffn_hidden_size': '600', '--dropout': '0.3', '--depth': '4'}


500it [00:00, 301965.73it/s]
100%|██████████| 500/500 [00:11<00:00, 45.18it/s]
100%|██████████| 500/500 [00:00<00:00, 245166.24it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 499024.49it/s]
100%|██████████| 3116/3116 [01:06<00:00, 46.70it/s]
100%|██████████| 3116/3116 [00:00<00:00, 257136.00it/s]
3116it [00:00, 501937.60it/s]
100%|██████████| 3116/3116 [01:05<00:00, 47.43it/s]
100%|██████████| 3116/3116 [00:00<00:00, 269328.84it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=3600, bias=False)
        (W_h): Linear(in_features=3600, out_features=3600, bias=False)
        (W_o): Linear(in_features=3733, out_features=3600, bias=True)
      )
   

HYPERPARAMETER 19
{'--hidden_size': '1200', '--ffn_num_layers': '2', '--ffn_hidden_size': '1200', '--dropout': '0.3', '--depth': '5'}


Setting molecule featurization parameters to default.
Loading data
500it [00:00, 143670.07it/s]
100%|██████████| 500/500 [00:22<00:00, 21.80it/s]
100%|██████████| 500/500 [00:00<00:00, 138563.07it/s]
Number of tasks = 1
Fold 0
Splitting data with seed 0
3116it [00:00, 252228.10it/s]
100%|██████████| 3116/3116 [01:53<00:00, 27.46it/s]
100%|██████████| 3116/3116 [00:00<00:00, 247804.39it/s]
3116it [00:00, 433322.88it/s]
100%|██████████| 3116/3116 [01:13<00:00, 42.52it/s]
100%|██████████| 3116/3116 [00:00<00:00, 246244.96it/s]
Class sizes
targets 0: 22.20%, 1: 77.80%
Total size = 500 | train size = 500 | val size = 3,116 | test size = 3,116
Building model 0
MoleculeModel(
  (sigmoid): Sigmoid()
  (encoder): MPN(
    (encoder): ModuleList(
      (0): MPNEncoder(
        (dropout): Dropout(p=0.3, inplace=False)
        (act_func): ReLU()
        (W_i): Linear(in_features=147, out_features=1200, bias=False)
        (W_h): Linear(in_features=1200, out_features=1200, bias=False)
        (W_o):

New best PRC AUC: 0.7075586126299451





In [9]:
print(best_hyperparams)

{'--hidden_size': '1200', '--ffn_num_layers': '2', '--ffn_hidden_size': '1200', '--dropout': '0.3', '--depth': '5'}


In [10]:
print(best_score)

0.7075586126299451


In [8]:
from copy import copy

result_arguments = copy(base_arguments)
result_arguments.extend([
    '--hidden_size', '1200',
    '--ffn_num_layers', '2',
    '--ffn_hidden_size', '1200',
    '--dropout', '0.3',
    '--depth', '4'
])

args = chemprop.args.TrainArgs().parse_args(result_arguments)
mean_score, std_score = chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)


Command line
python /home/simon/miniconda3/envs/chemprop/lib/python3.8/site-packages/ipykernel_launcher.py --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"5f7ed2af-df14-4584-88a9-9c2d0be5caca" --shell=9002 --transport="tcp" --iopub=9004 --f=/home/simon/.local/share/jupyter/runtime/kernel-v2-28351303lbL6ZQwtmM.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,


 52%|█████▏    | 1632/3116 [00:51<00:50, 29.45it/s]

In [9]:
max_epoch = 1