# Running hyperparameter optimization on a Chemprop model using Ray Tune

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chemprop/chemprop/blob/main/examples/hpopting.ipynb)

In [1]:
# Install chemprop from GitHub if running in Google Colab
import os

# The COLAB_RELEASE_TAG environment variable is used as the "running in Colab" marker;
# when it is unset or empty this cell does nothing.
if os.getenv("COLAB_RELEASE_TAG"):
    try:
        # If chemprop already imports, skip the clone and install entirely.
        import chemprop
    except ImportError:
        !git clone https://github.com/chemprop/chemprop.git
        %cd chemprop
        # Install from the cloned checkout, including its "hpopt" optional extra.
        !pip install ".[hpopt]"
        # Finish inside the examples directory; later cells build paths from Path.cwd().
        %cd examples

Cloning into 'chemprop'...
remote: Enumerating objects: 24987, done.[K
remote: Counting objects: 100% (187/187), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 24987 (delta 115), reused 44 (delta 44), pack-reused 24800 (from 3)[K
Receiving objects: 100% (24987/24987), 817.53 MiB | 25.45 MiB/s, done.
Resolving deltas: 100% (17934/17934), done.
/content/chemprop
Processing /content/chemprop
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting lightning>=2.0 (from chemprop==2.1.2)
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting rdkit (from chemprop==2.1.2)
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting astartes[molecules] (from chemprop==2.1.2)
  Downloading astartes-1.3.0-py3-none-any.whl.metadata (26 kB)
Collecting ConfigArgParse (from chemprop==2.

## Import packages

In [2]:
from pathlib import Path

import pandas as pd
from lightning import pytorch as pl
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler

from chemprop import data, featurizers, models, nn

In [3]:
# --- Input data ---
input_path = "/content/Augmented_data.csv"  # CSV file to load
smiles_column = "SMILES "  # column with the SMILES strings (the name ends with a space)
target_columns = ["Status"]  # columns with the prediction targets

# --- Data loading ---
num_workers = 0  # dataloader worker count; 0 loads data in the main process

# --- Locations derived from the current working directory ---
chemprop_dir = Path.cwd().parent
hpopt_save_dir = Path.cwd().joinpath("hpopt")  # hyperparameter-search results go here
hpopt_save_dir.mkdir(exist_ok=True)

## Load data

In [4]:
# Read the CSV at `input_path` into a DataFrame; the bare name on the last line
# makes the frame the displayed output of this cell.
df_input = pd.read_csv(input_path)
df_input

Unnamed: 0.1,Unnamed: 0,Catalog Number,SMILES,Active/Inactive,Remark,Status
0,0,S7718,CN(C)CCNC(=O)C1=CC=CN2C(=O)C3=C(C=C4C=CC=CC4=C...,Active,Training,1
1,1,S2679,Cl.CN1CCC(C(O)C1)C2=C3OC(=CC(=O)C3=C(O)C=C2O)C...,Active,Training,1
2,2,S7511,CC(C)C1=C2C=C(C=CC2=N[N]1C)C3=CC=NC(=N3)NC4CCC...,Active,Training,1
3,3,S1393,COC1=CC=CC2=C1C(=O)C3=C(O)C4=C(CC(O)(CC4OC5CC(...,Active,Training,1
4,4,S7648,Cl.Cl.CC(CN(C)C)C1=CC=C(C=C1)C2=C3C(=C(C)C=C2O...,Active,Training,1
...,...,...,...,...,...,...
4681,4724,,[1*]C(=O)C(C)CC.[13*]C1CC(O)CC(=O)O1.[3*]O[3*]...,Active,,1
4682,4725,,[15*]C1CCC=C2C=CC(C)C([15*])C21.[1*]C(=O)C(C)C...,Active,,1
4683,4726,,[9*]n1c(=O)c([16*])cc2c(C)nc(N)nc21.[3*]O[3*]....,Active,,1
4684,4727,,[14*]c1ccc([16*])cn1.[4*]CCO.[15*]C1CCC([15*])...,Active,,1


In [5]:
# SMILES strings as a 1-D array; targets as a 2-D array with one column per entry in `target_columns`.
smis = df_input[smiles_column].values
ys = df_input[target_columns].values

## Make data points, splits, and datasets

In [6]:
# Build one datapoint per (SMILES, target) pair, leaving out SMILES that fail to convert.
all_data = []
for smi, y in zip(smis, ys):
    try:
        datapoint = data.MoleculeDatapoint.from_smi(smi, y)
    except RuntimeError:
        # Conversion failed for this SMILES: report it and continue with the next pair.
        print(f"Skipping invalid SMILES: {smi}")
        continue
    all_data.append(datapoint)

[10:57:44] SMILES Parse Error: syntax error while parsing: COC(=O)NC(C)CNC1=NC=CC(=N1)C2=C[N](N=C2C3=C(F)C(=CC(=C3)Cl)N[S;v6](C)(=O)=O)C(C)C
[10:57:44] SMILES Parse Error: check for mistakes around position 63:
[10:57:44] C(F)C(=CC(=C3)Cl)N[S;v6](C)(=O)=O)C(C)C
[10:57:44] ~~~~~~~~~~~~~~~~~~~~^
[10:57:44] SMILES Parse Error: Failed parsing SMILES 'COC(=O)NC(C)CNC1=NC=CC(=N1)C2=C[N](N=C2C3=C(F)C(=CC(=C3)Cl)N[S;v6](C)(=O)=O)C(C)C' for input: 'COC(=O)NC(C)CNC1=NC=CC(=N1)C2=C[N](N=C2C3=C(F)C(=CC(=C3)Cl)N[S;v6](C)(=O)=O)C(C)C'
[10:57:44] SMILES Parse Error: syntax error while parsing: O.O.O.OC1=CC=CN|2=C1C(=O)O[VH]3|2(O)(=O)OC(=O)C4=NC=CC=C4O3
[10:57:44] SMILES Parse Error: check for mistakes around position 16:
[10:57:44] O.O.O.OC1=CC=CN|2=C1C(=O)O[VH]3|2(O)(=O)O
[10:57:44] ~~~~~~~~~~~~~~~^
[10:57:44] SMILES Parse Error: Failed parsing SMILES 'O.O.O.OC1=CC=CN|2=C1C(=O)O[VH]3|2(O)(=O)OC(=O)C4=NC=CC=C4O3' for input: 'O.O.O.OC1=CC=CN|2=C1C(=O)O[VH]3|2(O)(=O)OC(=O)C4=NC=CC=C4O3'
[10:57:44] SMIL

Skipping invalid SMILES: COC(=O)NC(C)CNC1=NC=CC(=N1)C2=C[N](N=C2C3=C(F)C(=CC(=C3)Cl)N[S;v6](C)(=O)=O)C(C)C
Skipping invalid SMILES: O.O.O.OC1=CC=CN|2=C1C(=O)O[VH]3|2(O)(=O)OC(=O)C4=NC=CC=C4O3
Skipping invalid SMILES: Cl.CC1=CC(=CC(=C1)C[S;v6](=O)(=O)C2=CC=CC=C2)OCC3=CC=C(CN4CCCC4CO)C=C3
Skipping invalid SMILES: N[S;v6](=O)(=O)OCC1CC(CC1O)[N]2C=CC3=C(NC4CCC5=CC=CC=C45)N=CN=C23


[10:57:44] SMILES Parse Error: extra open parentheses while parsing: CCC(C)C1NC(=O)C(CC2=CC=C(O)C=C2)NC(=O)C(CC(C)C)NC(=O)C3CCCN3C(=O)C(CSSCC(NC(=O)C(NC(=O)C4CCCN4C(=O)C(CC(O)=O)NC(=O)C(CC5=CC=C(O)C=C5)NC(=O)C(CO)NC1=O)C(C)C)C(=O)NC(CCCNC(N)=N)C(=O)NC(CCCNC(N)=N)C(=O)N
[10:57:44] SMILES Parse Error: check for mistakes around position 67:
[10:57:44] )NC(=O)C3CCCN3C(=O)C(CSSCC(NC(=O)C(NC(=O)
[10:57:44] ~~~~~~~~~~~~~~~~~~~~^
[10:57:44] SMILES Parse Error: Failed parsing SMILES 'CCC(C)C1NC(=O)C(CC2=CC=C(O)C=C2)NC(=O)C(CC(C)C)NC(=O)C3CCCN3C(=O)C(CSSCC(NC(=O)C(NC(=O)C4CCCN4C(=O)C(CC(O)=O)NC(=O)C(CC5=CC=C(O)C=C5)NC(=O)C(CO)NC1=O)C(C)C)C(=O)NC(CCCNC(N)=N)C(=O)NC(CCCNC(N)=N)C(=O)N' for input: 'CCC(C)C1NC(=O)C(CC2=CC=C(O)C=C2)NC(=O)C(CC(C)C)NC(=O)C3CCCN3C(=O)C(CSSCC(NC(=O)C(NC(=O)C4CCCN4C(=O)C(CC(O)=O)NC(=O)C(CC5=CC=C(O)C=C5)NC(=O)C(CO)NC1=O)C(C)C)C(=O)NC(CCCNC(N)=N)C(=O)NC(CCCNC(N)=N)C(=O)N'
[10:57:44] SMILES Parse Error: syntax error while parsing: CC(C)(C)C[N]1C(=NC2=CC=C(N=C12)C3=C(N=C([NH]3

Skipping invalid SMILES: CCC(C)C1NC(=O)C(CC2=CC=C(O)C=C2)NC(=O)C(CC(C)C)NC(=O)C3CCCN3C(=O)C(CSSCC(NC(=O)C(NC(=O)C4CCCN4C(=O)C(CC(O)=O)NC(=O)C(CC5=CC=C(O)C=C5)NC(=O)C(CO)NC1=O)C(C)C)C(=O)NC(CCCNC(N)=N)C(=O)NC(CCCNC(N)=N)C(=O)N
Skipping invalid SMILES: CC(C)(C)C[N]1C(=NC2=CC=C(N=C12)C3=C(N=C([NH]3)C(C)(C)C)C4=CC=C(F)C=C4)N.C[S;v6](O)(=O)=O


In [7]:
# RDKit Mol objects are what structure-based split strategies operate on.
mols = [datapoint.mol for datapoint in all_data]

split_sizes = (0.8, 0.1, 0.1)  # train / validation / test fractions
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", split_sizes)
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)



In [8]:
from chemprop.featurizers import SimpleMoleculeMolGraphFeaturizer
from rdkit.Chem import Descriptors, rdMolDescriptors


class CustomFeaturizer(SimpleMoleculeMolGraphFeaturizer):
    """Chemprop's stock molecule featurizer plus a standalone RDKit feature helper.

    Nothing inherited from ``SimpleMoleculeMolGraphFeaturizer`` is overridden, so the
    molecular graphs produced when this object is handed to ``MoleculeDataset`` are the
    stock ones. ``featurize`` is an extra helper that only runs when it is called
    explicitly.

    NOTE(review): chemprop datasets appear to invoke the featurizer object itself
    rather than a ``featurize`` method, so the features below do not reach the model.
    Feeding them in would mean overriding the callable interface and updating the
    feature dimensions; confirm the intent before relying on them.
    """

    def featurize(self, mol):
        """Collect raw RDKit features for ``mol`` in one flat Python list.

        Parameters
        ----------
        mol : rdkit.Chem.Mol
            Molecule to describe.

        Returns
        -------
        list
            In order: one 8-item list per atom, one 4-item list per bond, one value
            per entry of ``Descriptors.descList``, and one value per property computed
            by ``rdMolDescriptors.Properties``.
        """
        atom_features = [
            [
                atom.GetAtomicNum(),
                atom.GetNumBonds(),
                atom.GetFormalCharge(),
                atom.GetChiralTag(),
                atom.GetTotalNumHs(),
                atom.GetHybridization(),
                atom.GetIsAromatic(),
                atom.GetMass(),
            ]
            for atom in mol.GetAtoms()
        ]

        bond_features = [
            [
                bond.GetBondType(),
                bond.GetIsConjugated(),
                bond.IsInRing(),
                bond.GetStereo(),
            ]
            for bond in mol.GetBonds()
        ]

        # Whole-molecule descriptors registered in rdkit.Chem.Descriptors.
        mol_features = [descriptor_func(mol) for _, descriptor_func in Descriptors.descList]

        # `rdMolDescriptors` has no `descList` attribute (only `Descriptors` does), so
        # iterating over it raised AttributeError. Its registered molecule-level
        # properties are exposed through the `Properties` calculator instead.
        additional_features = list(rdMolDescriptors.Properties().ComputeProperties(mol))

        return atom_features + bond_features + mol_features + additional_features


featurizer = CustomFeaturizer()

# === Create datasets and loaders ===
train_dset = data.MoleculeDataset(train_data[0], featurizer)
scaler = train_dset.normalize_targets()
val_dset = data.MoleculeDataset(val_data[0], featurizer)
val_dset.normalize_targets(scaler)
test_dset = data.MoleculeDataset(test_data[0], featurizer)

# Define helper function to train the model

In [9]:
def train_model(config, train_dset, val_dset, num_workers, scaler):

    # config is a dictionary containing hyperparameters used for the trial
    depth = int(config["depth"])
    ffn_hidden_dim = int(config["ffn_hidden_dim"])
    ffn_num_layers = int(config["ffn_num_layers"])
    message_hidden_dim = int(config["message_hidden_dim"])

    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    mp = nn.BondMessagePassing(d_h=message_hidden_dim, depth=depth)
    agg = nn.MeanAggregation()
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.BinaryClassificationFFN(output_transform=output_transform, input_dim=message_hidden_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers)
    batch_norm = True
    metric_list = [nn.metrics.BinaryF1Score()]
    model = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=20, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)


## Define parameter search space

In [10]:

# Integer-valued search ranges, keyed by the names `train_model` reads from its config.
search_space = dict(
    depth=tune.qrandint(lower=2, upper=6, q=1),
    ffn_hidden_dim=tune.qrandint(lower=300, upper=2400, q=100),
    ffn_num_layers=tune.qrandint(lower=1, upper=3, q=1),
    message_hidden_dim=tune.qrandint(lower=300, upper=2400, q=100),
)

In [None]:


# Shut down any Ray instance left over from an earlier run of this cell, then start a new one.
ray.shutdown()
ray.init()

# --- Search strategy ---
search_alg = HyperOptSearch(
    n_initial_points=1,  # random evaluations before tree parzen estimators take over
    random_state_seed=42,
)
# Alternative search algorithm:
# search_alg = OptunaSearch()

scheduler = FIFOScheduler()

# --- Per-trial resources (set use_gpu=True to train on GPU) ---
scaling_config = ScalingConfig(num_workers=1, use_gpu=False)

# --- Checkpointing: keep only the single checkpoint with the lowest val_loss ---
checkpoint_config = CheckpointConfig(
    num_to_keep=1,
    checkpoint_score_attribute="val_loss",
    checkpoint_score_order="min",
)

# --- Where results are written ---
run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=hpopt_save_dir.joinpath("ray_results"),
)

# --- Trainer: each trial calls train_model with that trial's sampled config ---
ray_trainer = TorchTrainer(
    lambda config: train_model(config, train_dset, val_dset, num_workers, scaler),
    scaling_config=scaling_config,
    run_config=run_config,
)

# --- Tuner: 20 trials, minimizing val_loss ---
tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    num_samples=20,
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id),  # short directory names
)

tuner = tune.Tuner(
    ray_trainer,
    param_space={"train_loop_config": search_space},
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()


2025-05-05 10:58:32,011	INFO worker.py:1888 -- Started a local Ray instance.


+---------------------------------------------------------------------+
| Configuration for experiment     TorchTrainer_2025-05-05_10-58-35   |
+---------------------------------------------------------------------+
| Search algorithm                 SearchGenerator                    |
| Scheduler                        FIFOScheduler                      |
| Number of trials                 20                                 |
+---------------------------------------------------------------------+

View detailed results here: /content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-05-05_10-58-28_699870_493/artifacts/2025-05-05_10-58-35/TorchTrainer_2025-05-05_10-58-35/driver_artifacts`

Trial status: 1 PENDING
Current time: 2025-05-05 10:58:35. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/0 GPUs
+--------------------------------------------------------------

[36m(TorchTrainer pid=2818)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2818)[0m - (node_id=85d1826776a8b04eb71395ec08130d8b282f617a724b5c6551ccdd9f, ip=172.28.0.12, pid=2908) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=2908)[0m Setting up process group for: env:// [rank=0, world_size=1]



Trial TorchTrainer_c410a352 started with configuration:
+---------------------------------------------+
| Trial TorchTrainer_c410a352 config          |
+---------------------------------------------+
| train_loop_config/depth                   2 |
| train_loop_config/ffn_hidden_dim       2200 |
| train_loop_config/ffn_num_layers          2 |
| train_loop_config/message_hidden_dim    400 |
+---------------------------------------------+


[36m(RayTrainWorker pid=2908)[0m Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(RayTrainWorker pid=2908)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=2908)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2908)[0m HPU available: False, using: 0 HPUs



Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 10:59:05. Total running time: 30s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
+--------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim |
+--------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrainer_8da927e1   RUNNING                         2                     2000                        2                      500 |
| TorchTrainer_c410a352   RUNNING                         2                     2200                        2                      400 |
| TorchTrainer_33384398   PENDING                         2                      600                        2                     2000 |
+------------------------------------

[36m(RayTrainWorker pid=2908)[0m 2025-05-05 10:59:07.032489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=2908)[0m E0000 00:00:1746442747.315972    3063 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=2908)[0m E0000 00:00:1746442747.393966    3063 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(RayTrainWorker pid=2908)[0m 2025-05-05 10:59:07.954640: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(RayTrainWorker pid=2908)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuil

[36m(RayTrainWorker pid=2908)[0m Sanity Checking: |          | 0/? [00:00<?, ?it/s]
[36m(RayTrainWorker pid=2908)[0m Sanity Checking:   0%|          | 0/2 [00:00<?, ?it/s]Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=2908)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=2908)[0m 
[36m(RayTrainWorker pid=2908)[0m   | Name            | Type                    | Params | Mode 
[36m(RayTrainWorker pid=2908)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=2908)[0m 0 | message_passing | BondMessagePassing      | 579 K  | train
[36m(RayTrainWorker pid=2908)[0m 1 | agg             | MeanAggregation         | 0      | train
[36m(RayTrainWorker pid=2908)[0m 2 | bn              | BatchNorm1d             | 1.0 K  | train
[36m(RayTrainWorker pid=2908)[0m 3 | predictor       | BinaryClassificationFFN | 5.0 M  | train
[36m(RayTrainWorker pid=2908)[0m 4 | X_d_transform   | Identity                | 0      | train
[36m(RayTrainWorker pid=2908)[0m 5 | metrics         | ModuleList              | 0      | train
[36m(RayTrainWorker pid=2908)[0m -------------------------------------

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s]
Epoch 0:   0%|          | 0/59 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 1/59 [00:01<00:59,  0.97it/s, v_num=0, train_loss_step=0.712]
Epoch 0:   3%|▎         | 2/59 [00:01<00:56,  1.01it/s, v_num=0, train_loss_step=0.416]
Epoch 0:   5%|▌         | 3/59 [00:02<00:54,  1.02it/s, v_num=0, train_loss_step=0.269]


[36m(RayTrainWorker pid=3058)[0m Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(RayTrainWorker pid=3058)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=3058)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=3058)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=3058)[0m 2025-05-05 10:59:22.095887: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=3058)[0m E0000 00:00:1746442762.146100    3172 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=3058)[0m E0000 00:00:1746442762.161395    3172 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attemptin

Epoch 0:   7%|▋         | 4/59 [00:03<00:52,  1.04it/s, v_num=0, train_loss_step=0.146]
Epoch 0:   8%|▊         | 5/59 [00:04<00:50,  1.06it/s, v_num=0, train_loss_step=0.412]
Epoch 0:  10%|█         | 6/59 [00:05<00:50,  1.06it/s, v_num=0, train_loss_step=-0.291]
Epoch 0:  12%|█▏        | 7/59 [00:06<00:48,  1.06it/s, v_num=0, train_loss_step=0.479] 
Epoch 0:  14%|█▎        | 8/59 [00:07<00:48,  1.06it/s, v_num=0, train_loss_step=-0.357]
Epoch 0:  15%|█▌        | 9/59 [00:08<00:47,  1.05it/s, v_num=0, train_loss_step=-0.676]


[36m(RayTrainWorker pid=3058)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=3058)[0m 
[36m(RayTrainWorker pid=3058)[0m   | Name            | Type                    | Params | Mode 
[36m(RayTrainWorker pid=3058)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=3058)[0m 0 | message_passing | BondMessagePassing      | 383 K  | train
[36m(RayTrainWorker pid=3058)[0m 1 | agg             | MeanAggregation         | 0      | train
[36m(RayTrainWorker pid=3058)[0m 2 | bn              | BatchNorm1d             | 800    | train
[36m(RayTrainWorker pid=3058)[0m 3 | predictor       | BinaryClassificationFFN | 5.7 M  | train
[36m(RayTrainWorker pid=3058)[0m 4 | X_d_transform   | Identity                | 0      | train
[36m(RayTrainWorker pid=3058)[0m 5 | metrics         | ModuleList              | 0      | train
[36m(RayTrainWorker pid=3058)[0m -------------------------------------

[36m(RayTrainWorker pid=3058)[0m Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.89it/s]
Epoch 0:  17%|█▋        | 10/59 [00:10<00:50,  0.98it/s, v_num=0, train_loss_step=-1.06] 
Epoch 0:   0%|          | 0/59 [00:00<?, ?it/s] 
Epoch 0:  19%|█▊        | 11/59 [00:11<00:51,  0.94it/s, v_num=0, train_loss_step=-0.468]
Epoch 0:   2%|▏         | 1/59 [00:01<01:25,  0.68it/s, v_num=0, train_loss_step=0.688]
Epoch 0:  20%|██        | 12/59 [00:13<00:52,  0.90it/s, v_num=0, train_loss_step=0.031] 
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 10:59:35. Total running time: 1min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
+--------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000000)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3058)[0m Epoch 0: 100%|██████████| 59/59 [01:08<00:00,  0.86it/s, v_num=0, train_loss_step=-117., val_loss=-126., train_loss_epoch=-54.2]
[36m(RayTrainWorker pid=3058)[0m Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=3058)[0m Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2908)[0m 
Epoch 1:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-117., val_loss=-126., train_loss_epoch=-54.2]
[36m(RayTrainWorker pid=2908)[0m 
Validation DataLoader 0:  88%|████████▊ | 7/8 [00:05<00:00,  1.28it/s][A[32m [repeated 13x across cluster][0m


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 1:   2%|▏         | 1/59 [00:00<00:51,  1.14it/s, v_num=0, train_loss_step=-567., val_loss=-126., train_loss_epoch=-54.2]
Epoch 1:   3%|▎         | 2/59 [00:01<00:47,  1.21it/s, v_num=0, train_loss_step=-86.6, val_loss=-126., train_loss_epoch=-54.2]
Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.40it/s][A
Epoch 0: 100%|██████████| 59/59 [01:20<00:00,  0.74it/s, v_num=0, train_loss_step=-162., val_loss=-30.4]
Epoch 1:   5%|▌         | 3/59 [00:03<01:07,  0.84it/s, v_num=0, train_loss_step=98.10, val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 0: 100%|██████████| 59/59 [01:20<00:00,  0.73it/s, v_num=0, train_loss_step=-162., val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 1:   5%|▌         | 3/59 [00:03<01:07,  0.83it/s, v_num=0, train_loss_step=-1.38e+3, val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 1:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-162., val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 1:  10%|█         | 6/59 [00:05<00:52,  1.01it/s, v

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000001)[32m [repeated 2x across cluster][0m


Epoch 1: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-1.55e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4]
Epoch 2:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.55e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4]
Epoch 1:  86%|████████▋ | 51/59 [00:57<00:09,  0.88it/s, v_num=0, train_loss_step=-8.55e+4, val_loss=-30.4, train_loss_epoch=-70.9][32m [repeated 5x across cluster][0m
Epoch 1:  92%|█████████▏| 54/59 [01:00<00:05,  0.89it/s, v_num=0, train_loss_step=-9.46e+4, val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 1:  93%|█████████▎| 55/59 [01:01<00:04,  0.89it/s, v_num=0, train_loss_step=-3.25e+4, val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 1:  95%|█████████▍| 56/59 [01:02<00:03,  0.89it/s, v_num=0, train_loss_step=-1.08e+5, val_loss=-30.4, train_loss_epoch=-70.9]
Epoch 2:  10%|█         | 6/59 [00:04<00:42,  1.23it/s, v_num=0, train_loss_step=-1.46e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4][32m [repeated 8x across clust

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000001)


Epoch 1: 100%|██████████| 59/59 [01:12<00:00,  0.81it/s, v_num=0, train_loss_step=-1.32e+5, val_loss=-1e+5, train_loss_epoch=-2.64e+4]
Epoch 2:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.32e+5, val_loss=-1e+5, train_loss_epoch=-2.64e+4]
Epoch 2:  32%|███▏      | 19/59 [00:16<00:35,  1.13it/s, v_num=0, train_loss_step=-2.63e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4][32m [repeated 8x across cluster][0m
Epoch 2:  42%|████▏     | 25/59 [00:22<00:30,  1.10it/s, v_num=0, train_loss_step=-3.7e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4] [32m [repeated 11x across cluster][0m
Epoch 2:  53%|█████▎    | 31/59 [00:27<00:25,  1.11it/s, v_num=0, train_loss_step=-7.96e+5, val_loss=-1.14e+5, train_loss_epoch=-2.13e+4][32m [repeated 11x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:02:05. Total running time: 3min 30s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-114000.2734375 and p

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000002)


Epoch 2: 100%|██████████| 59/59 [00:59<00:00,  1.00it/s, v_num=0, train_loss_step=-2.79e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5]
Epoch 3:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.79e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5]
Epoch 2:  64%|██████▍   | 38/59 [00:43<00:23,  0.88it/s, v_num=0, train_loss_step=-8.54e+5, val_loss=-1e+5, train_loss_epoch=-2.64e+4][32m [repeated 5x across cluster][0m
Epoch 3:  12%|█▏        | 7/59 [00:05<00:41,  1.25it/s, v_num=0, train_loss_step=-1.77e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5][32m [repeated 12x across cluster][0m
Epoch 2:  81%|████████▏ | 48/59 [00:54<00:12,  0.88it/s, v_num=0, train_loss_step=-6e+5, val_loss=-1e+5, train_loss_epoch=-2.64e+4]   [32m [repeated 9x across cluster][0m
Epoch 3:  32%|███▏      | 19/59 [00:16<00:34,  1.14it/s, v_num=0, train_loss_step=-4.53e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5][32m [repeated 13x across cluster][0m
Epoch 2:  92%|█████████▏| 54/59 [0

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000002)


Epoch 3:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.81e+6, val_loss=-2.32e+6, train_loss_epoch=-7.77e+5]
Epoch 3:  63%|██████▎   | 37/59 [00:32<00:19,  1.14it/s, v_num=0, train_loss_step=-4.08e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5][32m [repeated 9x across cluster][0m
Epoch 3:  73%|███████▎  | 43/59 [00:38<00:14,  1.13it/s, v_num=0, train_loss_step=-7.01e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5][32m [repeated 11x across cluster][0m
Epoch 3:  20%|██        | 12/59 [00:13<00:52,  0.89it/s, v_num=0, train_loss_step=-4.34e+6, val_loss=-2.32e+6, train_loss_epoch=-7.77e+5][32m [repeated 10x across cluster][0m
Epoch 3:  92%|█████████▏| 54/59 [00:47<00:04,  1.14it/s, v_num=0, train_loss_step=-7.31e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5]
Epoch 3:  93%|█████████▎| 55/59 [00:48<00:03,  1.14it/s, v_num=0, train_loss_step=-6.09e+6, val_loss=-1.18e+6, train_loss_epoch=-6.81e+5]
Epoch 3:  31%|███       | 18/59 [00:19<00:43,  0.94it/s, v_num=0,

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000003)


Epoch 3: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-4.54e+6, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Epoch 4:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.54e+6, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:03:35. Total running time: 5min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-5348606.5 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)          train_loss   

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000003)


Epoch 3: 100%|██████████| 59/59 [01:12<00:00,  0.82it/s, v_num=0, train_loss_step=-5.98e+6, val_loss=-2.24e+6, train_loss_epoch=-4.68e+6]
Epoch 4:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-5.98e+6, val_loss=-2.24e+6, train_loss_epoch=-4.68e+6]
Epoch 4:  88%|████████▊ | 52/59 [00:45<00:06,  1.14it/s, v_num=0, train_loss_step=-2.05e+7, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6][32m [repeated 7x across cluster][0m
Epoch 4:  92%|█████████▏| 54/59 [00:47<00:04,  1.15it/s, v_num=0, train_loss_step=-2.22e+7, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Epoch 4:  93%|█████████▎| 55/59 [00:47<00:03,  1.15it/s, v_num=0, train_loss_step=-1.89e+7, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Epoch 4:  95%|█████████▍| 56/59 [00:48<00:02,  1.15it/s, v_num=0, train_loss_step=-2.13e+7, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Epoch 4:  97%|█████████▋| 57/59 [00:49<00:01,  1.15it/s, v_num=0, train_loss_step=-2.02e+7, val_loss=-5.35e+6, train_loss_epoch=-3.92e+6]
Epoch

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000004)


Epoch 4: 100%|██████████| 59/59 [00:58<00:00,  1.02it/s, v_num=0, train_loss_step=-2.77e+7, val_loss=-2.02e+7, train_loss_epoch=-1.29e+7]
Epoch 5:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.77e+7, val_loss=-2.02e+7, train_loss_epoch=-1.29e+7]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:04:35. Total running time: 6min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-20204666.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)     train_loss     train_l

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000004)
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000005)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 5:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.23e+7, val_loss=-2.67e+7, train_loss_epoch=-1.46e+7]
Epoch 6:   2%|▏         | 1/59 [00:01<01:03,  0.91it/s, v_num=0, train_loss_step=-4.69e+7, val_loss=-3.79e+7, train_loss_epoch=-3.08e+7]
Validation DataLoader 0:  88%|████████▊ | 7/8 [00:05<00:00,  1.34it/s][A
Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.44it/s][A
Epoch 5: 100%|██████████| 59/59 [00:56<00:00,  1.05it/s, v_num=0, train_loss_step=-3.83e+7, val_loss=-3.79e+7, train_loss_epoch=-1.29e+7]
Epoch 5: 100%|██████████| 59/59 [00:56<00:00,  1.04it/s, v_num=0, train_loss_step=-3.83e+7, val_loss=-3.79e+7, train_loss_epoch=-3.08e+7]
Epoch 6:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.83e+7, val_loss=-3.79e+7, train_loss_epoch=-3.08e+7]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:05:36. Total running time: 7min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000006)


Epoch 6: 100%|██████████| 59/59 [00:56<00:00,  1.05it/s, v_num=0, train_loss_step=-7.89e+7, val_loss=-7.42e+7, train_loss_epoch=-5.4e+7] 
Epoch 7:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-7.89e+7, val_loss=-7.42e+7, train_loss_epoch=-5.4e+7]
Epoch 5:  92%|█████████▏| 54/59 [00:58<00:05,  0.92it/s, v_num=0, train_loss_step=-3.24e+7, val_loss=-2.67e+7, train_loss_epoch=-1.46e+7]
Epoch 7:   3%|▎         | 2/59 [00:02<01:13,  0.77it/s, v_num=0, train_loss_step=-8.42e+7, val_loss=-7.42e+7, train_loss_epoch=-5.4e+7][32m [repeated 5x across cluster][0m
Epoch 5:  93%|█████████▎| 55/59 [01:00<00:04,  0.91it/s, v_num=0, train_loss_step=-5.8e+7, val_loss=-2.67e+7, train_loss_epoch=-1.46e+7] 
Epoch 5:  95%|█████████▍| 56/59 [01:01<00:03,  0.91it/s, v_num=0, train_loss_step=-4.56e+7, val_loss=-2.67e+7, train_loss_epoch=-1.46e+7]
Epoch 5:  97%|█████████▋| 57/59 [01:02<00:02,  0.91it/s, v_num=0, train_loss_step=-4.65e+7, val_loss=-2.67e+7, train_loss_epoch=-1.46e+7]
Epoch 5:

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000005)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2908)[0m Epoch 5: 100%|██████████| 59/59 [01:10<00:00,  0.84it/s, v_num=0, train_loss_step=-5.09e+7, val_loss=-4.95e+7, train_loss_epoch=-3.47e+7]
Epoch 7:  24%|██▎       | 14/59 [00:12<00:40,  1.12it/s, v_num=0, train_loss_step=-9.1e+7, val_loss=-7.42e+7, train_loss_epoch=-5.4e+7] [32m [repeated 7x across cluster][0m
Epoch 6:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-5.09e+7, val_loss=-4.95e+7, train_loss_epoch=-3.47e+7]
Epoch 7:  34%|███▍      | 20/59 [00:18<00:36,  1.06it/s, v_num=0, train_loss_step=-9.11e+7, val_loss=-7.42e+7, train_loss_epoch=-5.4e+7][32m [repeated 9x across cluster][0m
Epoch 6:  15%|█▌        | 9/59 [00:10<00:58,  0.85it/s, v_num=0, train_loss_step=-6.22e+7, val_loss=-4.95e+7, train_loss_epoch=-3.47e+7][32m [repeated 12x across cluster][0m
Epoch 6:  25%|██▌       | 15/59 [00:16<00:47,  0.92it/s, v_num=0, train_loss_step=-5.28e+7, val_loss=-4.95e+7, train_loss_epoch=-3.47e+7][32m [repeated 13x across cluster

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000007)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 7: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-7.81e+7, val_loss=-1.12e+8, train_loss_epoch=-8.59e+7]
Epoch 8:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-7.81e+7, val_loss=-1.12e+8, train_loss_epoch=-8.59e+7]
Epoch 8:   5%|▌         | 3/59 [00:02<00:51,  1.08it/s, v_num=0, train_loss_step=-1.03e+8, val_loss=-1.12e+8, train_loss_epoch=-8.59e+7][32m [repeated 8x across cluster][0m
Epoch 8:  14%|█▎        | 8/59 [00:08<00:52,  0.97it/s, v_num=0, train_loss_step=-1.32e+8, val_loss=-1.12e+8, train_loss_epoch=-8.59e+7][32m [repeated 9x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:07:36. Total running time: 9min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-111807984.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+---------------------------------------------------------------

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000006)


Epoch 6: 100%|██████████| 59/59 [01:11<00:00,  0.82it/s, v_num=0, train_loss_step=-9e+7, val_loss=-7.46e+7, train_loss_epoch=-6.21e+7]
Epoch 7:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-9e+7, val_loss=-7.46e+7, train_loss_epoch=-6.21e+7]
Epoch 7:   3%|▎         | 2/59 [00:01<00:54,  1.05it/s, v_num=0, train_loss_step=-9.22e+7, val_loss=-7.46e+7, train_loss_epoch=-6.21e+7][32m [repeated 8x across cluster][0m
Epoch 8:  64%|██████▍   | 38/59 [00:34<00:19,  1.09it/s, v_num=0, train_loss_step=-1.06e+8, val_loss=-1.12e+8, train_loss_epoch=-8.59e+7][32m [repeated 10x across cluster][0m
Epoch 7:  19%|█▊        | 11/59 [00:12<00:53,  0.89it/s, v_num=0, train_loss_step=-1.05e+8, val_loss=-7.46e+7, train_loss_epoch=-6.21e+7][32m [repeated 10x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:08:06. Total running time: 9min 30s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-111807984.0 and param

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000008)


Epoch 8: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-1.61e+8, val_loss=-7.49e+7, train_loss_epoch=-1.21e+8]
Epoch 9:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.61e+8, val_loss=-7.49e+7, train_loss_epoch=-1.21e+8]
Epoch 9:   7%|▋         | 4/59 [00:03<00:42,  1.29it/s, v_num=0, train_loss_step=-1.23e+8, val_loss=-7.49e+7, train_loss_epoch=-1.21e+8][32m [repeated 9x across cluster][0m
Epoch 7:  59%|█████▉    | 35/59 [00:39<00:26,  0.89it/s, v_num=0, train_loss_step=-9.25e+7, val_loss=-7.46e+7, train_loss_epoch=-6.21e+7][32m [repeated 8x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:08:36. Total running time: 10min 0s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-74906560.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+--------------------------------------------------------------

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000007)


Epoch 7: 100%|██████████| 59/59 [01:11<00:00,  0.82it/s, v_num=0, train_loss_step=-1.03e+8, val_loss=-1.32e+8, train_loss_epoch=-9.84e+7]
Epoch 8:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.03e+8, val_loss=-1.32e+8, train_loss_epoch=-9.84e+7]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:09:06. Total running time: 10min 30s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-131581568.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)     train_loss     tra

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000009)


Epoch 9: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-1.54e+8, val_loss=-1.63e+8, train_loss_epoch=-1.59e+8]
Epoch 10:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.54e+8, val_loss=-1.63e+8, train_loss_epoch=-1.59e+8]
Epoch 10:   7%|▋         | 4/59 [00:03<00:42,  1.30it/s, v_num=0, train_loss_step=-1.93e+8, val_loss=-1.63e+8, train_loss_epoch=-1.59e+8][32m [repeated 10x across cluster][0m
Epoch 10:  15%|█▌        | 9/59 [00:08<00:45,  1.09it/s, v_num=0, train_loss_step=-1.26e+8, val_loss=-1.63e+8, train_loss_epoch=-1.59e+8][32m [repeated 9x across cluster][0m
Epoch 8:  46%|████▌     | 27/59 [00:30<00:35,  0.89it/s, v_num=0, train_loss_step=-1.22e+8, val_loss=-1.32e+8, train_loss_epoch=-9.84e+7][32m [repeated 10x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:09:36. Total running time: 11min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-162616704.0 a

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000008)


[36m(RayTrainWorker pid=2908)[0m Epoch 8: 100%|██████████| 59/59 [01:11<00:00,  0.82it/s, v_num=0, train_loss_step=-1.81e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
[36m(RayTrainWorker pid=3058)[0m 
Epoch 9:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.81e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
[36m(RayTrainWorker pid=3058)[0m 
Epoch 9:   2%|▏         | 1/59 [00:00<00:56,  1.03it/s, v_num=0, train_loss_step=-1.64e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
[36m(RayTrainWorker pid=3058)[0m 
[36m(RayTrainWorker pid=3058)[0m 
Epoch 9:   3%|▎         | 2/59 [00:01<00:54,  1.05it/s, v_num=0, train_loss_step=-1.04e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
[36m(RayTrainWorker pid=3058)[0m 
[36m(RayTrainWorker pid=3058)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2908)[0m Epoch 9:   5%|▌         | 3/59 [00:02<00:53,  1.05it/s, v_num=0, train_loss_step=-1.04e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]Epoch 9:   5%|▌         | 3/59 [00:02<00:53,  1.05it/s, v_num=0, train_loss_step=-1.73e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
Validation DataLoader 0:  88%|████████▊ | 7/8 [00:05<00:00,  1.29it/s][A[32m [repeated 7x across cluster][0m
Epoch 9:   7%|▋         | 4/59 [00:03<00:54,  1.01it/s, v_num=0, train_loss_step=-1.39e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8]
Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s][A
Epoch 10: 100%|██████████| 59/59 [00:57<00:00,  1.02it/s, v_num=0, train_loss_step=-2.38e+8, val_loss=-1.57e+8, train_loss_epoch=-1.59e+8]
Epoch 10: 100%|██████████| 59/59 [00:58<00:00,  1.01it/s, v_num=0, train_loss_step=-2.38e+8, val_loss=-1.57e+8, train_loss_epoch=-1.97e+8]
Epoch 11:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.38e+8, val_l

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000011)[32m [repeated 2x across cluster][0m


Epoch 11: 100%|██████████| 59/59 [01:00<00:00,  0.98it/s, v_num=0, train_loss_step=-2.97e+8, val_loss=-1.95e+8, train_loss_epoch=-2.41e+8]
Epoch 9: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-1.9e+8, val_loss=-1.14e+8, train_loss_epoch=-1.39e+8] 
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2908)[0m 
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
Epoch 12:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.97e+8, val_loss=-1.95e+8, train_loss_epoch=-2.41e+8]
[36m(RayTrainWorker pid=2908)[0m 
Validation DataLoader 0:  12%|█▎        | 1/8 [00:00<00:04,  1.62it/s][A
[36m(RayTrainWorker pid=2908)[0m 
Validation DataLoader 0:  25%|██▌       | 2/8 [00:01<00:04,  1.46it/s][A
[36m(RayTrainWorker pid=2908)[0m 
Validation DataLoader 0:  38%|███▊      | 3/8 [00:02<00:03,  1.40it/s][A
Epoch 12:   3%|▎         | 2/59 [00:01<00:44,  1.27it/s, v_

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000009)


Epoch 9: 100%|██████████| 59/59 [01:09<00:00,  0.85it/s, v_num=0, train_loss_step=-1.9e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]
Epoch 10:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.9e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]
Epoch 10:   2%|▏         | 1/59 [00:01<01:34,  0.61it/s, v_num=0, train_loss_step=-1.84e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8][32m [repeated 7x across cluster][0m
Epoch 12:  15%|█▌        | 9/59 [00:08<00:46,  1.08it/s, v_num=0, train_loss_step=-2.63e+8, val_loss=-1.95e+8, train_loss_epoch=-2.41e+8]
Epoch 12:  15%|█▌        | 9/59 [00:08<00:46,  1.08it/s, v_num=0, train_loss_step=-2.99e+8, val_loss=-1.95e+8, train_loss_epoch=-2.41e+8]
Epoch 12:  24%|██▎       | 14/59 [00:13<00:43,  1.04it/s, v_num=0, train_loss_step=-2.6e+8, val_loss=-1.95e+8, train_loss_epoch=-2.41e+8] [32m [repeated 9x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:11:36. Total running time: 13min 1s
Logic

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000012)


Epoch 12: 100%|██████████| 59/59 [01:01<00:00,  0.96it/s, v_num=0, train_loss_step=-3.56e+8, val_loss=-2.65e+8, train_loss_epoch=-2.76e+8]
Epoch 13:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.56e+8, val_loss=-2.65e+8, train_loss_epoch=-2.76e+8]
Epoch 10:  92%|█████████▏| 54/59 [00:58<00:05,  0.92it/s, v_num=0, train_loss_step=-2.35e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]
Epoch 10:  93%|█████████▎| 55/59 [00:59<00:04,  0.92it/s, v_num=0, train_loss_step=-2.67e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]
Epoch 13:   8%|▊         | 5/59 [00:04<00:45,  1.19it/s, v_num=0, train_loss_step=-3.32e+8, val_loss=-2.65e+8, train_loss_epoch=-2.76e+8][32m [repeated 8x across cluster][0m
Epoch 10:  95%|█████████▍| 56/59 [01:01<00:03,  0.92it/s, v_num=0, train_loss_step=-2.35e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]
Epoch 10:  97%|█████████▋| 57/59 [01:02<00:02,  0.91it/s, v_num=0, train_loss_step=-2.24e+8, val_loss=-1.84e+8, train_loss_epoch=-1.81e+8]

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000010)


Epoch 10: 100%|██████████| 59/59 [01:11<00:00,  0.83it/s, v_num=0, train_loss_step=-2.74e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8]
Epoch 11:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.74e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8]
Epoch 11:   7%|▋         | 4/59 [00:04<00:56,  0.97it/s, v_num=0, train_loss_step=-2.71e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8][32m [repeated 10x across cluster][0m
Epoch 13:  42%|████▏     | 25/59 [00:25<00:34,  0.99it/s, v_num=0, train_loss_step=-3.26e+8, val_loss=-2.65e+8, train_loss_epoch=-2.76e+8][32m [repeated 8x across cluster][0m
Epoch 11:  20%|██        | 12/59 [00:13<00:53,  0.88it/s, v_num=0, train_loss_step=-2.55e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8]
Epoch 11:  20%|██        | 12/59 [00:13<00:53,  0.88it/s, v_num=0, train_loss_step=-2.57e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8]
Epoch 11:  22%|██▏       | 13/59 [00:14<00:51,  0.89it/s, v_num=0, train_loss_step=-2.82e+8, val_lo

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000013)


Epoch 13: 100%|██████████| 59/59 [01:02<00:00,  0.95it/s, v_num=0, train_loss_step=-3.67e+8, val_loss=-2.96e+8, train_loss_epoch=-3.2e+8] 
Epoch 14:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.67e+8, val_loss=-2.96e+8, train_loss_epoch=-3.2e+8]
Epoch 11:  76%|███████▋  | 45/59 [00:51<00:15,  0.88it/s, v_num=0, train_loss_step=-3.12e+8, val_loss=-2.09e+8, train_loss_epoch=-2.25e+8][32m [repeated 8x across cluster][0m
Epoch 14:  17%|█▋        | 10/59 [00:10<00:50,  0.97it/s, v_num=0, train_loss_step=-2.62e+8, val_loss=-2.96e+8, train_loss_epoch=-3.2e+8][32m [repeated 10x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:13:36. Total running time: 15min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-296162112.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+---------------------------------------------------------

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000011)


Epoch 12:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.38e+8, val_loss=-2.35e+8, train_loss_epoch=-2.76e+8]
Epoch 12:   8%|▊         | 5/59 [00:04<00:50,  1.07it/s, v_num=0, train_loss_step=-2.23e+8, val_loss=-2.35e+8, train_loss_epoch=-2.76e+8][32m [repeated 11x across cluster][0m
Epoch 12:  15%|█▌        | 9/59 [00:10<00:58,  0.85it/s, v_num=0, train_loss_step=-3.3e+8, val_loss=-2.35e+8, train_loss_epoch=-2.76e+8] [32m [repeated 8x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:14:06. Total running time: 15min 31s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-296162112.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000014)


Epoch 14: 100%|██████████| 59/59 [01:01<00:00,  0.96it/s, v_num=0, train_loss_step=-1.89e+8, val_loss=-3.49e+8, train_loss_epoch=-3.53e+8]
Epoch 15:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.89e+8, val_loss=-3.49e+8, train_loss_epoch=-3.53e+8]
Epoch 15:   5%|▌         | 3/59 [00:03<01:12,  0.77it/s, v_num=0, train_loss_step=-4.27e+8, val_loss=-3.49e+8, train_loss_epoch=-3.53e+8][32m [repeated 7x across cluster][0m
Epoch 15:  15%|█▌        | 9/59 [00:09<00:51,  0.97it/s, v_num=0, train_loss_step=-3.81e+8, val_loss=-3.49e+8, train_loss_epoch=-3.53e+8][32m [repeated 11x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:14:36. Total running time: 16min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-349210400.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+---------------------------------------------------------

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000012)


[36m(RayTrainWorker pid=2908)[0m Epoch 12:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.84e+8, val_loss=-3.03e+8, train_loss_epoch=-3.17e+8]         Epoch 13:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.84e+8, val_loss=-3.03e+8, train_loss_epoch=-3.17e+8]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:15:06. Total running time: 16min 31s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-349210400.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000015)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 15: 100%|██████████| 59/59 [01:00<00:00,  0.98it/s, v_num=0, train_loss_step=-4.26e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Epoch 16:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.26e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Epoch 16:   3%|▎         | 2/59 [00:02<01:12,  0.79it/s, v_num=0, train_loss_step=-4.59e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8][32m [repeated 7x across cluster][0m
Epoch 16:  12%|█▏        | 7/59 [00:07<00:56,  0.92it/s, v_num=0, train_loss_step=-4.28e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8][32m [repeated 9x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:15:36. Total running time: 17min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-363901408.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+----------------------------------------------------------

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000013)


[36m(RayTrainWorker pid=2908)[0m Epoch 13: 100%|██████████| 59/59 [01:14<00:00,  0.79it/s, v_num=0, train_loss_step=-4.11e+8, val_loss=-3.52e+8, train_loss_epoch=-3.63e+8]
Epoch 16:  95%|█████████▍| 56/59 [00:53<00:02,  1.05it/s, v_num=0, train_loss_step=-4.76e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Epoch 14:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.11e+8, val_loss=-3.52e+8, train_loss_epoch=-3.63e+8]
Epoch 16:  97%|█████████▋| 57/59 [00:54<00:01,  1.05it/s, v_num=0, train_loss_step=-3.87e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Epoch 16:  98%|█████████▊| 58/59 [00:54<00:00,  1.06it/s, v_num=0, train_loss_step=-4.11e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Epoch 16: 100%|██████████| 59/59 [00:55<00:00,  1.07it/s, v_num=0, train_loss_step=-4.33e+8, val_loss=-3.64e+8, train_loss_epoch=-3.84e+8]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=3058)[0m 
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
V

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000016)


Epoch 16: 100%|██████████| 59/59 [01:00<00:00,  0.98it/s, v_num=0, train_loss_step=-4.33e+8, val_loss=-3.97e+8, train_loss_epoch=-4.14e+8]
Epoch 17:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.33e+8, val_loss=-3.97e+8, train_loss_epoch=-4.14e+8]
Epoch 17:   2%|▏         | 1/59 [00:01<01:16,  0.76it/s, v_num=0, train_loss_step=-3.91e+8, val_loss=-3.97e+8, train_loss_epoch=-4.14e+8][32m [repeated 6x across cluster][0m
Validation DataLoader 0:  88%|████████▊ | 7/8 [00:04<00:00,  1.67it/s][A[32m [repeated 4x across cluster][0m
Epoch 17:  10%|█         | 6/59 [00:06<00:59,  0.88it/s, v_num=0, train_loss_step=-4.35e+8, val_loss=-3.97e+8, train_loss_epoch=-4.14e+8][32m [repeated 9x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:16:36. Total running time: 18min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-397465952.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 22

[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000017)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3058)[0m Epoch 17: 100%|██████████| 59/59 [01:00<00:00,  0.98it/s, v_num=0, train_loss_step=-4.72e+8, val_loss=-3.65e+8, train_loss_epoch=-4.43e+8]
[36m(RayTrainWorker pid=2908)[0m Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2908)[0m Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2908)[0m 
Validation DataLoader 0:  25%|██▌       | 2/8 [00:01<00:05,  1.06it/s][A[32m [repeated 2x across cluster][0m
Epoch 18:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.72e+8, val_loss=-3.65e+8, train_loss_epoch=-4.43e+8]
[36m(RayTrainWorker pid=2908)[0m 
Epoch 14: 100%|██████████| 59/59 [01:06<00:00,  0.89it/s, v_num=0, train_loss_step=-2.6e+8, val_loss=-3.52e+8, train_loss_epoch=-3.63e+8] [32m [repeated 3x across cluster][0m
Epoch 18:   2%|▏         | 1/59 [00:01<01:28,  0.66it/s, v_num=0, train_loss_step=-4.83e+8, va

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000014)


Epoch 15:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.6e+8, val_loss=-3.94e+8, train_loss_epoch=-4e+8]
Epoch 18:  12%|█▏        | 7/59 [00:07<00:53,  0.96it/s, v_num=0, train_loss_step=-4.28e+8, val_loss=-3.65e+8, train_loss_epoch=-4.43e+8]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:17:37. Total running time: 19min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-393851296.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status       ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)     train_loss     train_lo

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000018)


Epoch 18: 100%|██████████| 59/59 [01:00<00:00,  0.98it/s, v_num=0, train_loss_step=-3.56e+8, val_loss=-4.62e+8, train_loss_epoch=-4.63e+8]
Epoch 19:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.56e+8, val_loss=-4.62e+8, train_loss_epoch=-4.63e+8]
Epoch 15:  85%|████████▍ | 50/59 [00:56<00:10,  0.88it/s, v_num=0, train_loss_step=-5e+8, val_loss=-3.94e+8, train_loss_epoch=-4e+8]
Epoch 15:  85%|████████▍ | 50/59 [00:56<00:10,  0.88it/s, v_num=0, train_loss_step=-4.83e+8, val_loss=-3.94e+8, train_loss_epoch=-4e+8]
Epoch 19:   5%|▌         | 3/59 [00:03<01:12,  0.78it/s, v_num=0, train_loss_step=-5.03e+8, val_loss=-4.62e+8, train_loss_epoch=-4.63e+8][32m [repeated 5x across cluster][0m
Epoch 15:  92%|█████████▏| 54/59 [01:01<00:05,  0.88it/s, v_num=0, train_loss_step=-4.31e+8, val_loss=-3.94e+8, train_loss_epoch=-4e+8]
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:18:37. Total running time: 20min 1s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Curre

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000015)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 15: 100%|██████████| 59/59 [01:13<00:00,  0.80it/s, v_num=0, train_loss_step=-4.74e+8, val_loss=-4.2e+8, train_loss_epoch=-4.37e+8]
Epoch 16:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.74e+8, val_loss=-4.2e+8, train_loss_epoch=-4.37e+8]
Epoch 19:  44%|████▍     | 26/59 [00:24<00:31,  1.06it/s, v_num=0, train_loss_step=-5.31e+8, val_loss=-4.62e+8, train_loss_epoch=-4.63e+8][32m [repeated 11x across cluster][0m
Epoch 19:  54%|█████▍    | 32/59 [00:29<00:25,  1.07it/s, v_num=0, train_loss_step=-4.55e+8, val_loss=-4.62e+8, train_loss_epoch=-4.63e+8][32m [repeated 11x across cluster][0m
Epoch 16:  24%|██▎       | 14/59 [00:15<00:49,  0.90it/s, v_num=0, train_loss_step=-4.68e+8, val_loss=-4.2e+8, train_loss_epoch=-4.37e+8][32m [repeated 9x across cluster][0m
Trial status: 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:19:07. Total running time: 20min 31s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-462312288.0

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/c410a352/checkpoint_000019)


[36m(RayTrainWorker pid=3058)[0m Epoch 19: 100%|██████████| 59/59 [01:00<00:00,  0.97it/s, v_num=0, train_loss_step=-5.17e+8, val_loss=-4.67e+8, train_loss_epoch=-4.88e+8]


[36m(RayTrainWorker pid=3058)[0m `Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 59/59 [01:01<00:00,  0.97it/s, v_num=0, train_loss_step=-5.17e+8, val_loss=-4.67e+8, train_loss_epoch=-4.88e+8]
Epoch 16:  64%|██████▍   | 38/59 [00:42<00:23,  0.90it/s, v_num=0, train_loss_step=-4.42e+8, val_loss=-4.2e+8, train_loss_epoch=-4.37e+8][32m [repeated 4x across cluster][0m

Trial TorchTrainer_c410a352 completed after 20 iterations at 2025-05-05 11:19:32. Total running time: 20min 56s
+----------------------------------------------------------+
| Trial TorchTrainer_c410a352 result                       |
+----------------------------------------------------------+
| checkpoint_dir_name                    checkpoint_000019 |
| time_this_iter_s                                61.24575 |
| time_total_s                                  1225.46825 |
| training_iteration                                    20 |
| epoch                                                 19 |
| step                                                1180 |
| train_loss           

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000016)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 16: 100%|██████████| 59/59 [01:13<00:00,  0.80it/s, v_num=0, train_loss_step=-5.1e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-5.1e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]


[36m(TorchTrainer pid=8176)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=8176)[0m - (node_id=85d1826776a8b04eb71395ec08130d8b282f617a724b5c6551ccdd9f, ip=172.28.0.12, pid=8293) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=8293)[0m Setting up process group for: env:// [rank=0, world_size=1]


Epoch 17:   2%|▏         | 1/59 [00:00<00:51,  1.12it/s, v_num=0, train_loss_step=-4.51e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:   3%|▎         | 2/59 [00:01<00:50,  1.12it/s, v_num=0, train_loss_step=-3.94e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:   5%|▌         | 3/59 [00:02<00:50,  1.11it/s, v_num=0, train_loss_step=-5.46e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]

Trial status: 2 RUNNING | 1 TERMINATED | 1 PENDING
Current time: 2025-05-05 11:20:07. Total running time: 21min 31s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: c410a352 with val_loss=-467179616.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2200, 'ffn_num_layers': 2, 'message_hidden_dim': 400}}
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name         

[36m(RayTrainWorker pid=8293)[0m Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(RayTrainWorker pid=8293)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=8293)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=8293)[0m HPU available: False, using: 0 HPUs


[36m(RayTrainWorker pid=2908)[0m Epoch 17:  17%|█▋        | 10/59 [00:09<00:45,  1.07it/s, v_num=0, train_loss_step=-4.91e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]Epoch 17:  17%|█▋        | 10/59 [00:09<00:45,  1.07it/s, v_num=0, train_loss_step=-5.3e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8] 


[36m(RayTrainWorker pid=8293)[0m 2025-05-05 11:20:13.061395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=8293)[0m E0000 00:00:1746444013.115176    8406 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=8293)[0m E0000 00:00:1746444013.129929    8406 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(RayTrainWorker pid=8293)[0m 2025-05-05 11:20:13.178608: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(RayTrainWorker pid=8293)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuil

Epoch 17:  19%|█▊        | 11/59 [00:10<00:47,  1.00it/s, v_num=0, train_loss_step=-5.24e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  20%|██        | 12/59 [00:12<00:49,  0.94it/s, v_num=0, train_loss_step=-5.07e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  22%|██▏       | 13/59 [00:14<00:50,  0.91it/s, v_num=0, train_loss_step=-4.75e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  24%|██▎       | 14/59 [00:15<00:49,  0.91it/s, v_num=0, train_loss_step=-5.01e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  25%|██▌       | 15/59 [00:16<00:47,  0.92it/s, v_num=0, train_loss_step=-4.74e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=8293)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=8293)[0m 
[36m(RayTrainWorker pid=8293)[0m   | Name            | Type                    | Params | Mode 
[36m(RayTrainWorker pid=8293)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=8293)[0m 0 | message_passing | BondMessagePassing      | 8.3 M  | train
[36m(RayTrainWorker pid=8293)[0m 1 | agg             | MeanAggregation         | 0      | train
[36m(RayTrainWorker pid=8293)[0m 2 | bn              | BatchNorm1d             | 4.0 K  | train
[36m(RayTrainWorker pid=8293)[0m 3 | predictor       | BinaryClassificationFFN | 1.6 M  | train
[36m(RayTrainWorker pid=8293)[0m 4 | X_d_transform   | Identity                | 0      | train
[36m(RayTrainWorker pid=8293)[0m 5 | metrics         | ModuleList              | 0      | train
[36m(RayTrainWorker pid=8293)[0m -------------------------------------

Epoch 17:  27%|██▋       | 16/59 [00:17<00:46,  0.93it/s, v_num=0, train_loss_step=-3.9e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8] 
Epoch 17:  29%|██▉       | 17/59 [00:18<00:45,  0.93it/s, v_num=0, train_loss_step=-4.89e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  31%|███       | 18/59 [00:19<00:43,  0.94it/s, v_num=0, train_loss_step=-4.93e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  32%|███▏      | 19/59 [00:20<00:42,  0.94it/s, v_num=0, train_loss_step=-5.35e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  34%|███▍      | 20/59 [00:21<00:41,  0.94it/s, v_num=0, train_loss_step=-4.97e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:05<00:05,  0.19it/s]
Epoch 17:  36%|███▌      | 21/59 [00:22<00:40,  0.95it/s, v_num=0, train_loss_step=-5.21e+8, val_loss=-4.59e+8, train_loss_epoch=-4.71e+8]
Epoch 17:  37%|███▋      | 22/59 [00:23<00:39,  0.95it/s, v_num=0, train_loss_step=-5.52e+

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000017)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 17: 100%|██████████| 59/59 [01:14<00:00,  0.79it/s, v_num=0, train_loss_step=-5.32e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8]
Epoch 18:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-5.32e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8]
Epoch 18:   2%|▏         | 1/59 [00:01<01:06,  0.87it/s, v_num=0, train_loss_step=-5.53e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8]
Epoch 18:   3%|▎         | 2/59 [00:02<01:01,  0.93it/s, v_num=0, train_loss_step=-5.82e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8]
Epoch 18:   5%|▌         | 3/59 [00:03<01:00,  0.92it/s, v_num=0, train_loss_step=-5.13e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8]
Epoch 18:  10%|█         | 6/59 [00:06<00:55,  0.96it/s, v_num=0, train_loss_step=-5.29e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8][32m [repeated 4x across cluster][0m
Epoch 18:  19%|█▊        | 11/59 [00:11<00:52,  0.92it/s, v_num=0, train_loss_step=-5.18e+8, val_loss=-4.35e+8, train_loss_epoch=-5.02e+8][3

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000018)


Epoch 19:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.27e+8, val_loss=-5.23e+8, train_loss_epoch=-5.24e+8]
Epoch 0:  25%|██▌       | 15/59 [02:01<05:57,  0.12it/s, v_num=0, train_loss_step=-1.33]
Trial status: 2 RUNNING | 1 TERMINATED | 1 PENDING
Current time: 2025-05-05 11:22:37. Total running time: 24min 2s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-522688640.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)     train_loss     train_loss_step     val/f1       v

[36m(RayTrainWorker pid=2908)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/8da927e1/checkpoint_000019)


Epoch 19: 100%|██████████| 59/59 [01:15<00:00,  0.78it/s, v_num=0, train_loss_step=-5.85e+8, val_loss=-5.34e+8, train_loss_epoch=-5.5e+8] 
Epoch 19: 100%|██████████| 59/59 [01:15<00:00,  0.78it/s, v_num=0, train_loss_step=-5.85e+8, val_loss=-5.34e+8, train_loss_epoch=-5.5e+8]


[36m(RayTrainWorker pid=2908)[0m `Trainer.fit` stopped: `max_epochs=20` reached.



Trial TorchTrainer_8da927e1 completed after 20 iterations at 2025-05-05 11:23:51. Total running time: 25min 15s
+----------------------------------------------------------+
| Trial TorchTrainer_8da927e1 result                       |
+----------------------------------------------------------+
| checkpoint_dir_name                    checkpoint_000019 |
| time_this_iter_s                                75.67562 |
| time_total_s                                  1501.92174 |
| training_iteration                                    20 |
| epoch                                                 19 |
| step                                                1180 |
| train_loss                                   -550448640. |
| train_loss_epoch                             -550448640. |
| train_loss_step                              -585040704. |
| val/f1                                            0.3787 |
| val_loss                                     -533901632. |
+--------------------------------

[36m(TorchTrainer pid=9344)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=9344)[0m - (node_id=85d1826776a8b04eb71395ec08130d8b282f617a724b5c6551ccdd9f, ip=172.28.0.12, pid=9465) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=9465)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=8293)[0m Epoch 0:  49%|████▉     | 29/59 [03:53<04:01,  0.12it/s, v_num=0, train_loss_step=-5.16]Epoch 0:  49%|████▉     | 29/59 [03:53<04:01,  0.12it/s, v_num=0, train_loss_step=-3.95]


[36m(RayTrainWorker pid=9465)[0m Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(RayTrainWorker pid=9465)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=9465)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=9465)[0m HPU available: False, using: 0 HPUs


[36m(RayTrainWorker pid=8293)[0m Epoch 0:  51%|█████     | 30/59 [04:00<03:52,  0.12it/s, v_num=0, train_loss_step=-3.95]Epoch 0:  51%|█████     | 30/59 [04:00<03:52,  0.12it/s, v_num=0, train_loss_step=-1.56]


[36m(RayTrainWorker pid=9465)[0m 2025-05-05 11:24:34.277955: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=9465)[0m E0000 00:00:1746444274.378895    9581 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=9465)[0m E0000 00:00:1746444274.397873    9581 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(RayTrainWorker pid=9465)[0m 2025-05-05 11:24:34.497433: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(RayTrainWorker pid=9465)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuil


Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:24:37. Total running time: 26min 2s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)     train_loss     train_loss_step     val/f1       val_loss |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[36m(RayTrainWorker pid=9465)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m   | Name            | Type                    | Params | Mode 
[36m(RayTrainWorker pid=9465)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=9465)[0m 0 | message_passing | BondMessagePassing      | 227 K  | train
[36m(RayTrainWorker pid=9465)[0m 1 | agg             | MeanAggregation         | 0      | train
[36m(RayTrainWorker pid=9465)[0m 2 | bn              | BatchNorm1d             | 600    | train
[36m(RayTrainWorker pid=9465)[0m 3 | predictor       | BinaryClassificationFFN | 694 K  | train
[36m(RayTrainWorker pid=9465)[0m 4 | X_d_transform   | Identity                | 0      | train
[36m(RayTrainWorker pid=9465)[0m 5 | metrics         | ModuleList              | 0      | train
[36m(RayTrainWorker pid=9465)[0m -------------------------------------

[36m(RayTrainWorker pid=8293)[0m Epoch 0:  53%|█████▎    | 31/59 [04:08<03:44,  0.12it/s, v_num=0, train_loss_step=-1.56]Epoch 0:  53%|█████▎    | 31/59 [04:08<03:44,  0.12it/s, v_num=0, train_loss_step=-6.11]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s]
Epoch 0:   0%|          | 0/59 [00:00<?, ?it/s] 
Epoch 0:   8%|▊         | 5/59 [00:04<00:49,  1.09it/s, v_num=0, train_loss_step=0.559][32m [repeated 5x across cluster][0m
Epoch 0:  15%|█▌        | 9/59 [00:10<00:57,  0.87it/s, v_num=0, train_loss_step=0.0545][32m [repeated 5x across cluster][0m
Epoch 0:  25%|██▌       | 15/59 [00:15<00:45,  0.97it/s, v_num=0, train_loss_step=-0.435] [32m [repeated 7x across cluster][0m
Epoch 0:  36%|███▌      | 21/59 [00:21<00:38,  0.98it/s, v_num=0, train_loss_step=0.202][32m [repeated 6x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:25:07. Total running time: 26min 32s
Logical resource usage: 2.0/2 CPUs, 

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000000)


Epoch 1:   3%|▎         | 2/59 [00:02<01:09,  0.82it/s, v_num=0, train_loss_step=-0.47, val_loss=-1.69, train_loss_epoch=-0.744][32m [repeated 2x across cluster][0m
Epoch 1:  12%|█▏        | 7/59 [00:08<01:01,  0.85it/s, v_num=0, train_loss_step=-1.99, val_loss=-1.69, train_loss_epoch=-0.744][32m [repeated 6x across cluster][0m
Epoch 0:  69%|██████▉   | 41/59 [05:26<02:23,  0.13it/s, v_num=0, train_loss_step=-20.2][32m [repeated 6x across cluster][0m
Epoch 1:  31%|███       | 18/59 [00:19<00:43,  0.95it/s, v_num=0, train_loss_step=-23.4, val_loss=-1.69, train_loss_epoch=-0.744][32m [repeated 6x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:26:07. Total running time: 27min 32s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+----------------------------------

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000001)


Epoch 2:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-318., val_loss=-333., train_loss_epoch=-73.8]
Epoch 2:   2%|▏         | 1/59 [00:01<01:07,  0.85it/s, v_num=0, train_loss_step=-450., val_loss=-333., train_loss_epoch=-73.8]
Epoch 2:  12%|█▏        | 7/59 [00:06<00:47,  1.10it/s, v_num=0, train_loss_step=-295., val_loss=-333., train_loss_epoch=-73.8][32m [repeated 7x across cluster][0m
Epoch 2:  22%|██▏       | 13/59 [00:12<00:42,  1.08it/s, v_num=0, train_loss_step=-661., val_loss=-333., train_loss_epoch=-73.8][32m [repeated 7x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:27:07. Total running time: 28min 32s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+----------------------------------------------------------------------------------

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000002)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.20it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 2: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-4.47e+3, val_loss=-1.7e+3, train_loss_epoch=-73.8]
[36m(RayTrainWorker pid=9465)[0m Epoch 2: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-4.47e+3, val_loss=-1.7e+3, train_loss_epoch=-1.43e+3]
Epoch 3:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-4.47e+3, val_loss=-1.7e+3, train_loss_epoch=-1.43e+3]
Epoch 3:   2%|▏         | 1/59 [00:00<00:50,  1.14it/s, v_num=0, train_loss_step=-3.66e+3, val_loss=-1.7e+3, train_loss_epoch=-1.43e+3]
Epoch 0:  95%|█████████▍| 56/59 [07:23<00:23,  0.13it/s, v_num=0, train_loss_step=-342.]
Epoch 3:   3%|▎         | 2/59 [00:01<00:50,  1.12it/s, v_num=0, train_loss_step=-4.

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000003)
[36m(RayTrainWorker pid=8293)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/33384398/checkpoint_000000)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m Epoch 3: 100%|██████████| 59/59 [01:07<00:00,  0.88it/s, v_num=0, train_loss_step=-1.06e+4, val_loss=-7.32e+3, train_loss_epoch=-6.49e+3]
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0:  88%|████████▊ | 7/8 [00:04<00:00,  1.44it/s][A[32m [repeated 5x across cluster][0m
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.60it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 3: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-1.06e+4, val_loss=-7.32e+3, train_loss_epoch=-1.43e+3]
Epoch 4:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.06e+4, val_loss=-7.32e+3, train_loss_epoch=-6.49e+3]
Epoch 4:   2%|▏         | 1/59 [00:01<01:29,  0.64it/s, v_num=0, train_loss_step=-1.41e+4, val_loss=-7.32e+3, train_loss_epoch=-6.49e+3]
Epoch 4:   3%|▎         | 2/59 [00:02<01:25,  0.67it/s

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000004)


Epoch 5:  10%|█         | 6/59 [00:05<00:47,  1.13it/s, v_num=0, train_loss_step=-2.94e+4, val_loss=-2e+4, train_loss_epoch=-1.49e+4][32m [repeated 6x across cluster][0m
Epoch 5:  20%|██        | 12/59 [00:11<00:45,  1.04it/s, v_num=0, train_loss_step=-2.59e+4, val_loss=-2e+4, train_loss_epoch=-1.49e+4][32m [repeated 7x across cluster][0m
Epoch 5:  29%|██▉       | 17/59 [00:16<00:41,  1.01it/s, v_num=0, train_loss_step=-1.98e+4, val_loss=-2e+4, train_loss_epoch=-1.49e+4][32m [repeated 6x across cluster][0m
Epoch 1:  19%|█▊        | 11/59 [01:27<06:19,  0.13it/s, v_num=0, train_loss_step=-424., val_loss=-399., train_loss_epoch=-27.2][32m [repeated 6x across cluster][0m
Epoch 5:  47%|████▋     | 28/59 [00:28<00:31,  1.00it/s, v_num=0, train_loss_step=-3.14e+4, val_loss=-2e+4, train_loss_epoch=-1.49e+4][32m [repeated 6x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:30:38. Total running time: 32min 3s
Logical resource usage: 2.0

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000005)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.39it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 5: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-3.63e+4, val_loss=-3.56e+4, train_loss_epoch=-1.49e+4]Epoch 5: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-3.63e+4, val_loss=-3.56e+4, train_loss_epoch=-2.93e+4]
Epoch 6:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-3.63e+4, val_loss=-3.56e+4, train_loss_epoch=-2.93e+4]
Epoch 6:   8%|▊         | 5/59 [00:04<00:46,  1.17it/s, v_num=0, train_loss_step=-4.12e+4, val_loss=-3.56e+4, train_loss_epoch=-2.93e+4][32m [repeated 5x across cluster][0m
Epoch 6:  17%|█▋        | 10/59 [00:10<00:51,  0.95it/s, v_num=0, train_loss_step=-4.55e+4, val_loss=-3.56e+4, train_loss_epoch=-2.93e+4][32m [repeated 6x across c

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.51it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 6: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-6.52e+4, val_loss=-4.61e+4, train_loss_epoch=-2.93e+4]
[36m(RayTrainWorker pid=9465)[0m Epoch 6: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-6.52e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000006)


Epoch 7:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-6.52e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4]
Epoch 7:   5%|▌         | 3/59 [00:03<00:59,  0.95it/s, v_num=0, train_loss_step=-4.53e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4][32m [repeated 3x across cluster][0m
Epoch 7:  14%|█▎        | 8/59 [00:08<00:56,  0.90it/s, v_num=0, train_loss_step=-7.11e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4][32m [repeated 6x across cluster][0m
Epoch 7:  24%|██▎       | 14/59 [00:14<00:45,  0.99it/s, v_num=0, train_loss_step=-6.44e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4][32m [repeated 7x across cluster][0m
Epoch 7:  32%|███▏      | 19/59 [00:19<00:41,  0.98it/s, v_num=0, train_loss_step=-6.44e+4, val_loss=-4.61e+4, train_loss_epoch=-4.73e+4][32m [repeated 5x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:32:38. Total running time: 34min 3s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current b

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000007)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.21it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 7: 100%|██████████| 59/59 [01:04<00:00,  0.91it/s, v_num=0, train_loss_step=-6.93e+4, val_loss=-9.03e+4, train_loss_epoch=-4.73e+4]
Epoch 7: 100%|██████████| 59/59 [01:05<00:00,  0.91it/s, v_num=0, train_loss_step=-6.93e+4, val_loss=-9.03e+4, train_loss_epoch=-6.69e+4]
Epoch 8:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-6.93e+4, val_loss=-9.03e+4, train_loss_epoch=-6.69e+4]
Epoch 8:   2%|▏         | 1/59 [00:01<01:14,  0.78it/s, v_num=0, train_loss_step=-7.07e+4, val_loss=-9.03e+4, train_loss_epoch=-6.69e+4]
Epoch 8:   3%|▎         | 2/59 [00:02<01:10,  0.81it/s, v_num=0, train_loss_step=-8.67e+4, val_loss=-9.03e+4, train_loss_epoch=-6.69e+4]
Epoch 8:  12%|█▏        | 7/59 [00:06<00:51,  1.01it/s, v_num=0, 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:07<00:00,  1.08it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 8: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.06e+5, train_loss_epoch=-6.69e+4]
[36m(RayTrainWorker pid=9465)[0m Epoch 8: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.06e+5, train_loss_epoch=-9.14e+4]Epoch 8:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.06e+5, train_loss_epoch=-9.14e+4]         Epoch 9:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.06e+5, train_loss_epoch=-9.14e+4]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000008)


Epoch 1:  69%|██████▉   | 41/59 [05:23<02:21,  0.13it/s, v_num=0, train_loss_step=-5.62e+3, val_loss=-399., train_loss_epoch=-27.2]
Epoch 9:  12%|█▏        | 7/59 [00:05<00:43,  1.19it/s, v_num=0, train_loss_step=-1.09e+5, val_loss=-1.06e+5, train_loss_epoch=-9.14e+4][32m [repeated 7x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:34:38. Total running time: 36min 3s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...es

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000009)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.47it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 9: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.08e+5, train_loss_epoch=-9.14e+4]Epoch 9: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.08e+5, train_loss_epoch=-1.15e+5]
Epoch 10:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.11e+5, val_loss=-1.08e+5, train_loss_epoch=-1.15e+5]
Epoch 10:   8%|▊         | 5/59 [00:05<00:57,  0.93it/s, v_num=0, train_loss_step=-1.38e+5, val_loss=-1.08e+5, train_loss_epoch=-1.15e+5][32m [repeated 5x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:35:38. Total running time: 37min 3s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Cu

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000010)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.39it/s][A
Epoch 10: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-1.61e+5, val_loss=-1.26e+5, train_loss_epoch=-1.15e+5]
Epoch 10: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-1.61e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Epoch 11:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.61e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Epoch 11:   2%|▏         | 1/59 [00:01<01:25,  0.68it/s, v_num=0, train_loss_step=-1.37e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Epoch 1:  97%|█████████▋| 57/59 [07:29<00:15,  0.13it/s, v_num=0, train_loss_step=-2.62e+4, val_loss=-399., train_loss_epoch=-27.2]
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:36:38. Total running time: 38min 3s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best tri

[36m(RayTrainWorker pid=8293)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/33384398/checkpoint_000001)


Epoch 1: 100%|██████████| 59/59 [08:26<00:00,  0.12it/s, v_num=0, train_loss_step=-5.59e+4, val_loss=-3.58e+4, train_loss_epoch=-6.28e+3]
Epoch 11:  97%|█████████▋| 57/59 [00:56<00:01,  1.01it/s, v_num=0, train_loss_step=-1.81e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Epoch 11:  98%|█████████▊| 58/59 [00:56<00:00,  1.02it/s, v_num=0, train_loss_step=-1.55e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Epoch 11: 100%|██████████| 59/59 [00:57<00:00,  1.03it/s, v_num=0, train_loss_step=-1.68e+5, val_loss=-1.26e+5, train_loss_epoch=-1.38e+5]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=9465)[0m 
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m 
Validation DataLoader 0:  38%|███▊      | 3/8 [00:01<00:02,  2.18it/s][A[32m [repeated 3x across cluster][0m
Epoch 2:   0%|       

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000011)


Validation DataLoader 0:  88%|████████▊ | 7/8 [00:05<00:00,  1.18it/s][A[32m [repeated 4x across cluster][0m
Epoch 12:   2%|▏         | 1/59 [00:00<00:50,  1.15it/s, v_num=0, train_loss_step=-1.81e+5, val_loss=-1.88e+5, train_loss_epoch=-1.62e+5]
Epoch 12:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.68e+5, val_loss=-1.88e+5, train_loss_epoch=-1.62e+5]
Epoch 12:   3%|▎         | 2/59 [00:01<00:48,  1.17it/s, v_num=0, train_loss_step=-1.77e+5, val_loss=-1.88e+5, train_loss_epoch=-1.62e+5]
Epoch 12:   5%|▌         | 3/59 [00:02<00:47,  1.17it/s, v_num=0, train_loss_step=-1.76e+5, val_loss=-1.88e+5, train_loss_epoch=-1.62e+5]
Epoch 12:  12%|█▏        | 7/59 [00:05<00:44,  1.18it/s, v_num=0, train_loss_step=-1.64e+5, val_loss=-1.88e+5, train_loss_epoch=-1.62e+5][32m [repeated 5x across cluster][0m
Epoch 2:   3%|▎         | 2/59 [00:14<06:44,  0.14it/s, v_num=0, train_loss_step=-4.67e+4, val_loss=-3.58e+4, train_loss_epoch=-6.28e+3]
Epoch 2:   3%|▎         | 2/59 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.47it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 12: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-1.85e+5, val_loss=-2.25e+5, train_loss_epoch=-1.62e+5]
[36m(RayTrainWorker pid=9465)[0m Epoch 12: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-1.85e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000012)


Epoch 13:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-1.85e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5]
Epoch 13:   2%|▏         | 1/59 [00:00<00:54,  1.07it/s, v_num=0, train_loss_step=-1.76e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5]
Epoch 13:   3%|▎         | 2/59 [00:01<00:53,  1.07it/s, v_num=0, train_loss_step=-2.05e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5]
Epoch 13:  12%|█▏        | 7/59 [00:06<00:51,  1.00it/s, v_num=0, train_loss_step=-2.03e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5][32m [repeated 6x across cluster][0m
Epoch 13:  20%|██        | 12/59 [00:12<00:48,  0.97it/s, v_num=0, train_loss_step=-1.84e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5][32m [repeated 6x across cluster][0m
Epoch 13:  31%|███       | 18/59 [00:17<00:40,  1.02it/s, v_num=0, train_loss_step=-2.01e+5, val_loss=-2.25e+5, train_loss_epoch=-1.85e+5][32m [repeated 6x across cluster][0m
Epoch 13:  39%|███▉      | 23/59 [00:23<00:36,  0.99it/s, v_num=

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.50it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 13: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-2.18e+5, val_loss=-2.17e+5, train_loss_epoch=-1.85e+5]
[36m(RayTrainWorker pid=9465)[0m Epoch 13: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-2.18e+5, val_loss=-2.17e+5, train_loss_epoch=-2.04e+5]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000013)


Epoch 14:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.18e+5, val_loss=-2.17e+5, train_loss_epoch=-2.04e+5]
Epoch 2:  29%|██▉       | 17/59 [02:12<05:28,  0.13it/s, v_num=0, train_loss_step=-1.13e+5, val_loss=-3.58e+4, train_loss_epoch=-6.28e+3]
Epoch 14:   8%|▊         | 5/59 [00:06<01:07,  0.80it/s, v_num=0, train_loss_step=-2.19e+5, val_loss=-2.17e+5, train_loss_epoch=-2.04e+5][32m [repeated 5x across cluster][0m
Epoch 14:  19%|█▊        | 11/59 [00:11<00:49,  0.96it/s, v_num=0, train_loss_step=-2.25e+5, val_loss=-2.17e+5, train_loss_epoch=-2.04e+5][32m [repeated 7x across cluster][0m
Epoch 14:  29%|██▉       | 17/59 [00:17<00:42,  1.00it/s, v_num=0, train_loss_step=-2.21e+5, val_loss=-2.17e+5, train_loss_epoch=-2.04e+5][32m [repeated 7x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:40:09. Total running time: 41min 34s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_los

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000014)


Epoch 15:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.05e+5, val_loss=-2.48e+5, train_loss_epoch=-2.22e+5]
Epoch 15:   8%|▊         | 5/59 [00:04<00:46,  1.16it/s, v_num=0, train_loss_step=-2.41e+5, val_loss=-2.48e+5, train_loss_epoch=-2.22e+5][32m [repeated 5x across cluster][0m
Epoch 15:  19%|█▊        | 11/59 [00:09<00:41,  1.16it/s, v_num=0, train_loss_step=-2.43e+5, val_loss=-2.48e+5, train_loss_epoch=-2.22e+5][32m [repeated 7x across cluster][0m
Epoch 15:  25%|██▌       | 15/59 [00:14<00:43,  1.02it/s, v_num=0, train_loss_step=-2.4e+5, val_loss=-2.48e+5, train_loss_epoch=-2.22e+5] [32m [repeated 4x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:41:09. Total running time: 42min 34s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+---

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.35it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 15: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-2.46e+5, val_loss=-2.26e+5, train_loss_epoch=-2.22e+5]
[36m(RayTrainWorker pid=9465)[0m Epoch 15: 100%|██████████| 59/59 [01:03<00:00,  0.93it/s, v_num=0, train_loss_step=-2.46e+5, val_loss=-2.26e+5, train_loss_epoch=-2.39e+5]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000015)


Epoch 16:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.46e+5, val_loss=-2.26e+5, train_loss_epoch=-2.39e+5]
Epoch 16:   8%|▊         | 5/59 [00:04<00:47,  1.15it/s, v_num=0, train_loss_step=-2.59e+5, val_loss=-2.26e+5, train_loss_epoch=-2.39e+5][32m [repeated 5x across cluster][0m
Epoch 16:  17%|█▋        | 10/59 [00:10<00:52,  0.94it/s, v_num=0, train_loss_step=-2.46e+5, val_loss=-2.26e+5, train_loss_epoch=-2.39e+5][32m [repeated 6x across cluster][0m
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:42:09. Total running time: 43min 34s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000016)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.49it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 16: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-2.63e+5, val_loss=-2.75e+5, train_loss_epoch=-2.39e+5]Epoch 16: 100%|██████████| 59/59 [01:03<00:00,  0.92it/s, v_num=0, train_loss_step=-2.63e+5, val_loss=-2.75e+5, train_loss_epoch=-2.54e+5]
Epoch 17:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.63e+5, val_loss=-2.75e+5, train_loss_epoch=-2.54e+5]
Epoch 17:   2%|▏         | 1/59 [00:00<00:48,  1.19it/s, v_num=0, train_loss_step=-2.48e+5, val_loss=-2.75e+5, train_loss_epoch=-2.54e+5]
Epoch 17:   3%|▎         | 2/59 [00:01<00:50,  1.12it/s, v_num=0, train_loss_step=-2.59e+5, val_loss=-2.75e+5, train_loss_epoch=-2.54e+5]
Epoch 17:   5%|▌         | 3/59 [00:03<01:01,  0.91it/s, v_n

[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000017)


Epoch 18:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.83e+5, val_loss=-2.54e+5, train_loss_epoch=-2.67e+5]
Epoch 18:   2%|▏         | 1/59 [00:01<01:05,  0.89it/s, v_num=0, train_loss_step=-2.82e+5, val_loss=-2.54e+5, train_loss_epoch=-2.67e+5]
Trial status: 2 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:44:09. Total running time: 45min 34s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total ti

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.30it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 18: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-2.43e+5, val_loss=-2.62e+5, train_loss_epoch=-2.67e+5]
[36m(RayTrainWorker pid=9465)[0m Epoch 18: 100%|██████████| 59/59 [01:04<00:00,  0.92it/s, v_num=0, train_loss_step=-2.43e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000018)


[36m(RayTrainWorker pid=8293)[0m Epoch 2:  98%|█████████▊| 58/59 [07:35<00:07,  0.13it/s, v_num=0, train_loss_step=-4.69e+5, val_loss=-3.58e+4, train_loss_epoch=-6.28e+3]Epoch 2:  98%|█████████▊| 58/59 [07:35<00:07,  0.13it/s, v_num=0, train_loss_step=-4.83e+5, val_loss=-3.58e+4, train_loss_epoch=-6.28e+3]
Epoch 19:   0%|          | 0/59 [00:00<?, ?it/s, v_num=0, train_loss_step=-2.43e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:   2%|▏         | 1/59 [00:00<00:52,  1.11it/s, v_num=0, train_loss_step=-2.86e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:   3%|▎         | 2/59 [00:01<00:49,  1.16it/s, v_num=0, train_loss_step=-2.87e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:   5%|▌         | 3/59 [00:02<00:47,  1.17it/s, v_num=0, train_loss_step=-2.87e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:   7%|▋         | 4/59 [00:03<00:47,  1.17it/s, v_num=0, train_loss_step=-2.73e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 

[36m(RayTrainWorker pid=8293)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/33384398/checkpoint_000002)


[36m(RayTrainWorker pid=8293)[0m Epoch 2: 100%|██████████| 59/59 [08:23<00:00,  0.12it/s, v_num=0, train_loss_step=-5.78e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5] 
Epoch 19:  83%|████████▎ | 49/59 [00:48<00:09,  1.02it/s, v_num=0, train_loss_step=-2.82e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:  85%|████████▍ | 50/59 [00:48<00:08,  1.02it/s, v_num=0, train_loss_step=-2.94e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:  86%|████████▋ | 51/59 [00:49<00:07,  1.03it/s, v_num=0, train_loss_step=-3.08e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:  88%|████████▊ | 52/59 [00:49<00:06,  1.04it/s, v_num=0, train_loss_step=-2.98e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:  90%|████████▉ | 53/59 [00:50<00:05,  1.05it/s, v_num=0, train_loss_step=-2.89e+5, val_loss=-2.62e+5, train_loss_epoch=-2.79e+5]
Epoch 19:  92%|█████████▏| 54/59 [00:50<00:04,  1.06it/s, v_num=0, train_loss_step=-2.64e+5, val_loss=-2.62e+5, train_loss_epoch=-2

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=9465)[0m 
[36m(RayTrainWorker pid=9465)[0m Validation DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.49it/s][A
[36m(RayTrainWorker pid=9465)[0m                                                                       [AEpoch 19: 100%|██████████| 59/59 [01:00<00:00,  0.97it/s, v_num=0, train_loss_step=-2.89e+5, val_loss=-2.83e+5, train_loss_epoch=-2.79e+5]
[36m(RayTrainWorker pid=9465)[0m Epoch 19: 100%|██████████| 59/59 [01:00<00:00,  0.97it/s, v_num=0, train_loss_step=-2.89e+5, val_loss=-2.83e+5, train_loss_epoch=-2.89e+5]


[36m(RayTrainWorker pid=9465)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35/b3fd7458/checkpoint_000019)
[36m(RayTrainWorker pid=9465)[0m `Trainer.fit` stopped: `max_epochs=20` reached.


[36m(RayTrainWorker pid=9465)[0m Epoch 19: 100%|██████████| 59/59 [01:01<00:00,  0.97it/s, v_num=0, train_loss_step=-2.89e+5, val_loss=-2.83e+5, train_loss_epoch=-2.89e+5]
[36m(RayTrainWorker pid=8293)[0m Epoch 3:   2%|▏         | 1/59 [00:07<07:14,  0.13it/s, v_num=0, train_loss_step=-5.78e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5]Epoch 3:   2%|▏         | 1/59 [00:07<07:14,  0.13it/s, v_num=0, train_loss_step=-5.3e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5] 

Trial TorchTrainer_b3fd7458 completed after 20 iterations at 2025-05-05 11:46:13. Total running time: 47min 37s
+----------------------------------------------------------+
| Trial TorchTrainer_b3fd7458 result                       |
+----------------------------------------------------------+
| checkpoint_dir_name                    checkpoint_000019 |
| time_this_iter_s                                 61.0162 |
| time_total_s                                  1317.78094 |
| training_iteration                  

[36m(TorchTrainer pid=14937)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=14937)[0m - (node_id=85d1826776a8b04eb71395ec08130d8b282f617a724b5c6551ccdd9f, ip=172.28.0.12, pid=15066) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=15066)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=8293)[0m Epoch 3:  10%|█         | 6/59 [00:48<07:11,  0.12it/s, v_num=0, train_loss_step=-6.54e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5]Epoch 3:  10%|█         | 6/59 [00:48<07:11,  0.12it/s, v_num=0, train_loss_step=-2.91e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5]


[36m(RayTrainWorker pid=15066)[0m Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
[36m(RayTrainWorker pid=15066)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=15066)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=15066)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=15066)[0m 2025-05-05 11:46:56.679300: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=15066)[0m E0000 00:00:1746445616.736201   15183 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=15066)[0m E0000 00:00:1746445616.751603   15183 cuda_blas.cc:1418] Unable to register cuBLAS factory: At

[36m(RayTrainWorker pid=8293)[0m Epoch 3:  12%|█▏        | 7/59 [00:55<06:49,  0.13it/s, v_num=0, train_loss_step=-2.91e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5]Epoch 3:  12%|█▏        | 7/59 [00:55<06:49,  0.13it/s, v_num=0, train_loss_step=-6.75e+5, val_loss=-2.29e+5, train_loss_epoch=-1.6e+5]


2025-05-05 11:47:01,830	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/content/chemprop/examples/hpopt/ray_results/TorchTrainer_2025-05-05_10-58-35' in 0.0137s.


Trial status: 3 TERMINATED | 2 RUNNING | 1 PENDING
Current time: 2025-05-05 11:47:01. Total running time: 48min 26s
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Current best trial: 8da927e1 with val_loss=-533901632.0 and params={'train_loop_config': {'depth': 2, 'ffn_hidden_dim': 2000, 'ffn_num_layers': 2, 'message_hidden_dim': 500}}
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         ...loop_config/depth     ...ig/ffn_hidden_dim     ...ig/ffn_num_layers     ...essage_hidden_dim     iter     total time (s)          train_loss     train_loss_step     val/f1            val_loss |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[36m(RayTrainWorker pid=15066)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=15066)[0m 
[36m(RayTrainWorker pid=15066)[0m   | Name            | Type                    | Params | Mode 
[36m(RayTrainWorker pid=15066)[0m --------------------------------------------------------------------
[36m(RayTrainWorker pid=15066)[0m 0 | message_passing | BondMessagePassing      | 3.1 M  | train
[36m(RayTrainWorker pid=15066)[0m 1 | agg             | MeanAggregation         | 0      | train
[36m(RayTrainWorker pid=15066)[0m 2 | bn              | BatchNorm1d             | 2.4 K  | train
[36m(RayTrainWorker pid=15066)[0m 3 | predictor       | BinaryClassificationFFN | 1.7 M  | train
[36m(RayTrainWorker pid=15066)[0m 4 | X_d_transform   | Identity                | 0      | train
[36m(RayTrainWorker pid=15066)[0m 5 | metrics         | ModuleList              | 0      | train
[36m(RayTrainWorker pid=15066)[0m --------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:05<00:05,  0.20it/s]


## Hyperparameter optimization results

In [None]:
# Display the tuning results; the repr lists one Result per trial with its
# metrics, results path, filesystem, and checkpoint.
results

ResultGrid<[
  Result(
    metrics={'train_loss': 0.09904231131076813, 'train_loss_step': 0.16821686923503876, 'val/rmse': 0.8613682389259338, 'val/mae': 0.7006751298904419, 'val_loss': 0.7419552206993103, 'train_loss_epoch': 0.09904231131076813, 'epoch': 19, 'step': 40},
    path='/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000019)
  ),
  Result(
    metrics={'train_loss': 0.06969495117664337, 'train_loss_step': 0.11989812552928925, 'val/rmse': 0.902579665184021, 'val/mae': 0.7176367044448853, 'val_loss': 0.8146500587463379, 'train_loss_epoch': 0.06969495117664337, 'epoch': 19, 'step': 40},
    path='/home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/d775c15d',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, pat

In [None]:
# Results of all trials as a DataFrame: one row per trial, holding the reported
# metrics plus the sampled hyperparameters in `config/train_loop_config/*` columns.
result_df = results.get_dataframe()
result_df

Unnamed: 0,train_loss,train_loss_step,val/rmse,val/mae,val_loss,train_loss_epoch,epoch,step,timestamp,checkpoint_dir_name,...,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/depth,config/train_loop_config/ffn_hidden_dim,config/train_loop_config/ffn_num_layers,config/train_loop_config/message_hidden_dim,logdir
0,0.099042,0.168217,0.861368,0.700675,0.741955,0.099042,19,40,1729602279,checkpoint_000019,...,24873,Knathan-Laptop,172.31.231.162,49.881516,20,2,2000,2,500,f1a6e41a
1,0.069695,0.119898,0.90258,0.717637,0.81465,0.069695,19,40,1729602299,checkpoint_000019,...,24953,Knathan-Laptop,172.31.231.162,56.653336,20,2,2200,2,400,d775c15d


In [None]:
# Best configuration.
# get_best_result() is called without metric/mode, so it relies on whatever
# defaults the tuning run was configured with — presumably `val_loss`, the
# metric shown in the progress log; TODO confirm against the tuner setup.
best_result = results.get_best_result()
best_config = best_result.config
# Show only the hyperparameters nested under the 'train_loop_config' key.
best_config['train_loop_config']

{'depth': 2,
 'ffn_hidden_dim': 2000,
 'ffn_num_layers': 2,
 'message_hidden_dim': 500}

In [None]:
# Best model checkpoint path.
# Same get_best_result() call as in the best-configuration cell, repeated here.
best_result = results.get_best_result()
# `checkpoint.path` is the checkpoint directory; "checkpoint.ckpt" is presumably
# the Lightning checkpoint file written inside it by the training callback — verify.
best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
print(f"Best model checkpoint path: {best_checkpoint_path}")

Best model checkpoint path: /home/knathan/chemprop/examples/hpopt/ray_results/TorchTrainer_2024-10-22_09-03-37/f1a6e41a/checkpoint_000019/checkpoint.ckpt


In [None]:
# Shut down Ray once the results have been inspected.
ray.shutdown()