### Initialize

In [1]:
# Mount Google Drive so the ChemXTree package and datasets stored under
# /content/drive/MyDrive/ are reachable from this Colab runtime.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [2]:
!pip install chemprop==1.6.* rdkit-pypi==2022.9.* lightgbm==4.0.* xgboost==2.0.* omegaconf==2.3.* einops==0.7.* lightning_lite==1.8. pytorch-lightning==1.8.* category_encoders==2.6.*

Collecting chemprop==1.6.*
  Downloading chemprop-1.6.1-py3-none-any.whl (166 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit-pypi==2022.9.*
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightgbm==4.0.*
  Downloading lightgbm-4.0.0-py3-none-manylinux_2_28_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
Collecting omegaconf==2.3.*
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops==0.7.*
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━

In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [4]:
import sys
sys.path.append("/content/drive/MyDrive/")
from ChemXTree.MPNN.mpnn.MPNN_Pipeline import MPNNPipeline
import pandas as pd
import optuna
from optuna.samplers import RandomSampler
from ChemXTree.GMFU import TabularModel
from ChemXTree.GMFU.model import GateModulationFeatureUnitConfig
from ChemXTree.GMFU.configuration import (
    DataConfig, OptimizerConfig, TrainerConfig
    )
from ChemXTree.GMFU.model.blocks.heads import LinearHeadConfig
from ChemXTree.GMFU.utils import get_class_weighted_cross_entropy
from ChemXTree.ChemXTree_Pipeline import ChemXTreePipeline
from sklearn.metrics import roc_auc_score

In [5]:
# Global Variables
# Everything below is derived from DATASET_NAME, so switching datasets only
# requires changing this one value.
DATASET_NAME = "CYP2C9"

# Dataset folder on the mounted Drive (trailing slash is relied on below).
BASE_PATH = f"/content/drive/MyDrive/ChemXTree/Datasets/{DATASET_NAME}/"
CONFIG_PATH = f"{BASE_PATH}optimized_config.json"
CHECKPOINTS_DIR = f"{BASE_PATH}{DATASET_NAME}_checkpoints"

# Split files follow the "<split><DATASET_NAME>.csv" naming scheme.
train_file, test_file, val_file = (
    f"{BASE_PATH}{split}{DATASET_NAME}.csv"
    for split in ("train", "test", "valid")
)

### Ensemble Pipeline

In [6]:
# Set up the pipeline

# Score cut-off handed to the pipeline as `score_threshold`.
# NOTE(review): the recorded run prints "Model not saved" for an AUC below this
# value, so it presumably gates checkpoint saving - confirm in ChemXTreePipeline.
SCORE_THRESHOLD = 0.69

# Options forwarded to the MPNN stage. The recorded log shows `--num_iters 1`
# reaching chemprop_hyperopt, i.e. a single hyperparameter-search trial.
MPNN_KWARGS = {
    "num_iters": 1,
    "ensemble_size": 1,
    "run_env": "colab"
}

# Options forwarded to the GMFU stage, grouped per configuration object.
GMFU_KWARGS = {
    "data_config": {
        "target": ["targets"],
        "categorical_cols": [],
        "num_workers": 4
    },
    "model_config": {
        "task": "classification",
        "n_heads": 2,
        "head_config": {
            # NOTE(review): empty string presumably means "no hidden layers" - confirm
            # against LinearHeadConfig.
            "layers": "",
            "dropout": 0.1,
            "initialization": "kaiming"
        },
        # One entry in `metrics_params` per entry in `metrics`, in the same order.
        "metrics": ["accuracy", "auroc"],
        "metrics_params": [
            {"task": "multiclass", "num_classes": 2},
            {"task": "multiclass", "num_classes": 2}
        ]
    },
    "trainer_config": {
        "auto_lr_find": True,
        "max_epochs": 10,
        # NOTE(review): "gpus" overlaps with accelerator/devices in trainer_kwargs -
        # confirm which of the two TrainerConfig actually honours.
        "gpus": 1,
        "checkpoints": None,
        "trainer_kwargs": {
            "accelerator": "gpu",
            "devices": 1,
            "strategy": "dp",
            "num_nodes": 1
        }
    },
    "optimizer_config": {
        "optimizer": "AdamW",
        "optimizer_params": {},
        "lr_scheduler": "ReduceLROnPlateau",
        "lr_scheduler_params": {
            # "min" because the monitored quantity below is a loss.
            "mode": "min",
            "factor": 0.2,
            "patience": 5,
            # NOTE(review): cooldown equals max_epochs, so the scheduler can fire
            # at most once per run - confirm this is intended.
            "cooldown": 10,
            "verbose": True
        },
        "lr_scheduler_monitor_metric": "valid_loss"
    }
}


# Example usage
pipeline = ChemXTreePipeline(
    dataset_name=DATASET_NAME,
    base_path=BASE_PATH,
    mpnn_kwargs=MPNN_KWARGS,
    gmfu_kwargs=GMFU_KWARGS,
    score_threshold=SCORE_THRESHOLD
    )
# Calling the pipeline object runs the ensemble and returns its score.
score = pipeline()
print(f"ChemXTree AUCROC score: {score}")

Creating search space using parameters ['linked_hidden_size', 'ffn_num_layers', 'dropout', 'depth'].
No manual trials loaded as part of hyperparameter search
Initiating trial with seed 0
Loaded 0 previous trials
Parameters assigned with TPE directed search
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]Command line

python /usr/local/bin/chemprop_hyperopt --dataset_type classification --num_iters 1 --data_path /content/drive/MyDrive/ChemXTree/Datasets/CYP2C9/trainCYP2C9.csv --config_save_path /content/drive/MyDrive/ChemXTree/Datasets/CYP2C9/optimized_config.json

Args

{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints

2023-11-20 09:48:53,762 - {ChemXTree.GMFU.tabular_model:100} - INFO - Experiment Tracking is turned off
INFO:ChemXTree.GMFU.tabular_model:Experiment Tracking is turned off
INFO:lightning_lite.utilities.seed:Global seed set to 42
2023-11-20 09:48:54,215 - {ChemXTree.GMFU.tabular_model:441} - INFO - Preparing the DataLoaders
INFO:ChemXTree.GMFU.tabular_model:Preparing the DataLoaders
2023-11-20 09:48:54,222 - {GMFU.tabular_data_modules.datamodule:285} - INFO - Setting up the datamodule for classification task
INFO:GMFU.tabular_data_modules.datamodule:Setting up the datamodule for classification task
2023-11-20 09:48:54,702 - {ChemXTree.GMFU.tabular_model:484} - INFO - Preparing the Model: GateModulationFeatureUnitEnsembleModel
INFO:ChemXTree.GMFU.tabular_model:Preparing the Model: GateModulationFeatureUnitEnsembleModel
2023-11-20 09:48:56,492 - {ChemXTree.GMFU.tabular_model:255} - INFO - Preparing the Trainer
INFO:ChemXTree.GMFU.tabular_model:Preparing the Trainer
2023-11-20 09:48:56,555

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.tuner.lr_finder:LR finder stopped early after 80 steps due to diverging loss.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 3.630780547701014e-06
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/.lr_find_f7ff0955-8d65-43f5-8cb8-e6261f9bdd3a.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint file at /content/.lr_find_f7ff0955-8d65-43f5-8cb8-e6261f9bdd3a.ckpt
2023-11-20 09:50:59,100 - {ChemXTree.GMFU.tabular_model:539} - INFO - Suggested LR: 3.630780547701014e-06. For plot and detailed analysis, use `find_learning_rate` method.
INFO:ChemXTree.GMFU.tabular_model:Suggested LR: 3.630780547701014e-06. For plot and detailed analysis, use `find_learning_rate` method.
2023-11-20 09:50:59,110 - {ChemXTree.GMFU.tabular_model:544} - INFO - Training Started
INFO:ChemXTree.GMFU.tabular_mode

Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


2023-11-20 09:52:58,270 - {ChemXTree.GMFU.tabular_model:546} - INFO - Training the model completed
INFO:ChemXTree.GMFU.tabular_model:Training the model completed


Output()



Model not saved. AUC: 0.6223606784354448
Ensemble completed with score: 0.6223606784354448
ChemXTree AUCROC score: 0.6223606784354448
