## INIT

In [None]:
# Mount Google Drive so the datasets and the chemprop working directory are reachable.
try:
  # Imported inside the try so that a runtime without the google.colab package
  # reaches the hint below instead of stopping on an ImportError.
  from google.colab import drive
  drive.mount('/content/drive')
except Exception as err:  # not a bare `except:`, so kernel interrupts still propagate
  print("can't mount drive. Be sure you are not using a local runtime")
  print(f"  reason: {type(err).__name__}: {err}")


Mounted at /content/drive


In [None]:
# Work from the chemprop folder on Drive; later cells resolve paths relative to
# the current working directory (e.g. `Path.cwd().parent`).
%cd /content/drive/MyDrive/Colab\ Notebooks/chemprop

/content/drive/MyDrive/Colab Notebooks/chemprop


In [None]:
%pip install rdkit descriptastorus astartes lightning
import chemprop

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Collecting descriptastorus
  Downloading descriptastorus-2.8.0-py3-none-any.whl.metadata (364 bytes)
Collecting astartes
  Downloading astartes-1.3.3-py3-none-any.whl.metadata (26 kB)
Collecting lightning
  Downloading lightning-2.6.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.9/44.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting pandas-flavor (from descriptastorus)
  Downloading pandas_flavor-0.8.1-py3-none-any.whl.metadata (6.6 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting torchmetrics<3.0,>0.7.0 (from lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting pytorch-lightning (from lightning)
  Download

# Training Classification

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chemprop/chemprop/blob/main/examples/training_classification.ipynb)

In [None]:
# Install chemprop from GitHub if running in Google Colab
import os

if os.getenv("COLAB_RELEASE_TAG"):
    try:
        import chemprop
    except ImportError:
        !git clone https://github.com/chemprop/chemprop.git
        %cd chemprop
        !pip install .
        %cd examples

# Import packages

In [None]:
import pandas as pd
from pathlib import Path

from lightning import pytorch as pl

from chemprop import data, featurizers, models, nn

# Change data inputs here

In [None]:
# Example inputs: a CSV from the test data under the parent of the working directory.
# A later configuration cell reassigns every one of these names.
chemprop_dir = Path.cwd().parent
input_path = chemprop_dir.joinpath("tests", "data", "classification", "mol.csv")  # path to your data .csv file
smiles_column = "smiles"  # name of the column containing SMILES strings
target_columns = ["NR-AhR", "NR-ER", "SR-ARE", "SR-MMP"]  # classification of activity (either 0 or 1)
num_workers = 0  # number of workers for dataloader. 0 means using main process for data loading

In [None]:
# Inputs for the in-silico drug dataset stored on Google Drive.
# Note: the data-loading cell recomputes `target_columns` from the CSV itself.
input_path = "/content/drive/MyDrive/insilicodrug/insilicodrug_dataset.csv"  # path to your data .csv file
smiles_column = "smiles"  # name of the column containing SMILES strings
target_columns = ["Carcinogenicity"]  # list of names of the columns containing targets
num_workers = 0  # number of workers for dataloader. 0 means using main process for data loading
chemprop_dir = Path.cwd().parent

## Load data

In [None]:
%pip install molvs
from rdkit import Chem
from rdkit.Chem import Descriptors
from molvs import standardize_smiles, validate_smiles

import pandas as pd
from tqdm import tqdm
tqdm.pandas()

def getMol(smi, missingVal = None):
    """Parse a SMILES string into an RDKit molecule.

    Args:
        smi: SMILES string to parse.
        missingVal: value to return when no molecule can be built.

    Returns:
        The molecule returned by ``Chem.MolFromSmiles(smi)``, or ``missingVal``
        when that call raises or yields ``None``.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Raised for inputs the parser cannot accept at all (e.g. a non-string cell).
        return missingVal
    # An unparsable SMILES is reported by a None result rather than an exception,
    # so map that case to missingVal as well to keep the fallback consistent.
    return missingVal if mol is None else mol

Collecting molvs
  Downloading MolVS-0.1.1.tar.gz (61 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/61.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.9/61.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: molvs
  Building wheel for molvs (setup.py) ... [?25l[?25hdone
  Created wheel for molvs: filename=MolVS-0.1.1-py3-none-any.whl size=32374 sha256=101e973e5ead5b31059dc4c2049c3a296ff73c21c00851fd78e30f9f678b01db
  Stored in directory: /root/.cache/pip/wheels/6b/a0/74/34bf3a6e5dddae8acf68fd9074fbb035115913764a2247c22c
Successfully built molvs
Installing collected packages: molvs
Successfully installed molvs-0.1.1


In [None]:
df_input = pd.read_csv(input_path)
# Every column with exactly two distinct non-null values is treated as a binary task.
# This replaces whatever `target_columns` held before. The SMILES column is excluded
# explicitly so it can never be selected as a target (it would then appear twice in
# the column selection below).
target_columns = [
    col for col in df_input.columns
    if col != smiles_column and len(df_input[col].dropna().unique()) == 2
]

# Keep only rows whose SMILES yields a molecule. `.copy()` makes the filtered frame
# independent, so the column assignment below is not a write to a slice.
has_mol = df_input[smiles_column].apply(getMol, missingVal = None).notna()
df_input = df_input[has_mol].copy()
df_input[smiles_column] = df_input[smiles_column].progress_apply(standardize_smiles)
df_input = df_input[[smiles_column] + target_columns]
# Drop rows that carry no label for any target.
df_input = df_input[~ df_input[target_columns].isna().all(axis = 1)]
df_input

[10:55:30] Explicit valence for atom # 1 Si, 5, is greater than permitted
[10:55:31] Explicit valence for atom # 4 Al, 6, is greater than permitted
[10:55:31] Explicit valence for atom # 4 Al, 6, is greater than permitted
[10:55:32] Explicit valence for atom # 3 N, 5, is greater than permitted
[10:55:33] Explicit valence for atom # 15 P, 7, is greater than permitted
[10:55:33] Explicit valence for atom # 25 P, 7, is greater than permitted
[10:55:33] Explicit valence for atom # 25 P, 7, is greater than permitted
[10:55:33] Explicit valence for atom # 29 P, 7, is greater than permitted
[10:55:33] Explicit valence for atom # 5 N, 4, is greater than permitted
[10:55:33] Explicit valence for atom # 4 P, 7, is greater than permitted
[10:55:34] Explicit valence for atom # 28 P, 7, is greater than permitted
[10:55:34] Explicit valence for atom # 20 Al, 6, is greater than permitted
[10:55:34] Explicit valence for atom # 11 P, 7, is greater than permitted
[10:55:34] Explicit valence for atom # 5

Unnamed: 0,smiles,Carcinogenicity,Ames Mutagenicity,Respiratory toxicity,Eye irritation,Eye corrosion,Cardiotoxicity1,Cardiotoxicity10,Cardiotoxicity30,Cardiotoxicity5,...,CYP3A4_balanced,CYP1A2_balanced,CYP2C19_balanced,Hepatotoxicity_clean,rep_tox,fdamdd2,f20,f30,skin_disorder(ochem),rat_tox_bin
1,B12B3B4B1C234,,,,,,,,,,...,,,,,,,,,,0.0
2,Bc1c(Br)cncc1Br,,,,,,,,,,...,,,,,,,,,,
3,Br/C(=N\Nc1nnn[nH]1)c1ccncc1,,,,,,,,,,...,,,,,,,,,,
5,Br/C=C/Br,,1.0,,,,,,,,...,,,,,,,,,,
8,BrBr,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96193,CCCCc1ncc(/C=C(\Cc2cccs2)C(=O)O)n1Cc1ccc(C(=O)...,,,,,,,,,,...,,,,,,,,,,
96194,CN1C(C(=O)Nc2nccs2)=C(O)c2ccccc2S1(=O)=O,,,,,,,,,,...,,,,,,,,,,
96195,O=C(O)CCC(=O)Nc1ccc(S(=O)(=O)Nc2nccs2)cc1,,,,,,,,,,...,,,,,,,,,,
96196,O=C(O)c1ccccc1C(=O)Nc1ccc(S(=O)(=O)Nc2nccs2)cc1,,,,,,,,,,...,,,,,,,,,,


## Get SMILES and targets

In [None]:
# SMILES strings as a 1-D array and labels as a 2-D array (one column per target).
smis = df_input[smiles_column].values
ys = df_input[target_columns].values

In [None]:
# Take a look at the first 5 SMILES strings and the first 5 rows of target values
smis[:5], ys[:5]

(array(['B12B3B4B1C234', 'Bc1c(Br)cncc1Br',
        'Br/C(=N\\Nc1nnn[nH]1)c1ccncc1', 'Br/C=C/Br', 'BrBr'], dtype=object),
 array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan,  0.],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          1., nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan,  0.,  0.,  0., nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
    

## Get molecule datapoints

In [None]:
# One MoleculeDatapoint per row, built from its SMILES string and its row of target values.
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]



## Perform data splitting for training, validation, and testing

In [None]:
# Available split types (names of the entries in data.SplitType)
list(data.SplitType.keys())

['SCAFFOLD_BALANCED',
 'RANDOM_WITH_REPEATED_SMILES',
 'RANDOM',
 'KENNARD_STONE',
 'KMEANS']

In [None]:
mols = [d.mol for d in all_data]  # RDKit Mol objects are used for structure-based splits
# "random" split into train / validation / test with fractions 0.8 / 0.1 / 0.1.
# NOTE(review): the next cell takes element [0] of each result, so these appear to be
# lists of splits rather than flat lists — confirm against the chemprop version in use.
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)



## Get MoleculeDataset

In [None]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# One dataset per split, all sharing the same featurizer.
# Only element [0] of each split is used.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(split[0], featurizer)
    for split in (train_data, val_data, test_data)
)

## Get DataLoader

In [None]:
# The training loader keeps the library's default shuffling;
# validation and test loaders are built with shuffling switched off.
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader, test_loader = (
    data.build_dataloader(dset, num_workers=num_workers, shuffle=False)
    for dset in (val_dset, test_dset)
)



# Change Message-Passing Neural Network (MPNN) inputs here

## Message Passing
A `MessagePassing` module runs message passing over the molecular graph to learn node-level hidden representations.

Options are `mp = nn.BondMessagePassing()` or `mp = nn.AtomMessagePassing()`

In [None]:
# Bond-based message passing, constructed with no arguments (library defaults).
mp = nn.BondMessagePassing()

## Aggregation
An `Aggregation` is responsible for constructing a graph-level representation from the set of node-level representations after message passing.

Available options can be found in `nn.agg.AggregationRegistry`, including
- `agg = nn.MeanAggregation()`
- `agg = nn.SumAggregation()`
- `agg = nn.NormAggregation()`

In [None]:
# List the registered aggregation names and the classes they map to.
print(nn.agg.AggregationRegistry)

ClassRegistry {
    'mean': <class 'chemprop.nn.agg.MeanAggregation'>,
    'sum': <class 'chemprop.nn.agg.SumAggregation'>,
    'norm': <class 'chemprop.nn.agg.NormAggregation'>
}


In [None]:
# Mean aggregation of the node-level representations into one graph-level vector.
agg = nn.MeanAggregation()

## Feed-Forward Network (FFN)

An `FFN` takes the aggregated representations and makes target predictions.

Available options can be found in `nn.PredictorRegistry`.

For regression:
- `ffn = nn.RegressionFFN()`
- `ffn = nn.MveFFN()`
- `ffn = nn.EvidentialFFN()`

For classification:
- `ffn = nn.BinaryClassificationFFN()`
- `ffn = nn.BinaryDirichletFFN()`
- `ffn = nn.MulticlassClassificationFFN()`
- `ffn = nn.MulticlassDirichletFFN()`

For spectral:
- `ffn = nn.SpectralFFN()` # will be available in future version

In [None]:
# List the registered predictor names and the classes they map to.
print(nn.PredictorRegistry)

ClassRegistry {
    'regression': <class 'chemprop.nn.predictors.RegressionFFN'>,
    'regression-mve': <class 'chemprop.nn.predictors.MveFFN'>,
    'regression-evidential': <class 'chemprop.nn.predictors.EvidentialFFN'>,
    'regression-quantile': <class 'chemprop.nn.predictors.QuantileFFN'>,
    'classification': <class 'chemprop.nn.predictors.BinaryClassificationFFN'>,
    'classification-dirichlet': <class 'chemprop.nn.predictors.BinaryDirichletFFN'>,
    'multiclass': <class 'chemprop.nn.predictors.MulticlassClassificationFFN'>,
    'multiclass-dirichlet': <class 'chemprop.nn.predictors.MulticlassDirichletFFN'>,
    'spectral': <class 'chemprop.nn.predictors.SpectralFFN'>
}


In [None]:
# Binary-classification head with one task per target column.
ffn = nn.BinaryClassificationFFN(n_tasks = len(target_columns))

## Batch Norm
A `Batch Norm` normalizes the outputs of the aggregation by re-centering and re-scaling.

Set `batch_norm` to `True` to use batch norm, or to `False` to skip it.

In [None]:
# Batch norm is switched off for this model.
batch_norm = False

## Metrics
`Metrics` are the ways to evaluate the performance of model predictions.

Available options can be found in `nn.metrics.MetricRegistry`, which is printed below.

In [None]:
# List the registered metric names and the classes they map to.
print(nn.metrics.MetricRegistry)

ClassRegistry {
    'mse': <class 'chemprop.nn.metrics.MSE'>,
    'mae': <class 'chemprop.nn.metrics.MAE'>,
    'rmse': <class 'chemprop.nn.metrics.RMSE'>,
    'bounded-mse': <class 'chemprop.nn.metrics.BoundedMSE'>,
    'bounded-mae': <class 'chemprop.nn.metrics.BoundedMAE'>,
    'bounded-rmse': <class 'chemprop.nn.metrics.BoundedRMSE'>,
    'r2': <class 'chemprop.nn.metrics.R2Score'>,
    'binary-mcc': <class 'chemprop.nn.metrics.BinaryMCCMetric'>,
    'multiclass-mcc': <class 'chemprop.nn.metrics.MulticlassMCCMetric'>,
    'roc': <class 'chemprop.nn.metrics.BinaryAUROC'>,
    'prc': <class 'chemprop.nn.metrics.BinaryAUPRC'>,
    'accuracy': <class 'chemprop.nn.metrics.BinaryAccuracy'>,
    'f1': <class 'chemprop.nn.metrics.BinaryF1Score'>
}


In [None]:
# Metrics to report, given as registry keys and instantiated with default arguments.
# Order matters: 'accuracy' is listed first.
# NOTE(review): an earlier comment in this cell said only the first metric is used
# for training and early stopping — confirm against the chemprop version in use.
metric_names = ['accuracy', 'roc', 'prc', 'binary-mcc', 'f1']
metric_list = [nn.metrics.MetricRegistry[name]() for name in metric_names]
metric_list

[BinaryAccuracy(),
 BinaryAUROC(),
 BinaryAUPRC(),
 BinaryMCCMetric(task_weights=[[1.0]]),
 BinaryF1Score()]

## Construct MPNN

In [None]:
# Assemble the model from the message-passing, aggregation and predictor modules
# chosen above, together with the batch-norm flag and the metric list.
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

# Display the module tree.
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=372, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): Identity()
  (predictor): BinaryClassificationFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=56, bias=True)
      )
    )
    (criterion): BCELoss(task_weights=[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

# Set up trainer

In [None]:
# Trainer for the in-silico drug model: no logger, checkpointing and progress bar on.
trainer = pl.Trainer(
    logger=False,
    enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    enable_progress_bar=True,
    accelerator="auto", # use the GPU when one is available instead of failing on a CPU-only runtime
    devices=1,
    max_epochs=20, # number of epochs to train for
)

INFO: üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores


# Start training

In [None]:
# Train on the training loader, validating on the validation loader.
trainer.fit(mpnn, train_loader, val_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loading `train_dataloader` to estimate number of stepping batches.
INFO:lightning.pytorch.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.


Output()

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


# Test results

In [None]:
# Evaluate the in-memory model object on the test loader and keep the returned metrics.
results = trainer.test(mpnn, test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
# Save the test metrics as a tab-separated file on Google Drive.
pd.DataFrame(results).to_csv('/content/drive/MyDrive/insilicodrug/insilicodrug_results_mpnn.tsv', sep = '\t')

## SIDER

In [None]:
# Inputs for the SIDER dataset stored on Google Drive.
chemprop_dir = Path.cwd().parent
input_path = "/content/drive/MyDrive/insilicodrug/sider.csv" # path to your data .csv file
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'smiles' # name of the column containing SMILES strings
# Every column after the first is taken as a target. Only the header is needed,
# so read zero data rows instead of parsing the whole file.
target_columns = pd.read_csv(input_path, nrows=0).columns[1:].to_list()

In [None]:
df_input = pd.read_csv(input_path)
# Every column with exactly two distinct non-null values is treated as a binary task.
# This replaces whatever `target_columns` held before. The SMILES column is excluded
# explicitly so it can never be selected as a target (it would then appear twice in
# the column selection below).
target_columns = [
    col for col in df_input.columns
    if col != smiles_column and len(df_input[col].dropna().unique()) == 2
]

# Keep only rows whose SMILES yields a molecule. `.copy()` makes the filtered frame
# independent, so the column assignment below is not a write to a slice.
has_mol = df_input[smiles_column].apply(getMol, missingVal = None).notna()
df_input = df_input[has_mol].copy()
df_input[smiles_column] = df_input[smiles_column].progress_apply(standardize_smiles)
df_input = df_input[[smiles_column] + target_columns]
# Drop rows that carry no label for any target.
df_input = df_input[~ df_input[target_columns].isna().all(axis = 1)]
df_input

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1427/1427 [00:04<00:00, 326.28it/s]


Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,NCCNCCNCCNCCN,1,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,0
1,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,0,1,0,0,1,1,1,0,0,...,0,1,1,0,0,0,1,0,1,0
2,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H...,0,1,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,C#C[C@]1(O)CCC2C3CCC4=CC(=O)CCC4C3C(=C)CC21CC,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,1
4,NC(=O)N1c2ccccc2CC(O)c2ccccc21,1,1,0,1,1,1,1,0,1,...,0,1,1,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,C[C@H]1CN(C[C@H](Cc2ccccc2)C(=O)NCC(=O)O)CC[C@...,0,1,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1423,CC[C@H]1OC(=O)[C@H](C)C(=O)[C@H](C)[C@@H](O[C@...,1,1,0,1,1,1,1,0,1,...,0,1,1,1,1,0,1,1,1,1
1424,CCOc1ccc(Cc2cc([C@@H]3O[C@H](CO)[C@@H](O)[C@H]...,1,1,0,0,1,1,1,0,1,...,0,1,0,0,1,0,0,1,1,1
1425,O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc...,0,1,0,1,1,1,1,0,0,...,0,0,0,1,1,0,0,1,1,1


In [None]:
# SMILES strings and label matrix taken from the cleaned SIDER frame.
smis = df_input.loc[:, smiles_column].values
ys = df_input.loc[:, target_columns].values
# One MoleculeDatapoint per row.
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]
mols = [d.mol for d in all_data]  # RDKit Mol objects are used for structure-based splits
# "random" split into train / validation / test with fractions 0.8 / 0.1 / 0.1.
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)



In [None]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# One dataset per split, all sharing the same featurizer.
# Only element [0] of each split is used.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(split[0], featurizer)
    for split in (train_data, val_data, test_data)
)

# The training loader keeps the library's default shuffling;
# validation and test loaders are built with shuffling switched off.
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader, test_loader = (
    data.build_dataloader(dset, num_workers=num_workers, shuffle=False)
    for dset in (val_dset, test_dset)
)

# Model components: bond message passing, mean aggregation, and a binary
# classification head with one task per target column; batch norm is off.
mp = nn.BondMessagePassing()
agg = nn.MeanAggregation()
ffn = nn.BinaryClassificationFFN(n_tasks=len(target_columns))
batch_norm = False

# Metrics are looked up by registry key and instantiated with default arguments.
metric_names = ['accuracy', 'roc', 'prc', 'binary-mcc', 'f1']
metric_list = [nn.metrics.MetricRegistry[name]() for name in metric_names]
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

In [None]:
# Fresh trainer for the SIDER model: no logger, checkpointing and progress bar on.
trainer = pl.Trainer(
    logger=False,
    enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    enable_progress_bar=True,
    accelerator="auto", # use the GPU when one is available instead of failing on a CPU-only runtime
    devices=1,
    max_epochs=20, # number of epochs to train for
)

INFO: üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores


In [None]:
# Train the SIDER model, validating on the validation loader.
trainer.fit(mpnn, train_loader, val_loader)

/usr/local/lib/python3.12/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /content/drive/MyDrive/Colab Notebooks/chemprop/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loading `train_dataloader` to estimate number of stepping batches.
INFO:lightning.pytorch.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.


Output()

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


In [None]:
# Evaluate the in-memory model object on the test loader and keep the returned metrics.
results = trainer.test(mpnn, test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
# Save the SIDER test metrics as a tab-separated file on Google Drive.
pd.DataFrame(results).to_csv('/content/drive/MyDrive/insilicodrug/sider_results_mpnn.tsv', sep = '\t')

## Tox21

In [None]:
# Inputs for the Tox21 dataset stored on Google Drive.
chemprop_dir = Path.cwd().parent
input_path = "/content/drive/MyDrive/insilicodrug/tox21.csv" # path to your data .csv file
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'smiles' # name of the column containing SMILES strings
# Every column except the last two is taken as a target. Only the header is needed,
# so read zero data rows instead of parsing the whole file.
target_columns = pd.read_csv(input_path, nrows=0).columns[:-2].to_list()

In [None]:
df_input = pd.read_csv(input_path)
# Every column with exactly two distinct non-null values is treated as a binary task.
# This replaces whatever `target_columns` held before. The SMILES column is excluded
# explicitly so it can never be selected as a target (it would then appear twice in
# the column selection below).
target_columns = [
    col for col in df_input.columns
    if col != smiles_column and len(df_input[col].dropna().unique()) == 2
]

# Keep only rows whose SMILES yields a molecule. `.copy()` makes the filtered frame
# independent, so the column assignment below is not a write to a slice.
has_mol = df_input[smiles_column].apply(getMol, missingVal = None).notna()
df_input = df_input[has_mol].copy()
df_input[smiles_column] = df_input[smiles_column].progress_apply(standardize_smiles)
df_input = df_input[[smiles_column] + target_columns]
# Drop rows that carry no label for any target.
df_input = df_input[~ df_input[target_columns].isna().all(axis = 1)]
df_input

[16:14:57] Explicit valence for atom # 8 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 3 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 4 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 4 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 9 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 5 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 16 Al, 6, is greater than permitted
[16:14:58] Explicit valence for atom # 20 Al, 6, is greater than permitted
 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 5666/7823 [00:07<00:03, 594.32it/s][16:15:06] Can't kekulize mol.  Unkekulized atoms: 3 10
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7823/7823 [00:11<00:00, 680.37it/s]


Unnamed: 0,smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,CCN1C(=O)NC(c2ccccc2)C1=O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,,,,,,,,0.0,,0.0,,
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,,,,,,,,0.0,,0.0,,
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,1.0,1.0,0.0,0.0,1.0,0.0,,,0.0,0.0,,0.0
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,1.0,1.0,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [None]:
# SMILES strings and label matrix taken from the cleaned Tox21 frame.
smis = df_input.loc[:, smiles_column].values
ys = df_input.loc[:, target_columns].values
# One MoleculeDatapoint per row.
all_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis, ys)]
mols = [d.mol for d in all_data]  # RDKit Mol objects are used for structure-based splits
# "random" split into train / validation / test with fractions 0.8 / 0.1 / 0.1.
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)



In [None]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# One dataset per split, all sharing the same featurizer.
# Only element [0] of each split is used.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(split[0], featurizer)
    for split in (train_data, val_data, test_data)
)

# The training loader keeps the library's default shuffling;
# validation and test loaders are built with shuffling switched off.
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader, test_loader = (
    data.build_dataloader(dset, num_workers=num_workers, shuffle=False)
    for dset in (val_dset, test_dset)
)

# Model components: bond message passing, mean aggregation, and a binary
# classification head with one task per target column; batch norm is off.
mp = nn.BondMessagePassing()
agg = nn.MeanAggregation()
ffn = nn.BinaryClassificationFFN(n_tasks=len(target_columns))
batch_norm = False

# Metrics are looked up by registry key and instantiated with default arguments.
metric_names = ['accuracy', 'roc', 'prc', 'binary-mcc', 'f1']
metric_list = [nn.metrics.MetricRegistry[name]() for name in metric_names]
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

In [None]:
# Fresh trainer for the Tox21 model: no logger, checkpointing and progress bar on.
trainer = pl.Trainer(
    logger=False,
    enable_checkpointing=True, # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    enable_progress_bar=True,
    accelerator="auto", # use the GPU when one is available instead of failing on a CPU-only runtime
    devices=1,
    max_epochs=20, # number of epochs to train for
)

INFO: üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores


In [None]:
# Train the Tox21 model, validating on the validation loader.
trainer.fit(mpnn, train_loader, val_loader)

/usr/local/lib/python3.12/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /content/drive/MyDrive/Colab Notebooks/chemprop/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loading `train_dataloader` to estimate number of stepping batches.
INFO:lightning.pytorch.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.


Output()

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


In [None]:
# Evaluate the in-memory model object on the test loader and keep the returned metrics.
results = trainer.test(mpnn, test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
# Save the Tox21 test metrics as a tab-separated file on Google Drive.
pd.DataFrame(results).to_csv('/content/drive/MyDrive/insilicodrug/tox21_results_mpnn.tsv', sep = '\t')