In [1]:
import numpy as np
import pandas as pd
from lightning import pytorch as pl
from chemprop import data, featurizers, models, nn, utils
import random
import os
import torch
def set_random_seed(seed:int)-> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN
    to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all of the generators listed above.
    """
    # cuDNN: request deterministic kernels and turn the autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Environment values must be text, hence the str() conversion.
    # NOTE(review): setting this at runtime presumably does not change hashing
    # in the already-running interpreter - confirm if hash order matters.
    os.environ.update(PYTHONHASHSEED=str(seed))

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Seed all generators once, in the first cell, ahead of the cells that follow.
seed = 0
set_random_seed(seed=seed)

In [2]:
# Load the training table and pull out the SMILES strings and the target(s).
df_input = pd.read_csv('dataset.csv')
targetname = ['PCE']
smis = df_input.loc[:, 'SMILES'].values
ys = df_input.loc[:, targetname].values
mols = [utils.make_mol(smi, keep_h=False, add_h=False) for smi in smis]

# Every column that is neither the SMILES string nor a target is used as an
# extra molecule-level descriptor. Selecting by name instead of by position
# (`iloc[:, 2:]`) keeps the target and the SMILES text out of the model inputs
# even if the CSV column order changes; when SMILES and the target are the
# first two columns the result is identical to the positional slice.
extra_mol_descriptors = df_input.drop(columns=['SMILES', *targetname]).to_numpy()

# One datapoint per row: molecule, target vector `y` and descriptor vector `x_d`.
# No extra atom/bond features or atom descriptors are supplied (V_f/E_f/V_d=None).
datapoints = [
    data.MoleculeDatapoint(mol, y, V_f=None, E_f=None, V_d=None, x_d=X_d)
    for mol, y, X_d in zip(mols, ys, extra_mol_descriptors)
]

In [3]:
# Graph featurizer shared by all three splits; no extra atom/bond feature
# dimensions are requested.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer(extra_atom_fdim=0, extra_bond_fdim=0)

# Random 80/10/10 split of the datapoints into train/validation/test.
split_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))
train_indices, val_indices, test_indices = split_indices
train_data, val_data, test_data = data.split_data_by_indices(datapoints, *split_indices)

# Wrap each subset in a dataset that featurizes molecules with `featurizer`.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(subset, featurizer)
    for subset in (train_data, val_data, test_data)
)

In [4]:
# Targets: the scaler comes from the training split and is then passed to the
# validation split, so validation data is scaled with training statistics.
targets_scaler = train_dset.normalize_targets()
val_dset.normalize_targets(targets_scaler)

# Extra descriptors ("X_d"): same pattern as for the targets.
# NOTE(review): test_dset is not scaled in this cell; presumably the
# transforms attached to the model in later cells take care of that at
# inference time - confirm against the chemprop documentation.
extra_datapoint_descriptors_scaler = train_dset.normalize_inputs("X_d")
val_dset.normalize_inputs("X_d", extra_datapoint_descriptors_scaler)

In [5]:
# Turn on caching for the train and val datasets to save computation time.
for cached_dset in (train_dset, val_dset):
    cached_dset.cache = True

# The training loader keeps build_dataloader's default `shuffle` setting;
# the validation and test loaders explicitly disable shuffling.
train_loader = data.build_dataloader(train_dset)
val_loader, test_loader = (
    data.build_dataloader(eval_dset, shuffle=False)
    for eval_dset in (val_dset, test_dset)
)

In [6]:
# Bond message passing with depth=6; the hidden size (d_h) is not passed and
# therefore stays at the library default.
mp = nn.BondMessagePassing(depth=6)

In [7]:
# FFN input width = message-passing output width + number of extra descriptors.
n_extra_descriptors = extra_mol_descriptors.shape[1]
ffn_input_dim = mp.output_dim + n_extra_descriptors

# Built from the target scaler so the FFN output can be mapped back from the
# standardised target space (presumably to original PCE units - confirm).
output_transform = nn.UnscaleTransform.from_standard_scaler(targets_scaler)

ffn = nn.RegressionFFN(
    input_dim=ffn_input_dim,
    hidden_dim=1000,
    dropout=0,
    n_layers=3,
    output_transform=output_transform,
)

In [8]:
# Descriptor scaling transform built from the scaler fitted on the training split.
X_d_transform = nn.ScaleTransform.from_standard_scaler(extra_datapoint_descriptors_scaler)

# Assemble the model: message passing -> aggregation -> regression FFN.
agg = nn.NormAggregation()
chemprop_model = models.MPNN(mp, agg, ffn, X_d_transform=X_d_transform)

## Training and prediction

In [9]:
# Trainer for fitting: fixed budget of 1000 epochs, no logger and no
# checkpoints, so the weights after the last epoch are the ones used afterwards.
# NOTE(review): the recorded run ends with train_loss=0.000356 and
# val_loss=8.370 (the two may not be on the same scale - verify); if the gap
# is real, consider early stopping or best-checkpoint selection.
trainer = pl.Trainer(
    max_epochs=1000,
    enable_progress_bar=True,
    enable_checkpointing=False,
    logger=False,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# Fit on the training loader, evaluating on the validation loader.
trainer.fit(
    model=chemprop_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
d:\anaconda3\envs\chemprop2\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.

  | Name            | Type               | Params
-------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K 
1 | agg             | NormAggregation    | 0     
2 | bn   

Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

d:\anaconda3\envs\chemprop2\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 999: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s, train_loss=0.000356, val_loss=8.370] 

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 999: 100%|██████████| 3/3 [00:00<00:00, 58.98it/s, train_loss=0.000356, val_loss=8.370]


In [11]:
# Evaluate the fitted model on the held-out test loader.
results = trainer.test(model=chemprop_model, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
d:\anaconda3\envs\chemprop2\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 125.33it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 batch_averaged_test/mse    3.2876479625701904
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [12]:
# Load the molecules to predict on and build RDKit molecules from their SMILES.
new_mols = pd.read_csv('new-mol-data.csv')
new_smiles = new_mols['SMILES']
test_mols = [utils.make_mol(new, keep_h=False, add_h=False) for new in new_smiles]

# NOTE(review): the positional slice assumes new-mol-data.csv has the same
# column layout as dataset.csv (two leading non-descriptor columns) - confirm.
test_feats = np.array(new_mols.iloc[:,2:])

# Fail early with a readable message if the descriptor count differs from what
# the model was built for (ffn_input_dim uses extra_mol_descriptors.shape[1]).
if test_feats.shape[1] != extra_mol_descriptors.shape[1]:
    raise ValueError(
        f"new-mol-data.csv provides {test_feats.shape[1]} descriptor columns, "
        f"but the model was built for {extra_mol_descriptors.shape[1]}"
    )

# NOTE: `test_data`, `test_dset`, `test_loader` and `featurizer` are reassigned
# here. From this cell on they refer to the new, unlabeled molecules rather
# than the held-out split, so re-create the split objects before calling
# `trainer.test` again.
test_data = [
    data.MoleculeDatapoint(mol, V_f=None, E_f=None, V_d=None, x_d=X_d)
    for mol, X_d in zip(test_mols, test_feats)
]

featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
test_dset = data.MoleculeDataset(test_data, featurizer=featurizer)
test_loader = data.build_dataloader(test_dset, shuffle=False)

In [13]:
# Predict on the loader built in the previous cell using a fresh Trainer.
# - logger=False: `logger=None` falls back to Lightning's default logger, which
#   is what produced the "Missing logger folder ... lightning_logs" message in
#   the recorded output; False matches the training Trainer and writes nothing.
# - accelerator="auto": uses the GPU when one is available but still runs on
#   CPU-only machines, like the training Trainer (which sets no accelerator).
with torch.inference_mode():
    trainer = pl.Trainer(
        logger=False,
        enable_progress_bar=True,
        accelerator="auto",
        devices=1
    )
    # Result is kept exactly as returned by `trainer.predict`.
    test_preds = trainer.predict(chemprop_model, test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


d:\anaconda3\envs\chemprop2\Lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
Missing logger folder: e:\other\Colleague\zhangqi\PTZ\article\code\chemprop\chemprop_v2\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
d:\anaconda3\envs\chemprop2\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 200.54it/s]


In [14]:
# Optional, currently disabled: persist the trained model to 'mpnn_v2.ckpt'.
# NOTE(review): passing the whole module to torch.save pickles the object, so
# loading it back presumably needs the same chemprop/torch code available -
# confirm, and prefer the library's own save helper if one exists.
# torch.save(chemprop_model,'mpnn_v2.ckpt')