In [1]:
import json
import random
import sys

import deepchem as dc
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    log_loss,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import RandomizedSearchCV

2024-03-28 15:22:02.343826: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-28 15:22:03.323735: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-28 15:22:03.323867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
  from .autonotebook import tqdm as no

In [2]:
# Location of the full dataset (read later with pandas).
data_path = "../data/ding_et_al/all_data.csv"

# Seed Python's and NumPy's global RNGs with the same value.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Parsed split definition; it is indexed by "train" / "val" / "test" further down.
with open("../data/ding_et_al/split.json") as f:
    split_df = json.load(f)

In [3]:
# --- MPNN setup and methods ---
def _column_one_log_loss(model, data):
    """Return the log-loss of ``model`` on ``data`` using column 1 of its predictions."""
    # assumes model.predict returns (n_samples, 1, 2) so that squeeze() yields
    # (n_samples, 2) and column 1 is the positive-class score — TODO confirm
    scores = model.predict(data).squeeze()[:, 1]
    return log_loss(data.y, scores)


def train(model, train_data, val_data, epochs, patience, epochs_per_round=10, warmup_rounds=50):
    """Fit ``model`` in rounds with validation-loss early stopping.

    Every round calls ``model.fit`` for ``epochs_per_round`` epochs, then
    computes the log-loss on ``train_data`` and ``val_data``. After the warm-up,
    each new best validation loss triggers ``model.save_checkpoint()``.

    Training stops early once more than ``patience`` rounds have run and every
    one of the last ``patience`` validation losses is worse than the best
    checkpointed loss. Whether training stops early or runs through all
    ``epochs`` rounds, the best checkpoint is restored before returning, so the
    returned model is the best one seen rather than the last one trained.

    Parameters
    ----------
    model : object providing ``fit``, ``predict``, ``save_checkpoint`` and ``restore``.
    train_data, val_data : datasets accepted by ``model`` that expose labels as ``.y``.
    epochs : int
        Maximum number of rounds (each round is ``epochs_per_round`` fit epochs).
    patience : int
        Number of most recent rounds inspected by the early-stopping test.
    epochs_per_round : int, default 10
        Value passed as ``nb_epoch`` to ``model.fit`` in each round.
    warmup_rounds : int, default 50
        Checkpoints are only written for round indices strictly greater than this.

    Returns
    -------
    The same ``model`` object. Its state is restored from the best checkpoint if
    one was written; otherwise it is left as it was after the final round.
    """
    val_scores = []
    train_scores = []
    best_val_score = float("inf")
    checkpoint_saved = False
    print("Training MPNN...")
    for i in range(epochs):
        # checkpoint_interval=0: checkpoints are written only by the explicit
        # save_checkpoint() call below, so restore() picks up the best round.
        # NOTE(review): relies on restore() loading the most recent checkpoint — confirm.
        model.fit(train_data, nb_epoch=epochs_per_round, checkpoint_interval=0)
        train_scores.append(_column_one_log_loss(model, train_data))
        val_scores.append(_column_one_log_loss(model, val_data))

        # ----- Early Stopping -----
        # While no checkpoint exists best_val_score is +inf, so this never fires.
        if i > patience and min(val_scores[-patience:]) > best_val_score:
            print(f"Early stopping after {i - patience} epochs")
            break

        if val_scores[-1] < best_val_score and i > warmup_rounds:
            best_val_score = val_scores[-1]
            model.save_checkpoint()
            checkpoint_saved = True

        if i % 10 == 0:
            print(f"epoch: {i}, train_loss {train_scores[-1]}, val_loss {val_scores[-1]}")

    # Restore the best weights on every exit path, not only on early stopping.
    # Skipped when nothing was checkpointed (e.g. epochs <= warmup_rounds + 1).
    if checkpoint_saved:
        model.restore()

    return model


In [4]:
# --- load data ---
# Read the table and pair every "m1" entry with its "y2" label.
df = pd.read_csv(data_path)
smiles = df["m1"].values

# Featurize the pairs in memory with the Weave featurizer (single task: "y2").
featurizer = dc.feat.WeaveFeaturizer()
loader = dc.data.InMemoryLoader(tasks=["y2"], featurizer=featurizer)
dataset = loader.create_dataset(list(zip(smiles, df["y2"].values)))

# Slice the featurized dataset with the index lists stored in the split file.
dc_dataset_train, dc_dataset_val, dc_dataset_test = (
    dataset.select(split_df[part]) for part in ("train", "val", "test")
)

In [5]:
# --- setup MPNN --- #
# Keyword arguments forwarded unchanged to the MPNN constructor.
hyperparams = dict(batch_size=32, dropout=0.3, learning_rate=0.005)

# Training budget handed to train(): maximum rounds and early-stopping window.
epochs = 300
patience = 100

dc_model = dc.models.MPNNModel(
    1,
    n_pair_feat=14,
    n_atom_feat=75,
    n_hidden=75,
    T=1,
    M=1,
    mode="classification",
    **hyperparams,
)

# --- Train and Evaluate MPNN --- #
dc_model = train(dc_model, dc_dataset_train, dc_dataset_val, epochs, patience)

# Column 1 of the squeezed prediction array serves as the per-sample score.
probs = dc_model.predict(dc_dataset_test).squeeze()[:, 1]
val_probs = dc_model.predict(dc_dataset_val).squeeze()[:, 1]
labels_val = dc_dataset_val.y
labels_test = dc_dataset_test.y

# Hard predictions from a fixed 0.5 cut-off on the scores.
test_preds = probs > 0.5
val_preds = val_probs > 0.5

# AUC is computed on the raw scores ...
print(f"Val AUC (MPNN): {roc_auc_score(labels_val, val_probs)}")
print(f"Test AUC (MPNN): {roc_auc_score(labels_test, probs)}")

# ... the remaining metrics on the thresholded predictions, validation first.
for _metric_label, _metric_fn in (
    ("Accuracy", balanced_accuracy_score),
    ("F1", f1_score),
    ("MCC", matthews_corrcoef),
):
    print(f"Val {_metric_label} (MPNN): {_metric_fn(labels_val, val_preds)}")
    print(f"Test {_metric_label} (MPNN): {_metric_fn(labels_test, test_preds)}")


2024-03-28 15:25:12.181865: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-28 15:25:12.189497: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-28 15:25:12.190558: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-03-28 15:25:12.191191: I tensorflow/core/platform/cpu_feature

Training MPNN...
epoch: 0, train_loss 0.3550150146852182, val_loss 0.40534802964482936
epoch: 10, train_loss 0.2353612735051012, val_loss 0.22369020820271193
epoch: 20, train_loss 0.22344774573010254, val_loss 0.3162742426634277
epoch: 30, train_loss 0.18112056649964078, val_loss 0.24157185369048373
epoch: 40, train_loss 0.19279622150835934, val_loss 0.2567350655667475
epoch: 50, train_loss 0.17792206960800946, val_loss 0.279741281650849
epoch: 60, train_loss 0.19978904883097046, val_loss 0.3192327377402127


Save MPNN predictions

In [1]:
# Displays the raw model predictions for the test split.
# Depends on `dc_model` and `dc_dataset_test` from the earlier cells: the recorded
# NameError came from executing this cell as In [1] on a kernel where the
# training cell had not been run. Re-run the notebook top to bottom first.
# NOTE(review): the heading says "Save", but nothing is written to disk here — confirm intent.
dc_model.predict(dc_dataset_test)

NameError: name 'dc_model' is not defined