In [None]:
import json
import random
import sys

import deepchem as dc
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    log_loss,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Seed Python's and NumPy's global random number generators with one shared value.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Path of the full data table, plus the parsed JSON split definition
# (looked up by the "train" / "val" / "test" keys when the splits are built).
data_path = "../data/ding_et_al/all_data.csv"
with open("../data/ding_et_al/split.json") as split_file:
    split_df = json.load(split_file)

In [None]:
# --- GCN setup and methods ---
def train(model, train_data, val_data, epochs, patience, warmup=100):
    """Fit ``model`` one epoch at a time with validation-based early stopping.

    After every epoch the log loss is computed on ``train_data`` and
    ``val_data``. Once the warm-up period is over, each new best validation
    loss triggers ``model.save_checkpoint()``. Training stops early when none
    of the last ``patience`` validation losses beats the best checkpointed
    loss. In every case where a checkpoint was written, the checkpointed
    weights are restored before returning, so the caller gets the best model
    rather than whatever the final epoch produced.

    Parameters
    ----------
    model
        Object exposing ``fit(dataset, nb_epoch=..., checkpoint_interval=...)``,
        ``predict(dataset)``, ``save_checkpoint()`` and ``restore()``.
    train_data, val_data
        Datasets accepted by ``model.fit`` / ``model.predict`` that expose the
        labels as ``.y``.
    epochs : int
        Maximum number of single-epoch fits.
    patience : int
        Number of most recent validation losses that must all be worse than
        the best checkpointed loss before training is stopped.
    warmup : int, optional
        Checkpoints are only written for epoch indices strictly greater than
        this value (default 100, the previously hard-coded threshold).

    Returns
    -------
    The same ``model`` object, restored to the best checkpoint if one exists.
    """
    val_scores = []
    train_scores = []
    best_val_score = float("inf")
    checkpoint_saved = False
    print("Training GCN...")
    for i in range(epochs):
        # checkpoint_interval=0: checkpoints are written only by the explicit
        # save_checkpoint() call below, never implicitly by fit().
        model.fit(train_data, nb_epoch=1, checkpoint_interval=0)
        # Column 1 of the squeezed prediction is scored; presumably the
        # positive-class probability of an (n_samples, 1, 2) output -- TODO confirm.
        train_pred = model.predict(train_data).squeeze()[:, 1]
        train_scores.append(log_loss(train_data.y, train_pred))
        val_pred = model.predict(val_data).squeeze()[:, 1]
        val_scores.append(log_loss(val_data.y, val_pred))

        # ----- Early Stopping -----
        # best_val_score stays at +inf until the first checkpoint, so this can
        # only fire once a checkpoint exists and restore() has something to load.
        if i > patience and min(val_scores[-patience:]) > best_val_score:
            print(f"Early stopping after {i - patience} epochs")
            model.restore()
            break

        if val_scores[-1] < best_val_score and i > warmup:
            best_val_score = val_scores[-1]
            model.save_checkpoint()
            checkpoint_saved = True

        if i % 100 == 0:
            print(f"epoch: {i}, train_loss {train_scores[-1]}, val_loss {val_scores[-1]}")
    else:
        # All epochs ran without early stopping: the weights in memory belong
        # to the last epoch, which is not necessarily the best one.
        if checkpoint_saved:
            model.restore()

    return model


# --- load data ---
# Read the table and featurize the "m1" SMILES column, using "y2" as the
# single task label.
df = pd.read_csv(data_path)
smiles = df["m1"].values
# The model built in the setup cell is the Keras GraphConvModel, which batches
# its inputs as ConvMol objects, so the data must be featurized with
# ConvMolFeaturizer. MolGraphConvFeaturizer produces GraphData objects meant
# for the torch-based graph models and is not accepted by GraphConvModel.
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.InMemoryLoader(tasks=["y2"], featurizer=featurizer)
dataset = loader.create_dataset(list(zip(smiles, df["y2"].values)))

# Carve out the predefined train / validation / test subsets by index.
dc_dataset_train = dataset.select(split_df["train"])
dc_dataset_val = dataset.select(split_df["val"])
dc_dataset_test = dataset.select(split_df["test"])

In [None]:
# --- setup GCN --- #
# Keyword arguments forwarded unchanged to the GraphConvModel constructor.
hyperparams = dict(
    batch_size=64,
    dense_layer_size=32,
    graph_conv_layers=[32, 32],
    dropout=[0.3, 0.3, 0.3],
    learning_rate=0.005,
)
epochs, patience = 5000, 500
dc_model = dc.models.GraphConvModel(
    1,
    mode="classification",
    batch_normalize=True,
    **hyperparams,
)

# --- Train and Evaluate GCN --- #
dc_model = train(dc_model, dc_dataset_train, dc_dataset_val, epochs, patience)

# Column 1 of the squeezed prediction is used as the per-sample score
# (presumably the positive-class probability -- TODO confirm output layout).
probs = dc_model.predict(dc_dataset_test).squeeze()[:, 1]
val_probs = dc_model.predict(dc_dataset_val).squeeze()[:, 1]
labels_val, labels_test = dc_dataset_val.y, dc_dataset_test.y

# Threshold-free metric, computed on the raw scores.
print(f"Val AUC (GCN): {roc_auc_score(labels_val, val_probs)}")
print(f"Test AUC (GCN): {roc_auc_score(labels_test, probs)}")

# Thresholded metrics: scores above 0.5 count as the positive class.
# Note that the "Accuracy" lines report balanced accuracy.
test_preds = probs > 0.5
val_preds = val_probs > 0.5
for metric_name, metric_fn in (
    ("Accuracy", balanced_accuracy_score),
    ("F1", f1_score),
    ("MCC", matthews_corrcoef),
):
    print(f"Val {metric_name} (GCN): {metric_fn(labels_val, val_preds)}")
    print(f"Test {metric_name} (GCN): {metric_fn(labels_test, test_preds)}")

# Compute the model's embeddings for the full dataset; the slice keeps at most
# len(dataset) leading rows, i.e. one row per dataset entry.
gcn_X = dc_model.predict_embedding(dataset)[: len(dataset)]

Save GCN embeddings

In [None]:
# Write the embedding matrix to disk in NumPy's binary .npy format.
embedding_path = "../data/gcn_x.npy"
with open(embedding_path, "wb") as out_file:
    np.save(out_file, gcn_X)