# Train and Test a CatBoost Classifier

In [None]:
import json
import random
import sys

import deepchem as dc
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    log_loss,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import RandomizedSearchCV

## Load data

In [None]:
def load_data(filename):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        filename: path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        One dict per CSV row, keyed by column name
        (``DataFrame.to_dict(orient="records")``).
    """
    records = pd.read_csv(filename).to_dict(orient="records")
    return records

In [None]:
# Seed NumPy's and Python's global RNGs; the same value is also passed to
# CatBoostClassifier as `random_seed` further down.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)
# One dict per CSV row, keyed by column name.
lnps = load_data("../data/ding_et_al/all_data.csv")
# Split definition read from JSON; it is indexed by split name (e.g. "train",
# "test") and the values are used to select rows of the feature matrices.
with open("../data/ding_et_al/split.json") as f:
    split_df = json.load(f)

In [None]:
# --- Feature builder used for 'our' methods ---
# Builds a feature matrix using only the embedding of the m1 molecule.
def generate_simple_feature_matrix(data_df, fp_dict):
    """Build a feature matrix from the m1 fingerprint alone.

    Args:
        data_df: iterable of records, each providing the keys "m1" and "y2".
        fp_dict: mapping from a record's "m1" value to its fingerprint.

    Returns:
        Tuple ``(X, y, processed_data)``: ``X`` holds one fingerprint per
        record, ``y`` the matching "y2" values, and ``processed_data`` one
        dict per record with the keys "label" and "m1_fingerprint".
    """
    processed_data = [
        {"label": row["y2"], "m1_fingerprint": fp_dict[row["m1"]]}
        for row in data_df
    ]
    X = [entry["m1_fingerprint"] for entry in processed_data]
    y = [entry["label"] for entry in processed_data]
    return (X, y, processed_data)


# -- Functions from Ding et al. to process data --
def convert_to_one_hot(val, min, max, step):
    """One-hot encode ``val`` into equal-width bins covering ``[min, max)``.

    The range is cut into ``int((max - min) / step)`` bins; bin ``i`` covers
    ``[min + i * step, min + (i + 1) * step)``.

    Args:
        val: value to encode.
        min: lower edge of the first bin (inclusive).
        max: upper edge of the last bin (exclusive).
        step: bin width.

    Returns:
        A list of 0/1 ints, one per bin. A value outside ``[min, max)``
        (including ``val == max``) yields a list of zeros.

    Note:
        The parameter names shadow the ``min``/``max`` builtins; they are kept
        because callers pass them as keywords.
    """
    n_bins = int((max - min) / step)
    # Bin edges are offset by `min`; without the offset any non-zero lower
    # bound would put values in the wrong bin.
    return [
        1 if min + i * step <= val < min + (i + 1) * step else 0
        for i in range(n_bins)
    ]


def generate_feature_matrix(data_df, fp_dict):
    """Build the full feature matrix: binned p1-p4 values plus m1-m4 fingerprints.

    For every record, the one-hot encodings of "p1".."p4" are followed by the
    fingerprints looked up for "m1".."m4", joined with ``+`` in that order.

    Args:
        data_df: iterable of records providing the keys "p1"-"p4", "m1"-"m4"
            and "y2".
        fp_dict: mapping from a record's "m1"-"m4" value to its fingerprint.

    Returns:
        Tuple ``(X, y, processed_data)``: ``X`` holds the joined features per
        record, ``y`` the "y2" values, and ``processed_data`` one dict per
        record with "label", the four "*_feature" parts and the four
        "*_fingerprint" parts.
    """
    # (source key, output key, min, max, step) for each binned column.
    binned_columns = (
        ("p1", "p1_feature", 0, 100, 5),
        ("p2", "p2_feature", 0, 100, 5),
        ("p3", "p3_feature", 0, 100, 5),
        ("p4", "p4_feature", 0, 1.5, 0.25),
    )
    # (source key, output key) for each fingerprint lookup.
    molecule_columns = (
        ("m1", "m1_fingerprint"),
        ("m2", "m2_fingerprint"),
        ("m3", "m3_fingerprint"),
        ("m4", "m4_fingerprint"),
    )

    X, y, processed_data = [], [], []
    for row in data_df:
        entry = {"label": row["y2"]}
        for source, target, low, high, width in binned_columns:
            entry[target] = convert_to_one_hot(row[source], min=low, max=high, step=width)
        for source, target in molecule_columns:
            entry[target] = fp_dict[row[source]]
        processed_data.append(entry)

        # Everything except the label is a feature part, already in output order.
        parts = [value for name, value in entry.items() if name != "label"]
        row_features = parts[0]
        for part in parts[1:]:
            row_features = row_features + part
        X.append(row_features)
        y.append(row["y2"])
    return (X, y, processed_data)

In [None]:
# --- Load in fingerprints --- #
# Each JSON file is used below as a lookup table (fp_dict) indexed by the
# m1..m4 values of a record; the looked-up value is that entry's fingerprint.
with open("../data/ding_et_al/mol2fp_grover_large.json", "r") as f:
    df_fp_grover_large = json.load(f)
with open("../data/ding_et_al/mol2fp_grover.json", "r") as f:
    df_fp_grover = json.load(f)
with open("../data/ding_et_al/mol2fp.json", "r") as f:
    df_fp = json.load(f)
with open("../data/mol2fp_cfp_all_data.json", "r") as f:
    df_fp_cfp = json.load(f)
with open("../data/mol2fp_MegaMB_base_all_data.json", "r") as f:
    df_fp_mmb = json.load(f)
with open("../data/mol2fp_MegaMB_finetuned_all_data.json", "r") as f:
    df_fp_mmb_ft = json.load(f)
# Precomputed GCN features loaded straight from a .npy file.
# NOTE(review): presumably one row per record of all_data.csv, in the same
# order as `lnps` — confirm, since it is indexed with the same split indices.
gcn_X = np.load("../data/gcn_x.npy")

# Full feature matrices: one-hot p1-p4 bins followed by the m1-m4 fingerprints.
# The labels (the "y2" values) are the same for every variant, so only the
# first call keeps them.
fp_X, y, _ = generate_feature_matrix(lnps, df_fp)
grover_X, _, _ = generate_feature_matrix(lnps, df_fp_grover)
grover_large_X, _, _ = generate_feature_matrix(lnps, df_fp_grover_large)
# m1-only feature matrices.
cfp_X, _, _ = generate_simple_feature_matrix(lnps, df_fp_cfp)
mmb_X, _, _ = generate_simple_feature_matrix(lnps, df_fp_mmb)
mmb_ft_X, _, _ = generate_simple_feature_matrix(lnps, df_fp_mmb_ft)

In [None]:
# --- Make Hybrid fingerprints ----
# Each hybrid is a column-wise (axis=1) concatenation of per-record feature tables.
# NOTE(review): fp_X, grover_X and grover_large_X all come from
# generate_feature_matrix, so each already starts with the one-hot p1-p4
# columns; those columns therefore appear twice in the two Expert-Grover hybrids.
fp_grover_X = pd.concat([pd.DataFrame(fp_X), pd.DataFrame(grover_X)], axis=1)
fp_grover_large_X = pd.concat([pd.DataFrame(fp_X), pd.DataFrame(grover_large_X)], axis=1)
cfp_mmb_ft_X = pd.concat([pd.DataFrame(cfp_X), pd.DataFrame(mmb_ft_X)], axis=1)
gcn_cfp_X = pd.concat([pd.DataFrame(gcn_X), pd.DataFrame(cfp_X)], axis=1)
gcn_mmb_ft_X = pd.concat([pd.DataFrame(gcn_X), pd.DataFrame(mmb_ft_X)], axis=1)
gcn_mmb_ft_cfp = pd.concat(
    [pd.DataFrame(gcn_X), pd.DataFrame(mmb_ft_X), pd.DataFrame(cfp_X)], axis=1
)

## Train and test CatBoost
> Use different embeddings for each experiment

In [None]:
def select_catboost(parameter_grid, X, y, split_df):
    """Fit one CatBoost model per grid point and score it on val and test.

    Args:
        parameter_grid: dict with the keys "depth", "learning_rate" and
            "iterations", each an iterable of candidate values.
        X: feature matrix (anything ``np.array`` accepts), one row per sample.
        y: labels aligned with ``X``.
        split_df: dict with the keys "train", "val" and "test" holding the
            row indices of each split.

    Returns:
        Tuple of eight lists ``(val_auc, test_auc, val_acc, test_acc, val_f1,
        test_f1, val_mcc, test_mcc)``. Each list has one entry per grid point,
        ordered with depth as the outermost loop, then learning rate, then
        iterations.
    """
    features = np.array(X)
    labels = np.array(y)
    X_train = features[split_df["train"],]
    X_val = features[split_df["val"],]
    X_test = features[split_df["test"],]
    y_train = labels[split_df["train"]]
    y_val = labels[split_df["val"]]
    y_test = labels[split_df["test"]]

    # Output order of the returned tuple.
    score_names = (
        "val_auc",
        "test_auc",
        "val_acc",
        "test_acc",
        "val_f1",
        "test_f1",
        "val_mcc",
        "test_mcc",
    )
    scores = {name: [] for name in score_names}
    eval_sets = (("val", X_val, y_val), ("test", X_test, y_test))

    grid_points = [
        (depth, learning_rate, iterations)
        for depth in parameter_grid["depth"]
        for learning_rate in parameter_grid["learning_rate"]
        for iterations in parameter_grid["iterations"]
    ]
    for depth, learning_rate, iterations in grid_points:
        model = CatBoostClassifier(
            depth=depth,
            iterations=iterations,
            learning_rate=learning_rate,
            verbose=False,
            random_seed=random_seed,
        )
        model.fit(X_train, y_train)
        for split_name, X_part, y_part in eval_sets:
            # AUC uses the probability of the second class; the other
            # metrics use the predicted labels.
            positive_prob = model.predict_proba(X_part)[:, 1]
            predicted = model.predict(X_part)
            scores[split_name + "_auc"].append(roc_auc_score(y_part, positive_prob))
            scores[split_name + "_acc"].append(balanced_accuracy_score(y_part, predicted))
            scores[split_name + "_f1"].append(f1_score(y_part, predicted))
            scores[split_name + "_mcc"].append(matthews_corrcoef(y_part, predicted))
    return tuple(scores[name] for name in score_names)


# Hyper-parameter grid passed to run_catboost by the experiment cells:
# 3 depths x 2 learning rates x 3 iteration counts = 18 models per feature set.
param_grid = {"depth": [3, 4, 5], "learning_rate": [0.01, 0.1], "iterations": [1000, 2000, 3000]}


def catboost(X, y, split_df):
    """
    Run Catboost without any hparam optimization

    A single model (depth=5, iterations=5000, learning_rate=0.01) is fitted on
    the train and validation rows together, then scored on the validation and
    test rows. Because the validation rows are part of the fitting data, the
    returned validation metrics are not held-out estimates.

    Args:
        X: feature matrix (anything ``np.array`` accepts), one row per sample.
        y: labels aligned with ``X``.
        split_df: dict holding row indices under "train", "test" and the
            validation split under either "valid" or "val".

    Returns:
        Tuple ``(val_auc, test_auc, val_acc, test_acc, val_f1, test_f1)``.

    Raises:
        KeyError: if a required split is missing from ``split_df``.
    """
    # select_catboost reads the validation indices from "val" while this
    # function read "valid"; accept either so both work on the same split dict.
    val_key = "valid" if "valid" in split_df else "val"

    features = np.array(X)
    labels = np.array(y)
    X_train = features[split_df["train"],]
    X_val = features[split_df[val_key],]
    X_test = features[split_df["test"],]
    y_train = labels[split_df["train"]]
    y_val = labels[split_df[val_key]]
    y_test = labels[split_df["test"]]
    # concatenate train and val
    X_train = np.concatenate((X_train, X_val), axis=0)
    y_train = np.concatenate((y_train, y_val), axis=0)
    model = CatBoostClassifier(
        depth=5, iterations=5000, learning_rate=0.01, verbose=False, random_seed=random_seed
    )
    model.fit(X_train, y_train)
    # AUC uses the probability of the second class; the other metrics use
    # the predicted labels.
    model_prob_val = model.predict_proba(X_val)
    model_prob_test = model.predict_proba(X_test)
    val_auc = roc_auc_score(y_val, model_prob_val[:, 1])
    test_auc = roc_auc_score(y_test, model_prob_test[:, 1])
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test)
    val_acc = balanced_accuracy_score(y_val, pred_val)
    test_acc = balanced_accuracy_score(y_test, pred_test)
    val_f1 = f1_score(y_val, pred_val)
    test_f1 = f1_score(y_test, pred_test)
    return (val_auc, test_auc, val_acc, test_acc, val_f1, test_f1)


def run_catboost(X, y, split_df, parameter_grid):
    """Grid-search CatBoost and print the metrics of the best configuration.

    Every grid point is fitted and scored by ``select_catboost``; the one with
    the highest validation AUC is selected (``np.argmax`` keeps the first on
    ties) and its validation and test metrics are printed, followed by a
    LaTeX table row.

    Args:
        X: feature matrix, one row per sample.
        y: labels aligned with ``X``.
        split_df: split definition forwarded to ``select_catboost``.
        parameter_grid: hyper-parameter grid forwarded to ``select_catboost``.

    Returns:
        None; results are only printed.
    """
    all_scores = select_catboost(parameter_grid, X, y, split_df)
    # --- Compute the best result from the validation set ---
    # all_scores[0] is the list of validation AUCs, one per grid point.
    best_idx = np.argmax(all_scores[0])
    val_auc, test_auc, val_acc, test_acc, val_f1, test_f1, val_mcc, test_mcc = (
        per_config[best_idx] for per_config in all_scores
    )
    print("VAL")
    print(f"AUC: {val_auc}")
    print(f"Balanced Accuracy: {val_acc}")
    print(f"F1: {val_f1}")
    print(f"MCC: {val_mcc}")
    print("TEST")
    print(f"AUC: {test_auc}")
    print(f"Balanced Accuracy: {test_acc}")
    print(f"F1: {test_f1}")
    print(f"MCC: {test_mcc}")
    # latex table row; "\\\\" prints the two backslashes that end a LaTeX row
    # (the previous "\\" printed only one).
    print(
        f"{val_auc:.3f} & {test_auc:.3f} & {val_acc:.3f} & {test_acc:.3f} & {val_f1:.3f} & {test_f1:.3f} & {val_mcc:.3f} & {test_mcc:.3f} \\\\"
    )

In [None]:
# --- Expert --- #
# fp_X: one-hot p1-p4 bins plus the m1-m4 fingerprints from mol2fp.json.
run_catboost(fp_X, y, split_df, param_grid)

In [None]:
# --- MegaMolBART base --- #
# mmb_X: m1-only features from mol2fp_MegaMB_base_all_data.json.
run_catboost(mmb_X, y, split_df, param_grid)

In [None]:
# --- MegaMolBART Fine-tuned --- #
# mmb_ft_X: m1-only features from mol2fp_MegaMB_finetuned_all_data.json.
run_catboost(mmb_ft_X, y, split_df, param_grid)

In [None]:
# --- CFP --- #
# cfp_X: m1-only features from mol2fp_cfp_all_data.json.
run_catboost(cfp_X, y, split_df, param_grid)

In [None]:
# --- Grover --- #
# grover_X: one-hot p1-p4 bins plus the m1-m4 fingerprints from mol2fp_grover.json.
run_catboost(grover_X, y, split_df, param_grid)

In [None]:
# --- CFP-MMB Fine-tuned --- #
# cfp_mmb_ft_X: columns of cfp_X followed by columns of mmb_ft_X.
run_catboost(cfp_mmb_ft_X, y, split_df, param_grid)

In [None]:
# --- Expert-Grover --- #
# fp_grover_X: columns of fp_X followed by columns of grover_X.
run_catboost(fp_grover_X, y, split_df, param_grid)

In [None]:
# --- Grover-Large --- #
# grover_large_X: one-hot p1-p4 bins plus the m1-m4 fingerprints from
# mol2fp_grover_large.json.
run_catboost(grover_large_X, y, split_df, param_grid)

In [None]:
# --- Expert-Grover-Large --- #
# fp_grover_large_X: columns of fp_X followed by columns of grover_large_X.
run_catboost(fp_grover_large_X, y, split_df, param_grid)

In [None]:
# --- GCN --- #
# gcn_X: array loaded from ../data/gcn_x.npy.
run_catboost(gcn_X, y, split_df, param_grid)

In [None]:
# --- GCN-CFP --- #
# gcn_cfp_X: columns of gcn_X followed by columns of cfp_X.
run_catboost(gcn_cfp_X, y, split_df, param_grid)

In [None]:
# --- GCN-MMB Fine-tuned --- #
# gcn_mmb_ft_X: columns of gcn_X followed by columns of mmb_ft_X.
run_catboost(gcn_mmb_ft_X, y, split_df, param_grid)

In [None]:
# --- GCN-MMB Fine-tuned-CFP --- #
# gcn_mmb_ft_cfp: columns of gcn_X, then mmb_ft_X, then cfp_X.
run_catboost(gcn_mmb_ft_cfp, y, split_df, param_grid)