In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_hi_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/simon/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load split 1 of the KDR "hi" data; the first CSV column is used as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/hi/kdr/train_1.csv',
        '../../../../data/hi/kdr/test_1.csv',
    )
)

# Bare last expression so the notebook renders the training frame.
train

Unnamed: 0,smiles,value
0,Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,True
1064,CCc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,False
1065,CCc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1,False
3722,COc1ccccc1-c1nc2ccc(Nc3ccnc4ccccc34)cc2[nH]1,False
4159,Cc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1,False
...,...,...
454,CC(C)S(=O)(=O)c1ccccc1Nc1nc(Nc2cccc(NC(=O)CN)c...,False
1850,COC(=O)c1cn2ncnc(Oc3ccc4[nH]c(C)cc4c3F)c2c1C,True
4120,Cc1cc2c(F)c(Oc3ncnn4cc(OCCCNS(C)(=O)=O)c(C)c34...,True
2979,COc1cc2c(Oc3ccc(N/C=C4\C(=O)NC(=O)N(c5ccc(C)cc...,True


# Hyperparameter Optimization

In [4]:
def run_gb_gridsearch(train_fps, test_fps, train_df=None, test_df=None, random_state=None):
    """Randomly search gradient-boosting hyperparameters on one fixed split and score the winner.

    Thirty random configurations are each fitted on `train_fps` and ranked by
    average precision on `test_fps` (a single predefined fold). The best
    configuration is then refitted on the training fingerprints only and its
    positive-class probabilities for `test_fps` are passed to `get_hi_metrics`.

    NOTE(review): the fold used to pick the hyperparameters is the same set of
    molecules that is scored afterwards, so the returned metrics are selected
    on, not independent of, that set.

    Args:
        train_fps: Sequence of training fingerprints (one per row of `train_df`).
        test_fps: Sequence of test fingerprints (one per row of `test_df`).
        train_df: Frame with a 'value' label column for `train_fps`. Defaults to
            the notebook-level `train` frame, which is what the original
            two-argument call relied on.
        test_df: Frame with a 'value' label column for `test_fps`; also handed to
            `get_hi_metrics`. Defaults to the notebook-level `test` frame.
        random_state: Optional seed forwarded to the random search and to every
            classifier. The default (None) keeps the original unseeded behaviour.

    Returns:
        Whatever `get_hi_metrics(test_df, test_preds)` returns.

    Raises:
        ValueError: If the number of fingerprints does not match the number of
            rows in the corresponding frame.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # Catch fingerprints computed from a different split than the labels.
    if len(train_fps) != len(train_df) or len(test_fps) != len(test_df):
        raise ValueError(
            'Fingerprint/label length mismatch: '
            f'{len(train_fps)} train fps vs {len(train_df)} rows, '
            f'{len(test_fps)} test fps vs {len(test_df)} rows'
        )

    # PredefinedSplit: -1 keeps a sample out of every validation fold,
    # 0 puts it into the single validation fold.
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold=split_index)

    # list(...) so that non-list sequences are concatenated rather than added.
    X = list(train_fps) + list(test_fps)
    y = train_df['value'].to_list() + test_df['value'].to_list()

    params = {
        'n_estimators': [10, 50, 100, 150, 200, 250, 500],
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0],
        'subsample': [0.4, 0.7, 0.9, 1.0],
        'min_samples_split': [2, 3, 5, 7],
        'min_samples_leaf': [1, 3, 5],
        'max_depth': [2, 3, 4],
        'max_features': [None, 'sqrt'],
    }

    # refit=False: the search would otherwise refit on train + test together.
    search = RandomizedSearchCV(
        GradientBoostingClassifier(random_state=random_state),
        params,
        cv=pds,
        n_iter=30,
        refit=False,
        scoring='average_precision',
        verbose=3,
        random_state=random_state,
    )
    search.fit(X, y)

    best_params = search.best_params_
    print(best_params)

    # Refit the winning configuration on the training part only.
    best_model = GradientBoostingClassifier(**best_params, random_state=random_state)
    best_model.fit(train_fps, train_df['value'])

    # Column 1 of predict_proba is the probability of the second class label.
    test_preds = best_model.predict_proba(test_fps)[:, 1]
    test_metrics = get_hi_metrics(test_df, test_preds)
    return test_metrics


In [5]:
# Parse the SMILES of both frames into RDKit molecules, then derive one
# MACCS-keys fingerprint per molecule.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
test_mols = list(map(Chem.MolFromSmiles, test['smiles']))

train_maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in train_mols]
test_maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in test_mols]

# Run the random hyperparameter search on these fingerprints and show the metrics.
test_metrics = run_gb_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 30 candidates, totalling 30 fits
[CV 1/1] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=10, subsample=0.4;, score=0.495 total time=   0.6s
[CV 1/1] END learning_rate=0.5, max_depth=3, max_features=None, min_samples_leaf=3, min_samples_split=7, n_estimators=500, subsample=1.0;, score=0.590 total time=   1.3s
[CV 1/1] END learning_rate=0.5, max_depth=2, max_features=None, min_samples_leaf=3, min_samples_split=7, n_estimators=150, subsample=0.4;, score=0.572 total time=   0.7s
[CV 1/1] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=500, subsample=1.0;, score=0.491 total time=   0.8s
[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100, subsample=1.0;, score=0.535 total time=   0.6s
[CV 1/1] END learning_rate=0.7, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_s

# Final Evaluation

In [6]:
def _maccs_fingerprints(smiles):
    """Return one MACCS-keys fingerprint per SMILES string, in input order."""
    mols = [Chem.MolFromSmiles(x) for x in smiles]
    return [Chem.MACCSkeys.GenMACCSKeys(mol) for mol in mols]


def fit_predict(train, test):
    """Fit the gradient-boosting model on `train` and predict both frames.

    Both frames need a 'smiles' column; `train` also needs the 'value' label
    column. The hyperparameters are hard-coded (presumably the result of the
    search in the section above; verify before reuse).

    Args:
        train: Training DataFrame with 'smiles' and 'value' columns.
        test: Test DataFrame with a 'smiles' column.

    Returns:
        A `(train_result, test_result)` tuple: copies of the two input frames,
        each with an added 'preds' column holding the model's probability for
        the second class label (column 1 of `predict_proba`).
    """
    train_maccs_fps = _maccs_fingerprints(train['smiles'])
    test_maccs_fps = _maccs_fingerprints(test['smiles'])

    gb = GradientBoostingClassifier(
        n_estimators=250,
        subsample=0.4,
        min_samples_split=5,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=2,
        learning_rate=1.0
    )
    gb.fit(train_maccs_fps, train['value'])

    # Store actual model outputs for the training set. Previously the ground
    # truth labels were copied into 'preds', which made the saved training
    # predictions look perfect regardless of the model.
    train_result = train.copy()
    train_result['preds'] = gb.predict_proba(train_maccs_fps)[:, 1]

    test_result = test.copy()
    test_result['preds'] = gb.predict_proba(test_maccs_fps)[:, 1]

    return train_result, test_result


In [7]:
# Fit and predict on each of the three KDR "hi" splits and write the resulting
# frames to the gb_maccs predictions directory.
# NOTE(review): unlike the split-1 load earlier in the notebook, these reads do
# not pass index_col=0, so the first CSV column stays a regular column and is
# written back out next to a fresh index. Confirm downstream readers expect this.
# NOTE(review): this rebinds the notebook-level `train`/`test`, which
# run_gb_gridsearch uses by default; re-running earlier cells afterwards will
# see split 3 instead of split 1.
for i in range(1, 4):
    train, test = (
        pd.read_csv(f'../../../../data/hi/kdr/train_{i}.csv'),
        pd.read_csv(f'../../../../data/hi/kdr/test_{i}.csv'),
    )

    train_preds, test_preds = fit_predict(train, test)

    train_preds.to_csv(f'../../../../predictions/hi/kdr/gb_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/hi/kdr/gb_maccs/test_{i}.csv')
