In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from scipy.stats import spearmanr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load the first train/test split; the first CSV column is used as the index.
# These module-level frames are read later by the scorer and the search helper.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/lo/kcnh2/train_1.csv',
        '../../../../data/lo/kcnh2/test_1.csv',
    )
)

test

Unnamed: 0,smiles,value,cluster
0,C=C(C)COc1ccccc1CN1CCC2(CC1)CCN(C(=O)c1ccncc1)CC2,5.794709,20
1,C=CCOC[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C)c4ccc...,5.300943,31
2,C=CCO[C@H]1CC[C@@H](N2CC(NC(=O)CNc3n[nH]c4ccc(...,5.130710,31
3,CC(=O)N1CCC(C2N[C@@H](c3nc(-c4ccccc4)c[nH]3)Cc...,5.008730,34
4,CC(=O)NC1CCN(CCc2ccc(Oc3nc4ccccc4s3)cc2)CC1,5.045709,12
...,...,...,...
401,c1ccc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)[C@@H...,6.419075,34
402,c1ccc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)[C@H]...,6.136083,34
403,c1ccc(-c2ccc(-c3c[nH]c([C@H]4Cc5c([nH]c6ccccc5...,7.744727,34
404,c1ccc(CCCNCCN(c2ccccc2)c2ccccc2)cc1,6.217567,11


# Hyperparameter Optimization

In [4]:
def spearman_scorer(clf, X, y):
    """Scorer callable returning the 'spearman' entry of ``get_lo_metrics``.

    ``get_lo_metrics`` takes a dataframe rather than a label vector, so the
    matching module-level frame is chosen by comparing ``len(X)`` with
    ``len(train)`` first and ``len(test)`` second.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature rows to score; must be as long as ``train`` or ``test``.
        y: Unused; present only to satisfy the ``(estimator, X, y)`` scorer
            signature.

    Returns:
        The value stored under ``'spearman'`` in the metrics returned by
        ``get_lo_metrics``.

    Raises:
        ValueError: If ``len(X)`` matches neither ``train`` nor ``test``.
    """
    n_rows = len(X)
    # `train` is checked before `test`, so it wins if both have the same length.
    for frame in (train, test):
        if n_rows == len(frame):
            return get_lo_metrics(frame, clf.predict(X))['spearman']
    raise ValueError


In [5]:
def run_gb_gridsearch(train_fps, test_fps, random_state=None):
    """Randomized hyper-parameter search for a gradient-boosting regressor.

    Every candidate is fitted on ``train_fps`` and scored on ``test_fps`` with
    ``spearman_scorer`` (a single predefined fold). The best parameter set is
    printed, a fresh model is refitted on the training fingerprints, and its
    metrics on the test fingerprints are returned.

    Targets come from the module-level ``train`` and ``test`` dataframes
    (column ``'value'``), so the fingerprints must be in the same row order
    as those frames.

    Args:
        train_fps: List of training fingerprints (a list, because it is
            concatenated with ``test_fps`` using ``+``).
        test_fps: List of test fingerprints.
        random_state: Optional seed forwarded to the randomized search and to
            both gradient-boosting estimators. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for reproducible runs.

    Returns:
        The metrics object returned by ``get_lo_metrics(test, test_preds)``.
    """
    # PredefinedSplit convention: -1 means "never in a validation fold",
    # 0 assigns the sample to the single validation fold.
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold=split_index)

    X = train_fps + test_fps
    y = train['value'].to_list() + test['value'].to_list()

    params = {
        'n_estimators': [10, 50, 100, 150, 200, 250, 500],
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0],
        'subsample': [0.4, 0.7, 0.9, 1.0],
        'min_samples_split': [2, 3, 5, 7],
        'min_samples_leaf': [1, 3, 5],
        'max_depth': [2, 3, 4],
        'max_features': [None, 'sqrt'],
    }
    base_model = GradientBoostingRegressor(random_state=random_state)

    # refit=False: the search would otherwise refit on train + test combined;
    # the final model is refitted on the training part only, further down.
    search = RandomizedSearchCV(
        base_model,
        params,
        cv=pds,
        n_iter=30,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
        random_state=random_state,
    )
    search.fit(X, y)

    best_params = search.best_params_
    print(best_params)
    final_model = GradientBoostingRegressor(random_state=random_state, **best_params)
    final_model.fit(train_fps, train['value'])

    test_preds = final_model.predict(test_fps)
    test_metrics = get_lo_metrics(test, test_preds)
    return test_metrics


In [6]:
# Parse the SMILES of each split into RDKit molecules, then encode every
# molecule as a Morgan fingerprint bit vector (radius 2, 1024 bits).
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
train_morgan_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in train_mols
]

test_mols = list(map(Chem.MolFromSmiles, test['smiles']))
test_morgan_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in test_mols
]

# Run the randomized search and report the refitted model's test metrics.
test_metrics = run_gb_gridsearch(train_morgan_fps, test_morgan_fps)
print(test_metrics)

Fitting 1 folds for each of 30 candidates, totalling 30 fits
[CV 1/1] END learning_rate=0.01, max_depth=4, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=250, subsample=0.7;, score=0.359 total time=  18.6s
[CV 1/1] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=5, min_samples_split=3, n_estimators=500, subsample=0.4;, score=0.229 total time=   2.1s




[CV 1/1] END learning_rate=0.5, max_depth=3, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=10, subsample=0.7;, score=0.156 total time=   1.5s
[CV 1/1] END learning_rate=0.01, max_depth=4, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500, subsample=0.7;, score=0.411 total time=  35.6s
[CV 1/1] END learning_rate=0.1, max_depth=4, max_features=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100, subsample=0.7;, score=0.382 total time=   8.3s
[CV 1/1] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=250, subsample=0.4;, score=0.285 total time=   1.8s




[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=sqrt, min_samples_leaf=5, min_samples_split=7, n_estimators=10, subsample=0.4;, score=0.001 total time=   1.5s
[CV 1/1] END learning_rate=1.0, max_depth=2, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500, subsample=0.4;, score=-0.047 total time=  11.4s
[CV 1/1] END learning_rate=1.0, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=200, subsample=1.0;, score=0.207 total time=   1.9s
[CV 1/1] END learning_rate=1.0, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=7, n_estimators=250, subsample=0.7;, score=0.206 total time=  14.4s
[CV 1/1] END learning_rate=0.1, max_depth=4, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=500, subsample=0.7;, score=0.370 total time=   2.9s
[CV 1/1] END learning_rate=0.5, max_depth=2, max_features=None, min_samples_leaf=3, min_samples_split=7, n_estimators=500, subsample=0.4;, score=0.253



[CV 1/1] END learning_rate=0.7, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=10, subsample=0.4;, score=0.052 total time=   1.5s
[CV 1/1] END learning_rate=0.01, max_depth=4, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=200, subsample=0.4;, score=0.321 total time=   1.9s
[CV 1/1] END learning_rate=0.01, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=250, subsample=0.7;, score=0.280 total time=   1.9s
[CV 1/1] END learning_rate=0.01, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.7;, score=0.323 total time=   1.8s
[CV 1/1] END learning_rate=0.3, max_depth=2, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=100, subsample=0.4;, score=0.264 total time=   3.5s
[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=50, subsample=0.4;, score=0.14



[CV 1/1] END learning_rate=0.3, max_depth=2, max_features=None, min_samples_leaf=5, min_samples_split=2, n_estimators=10, subsample=0.7;, score=0.106 total time=   1.8s
[CV 1/1] END learning_rate=0.7, max_depth=3, max_features=None, min_samples_leaf=3, min_samples_split=5, n_estimators=150, subsample=0.4;, score=0.127 total time=   5.9s
[CV 1/1] END learning_rate=0.01, max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=500, subsample=0.7;, score=0.306 total time=   2.9s
{'subsample': 0.7, 'n_estimators': 500, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': None, 'max_depth': 4, 'learning_rate': 0.01}
{'r2': -0.8798180876954403, 'spearman': 0.3805236471923937, 'mae': 0.9609210981670802}


# Final Evaluation

In [7]:
def fit_predict(train, test):
    """Fit the tuned gradient-boosting model and predict on both frames.

    Args:
        train: Dataframe with a ``'smiles'`` column and a ``'value'`` target
            column; used for fitting.
        test: Dataframe with a ``'smiles'`` column; used for prediction only.

    Returns:
        A ``(train_result, test_result)`` tuple: copies of the inputs with an
        added ``'preds'`` column holding the model predictions.
    """
    def featurize(frame):
        # SMILES -> RDKit molecule -> Morgan bit vector (radius 2, 1024 bits).
        molecules = [Chem.MolFromSmiles(smi) for smi in frame['smiles']]
        return [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in molecules]

    train_features = featurize(train)
    test_features = featurize(test)

    # Hard-coded hyper-parameters; they equal the best set printed by the
    # randomized search earlier in the notebook.
    hyperparams = {
        'n_estimators': 500,
        'subsample': 0.7,
        'min_samples_split': 3,
        'min_samples_leaf': 5,
        'max_features': None,
        'max_depth': 4,
        'learning_rate': 0.01,
    }
    gb = GradientBoostingRegressor(**hyperparams)
    gb.fit(train_features, train['value'])

    # assign() returns a new frame, so the caller's inputs stay untouched.
    train_result = train.assign(preds=gb.predict(train_features))
    test_result = test.assign(preds=gb.predict(test_features))

    return train_result, test_result


In [8]:
# Fit and predict on each of the three splits and save the predictions.
#
# Loop-local names are used on purpose: `spearman_scorer` and
# `run_gb_gridsearch` read the module-level `train` / `test` frames, so
# rebinding those here would make a later re-run of the tuning cells silently
# operate on the last split instead of split 1.
#
# NOTE(review): unlike the split-1 load near the top of the notebook, these
# reads do not pass index_col=0, so the CSV's first column is kept as a regular
# column and written back out. Left as is to keep the prediction file layout
# unchanged -- confirm whether downstream readers rely on it.
for i in [1, 2, 3]:
    split_train = pd.read_csv(f'../../../../data/lo/kcnh2/train_{i}.csv')
    split_test = pd.read_csv(f'../../../../data/lo/kcnh2/test_{i}.csv')

    train_preds, test_preds = fit_predict(split_train, split_test)
    train_preds.to_csv(f'../../../../predictions/lo/kcnh2/gb_ecfp4/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/kcnh2/gb_ecfp4/test_{i}.csv')
