In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from scipy.stats import spearmanr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
# Extend the module search path with the shared code directory. The path is
# relative, so it resolves against the kernel's current working directory.
sys.path.append('../../../../code')

# presumably `metrics` is a module inside the directory added above — verify
from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load the first train/test split; the first CSV column is used as the index.
split_paths = (
    '../../../../data/lo/drd2/train_1.csv',
    '../../../../data/lo/drd2/test_1.csv',
)
train, test = (pd.read_csv(csv_path, index_col=0) for csv_path in split_paths)

test

Unnamed: 0,smiles,value,cluster
0,Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12,7.717691,11
1,Brc1cccc(N2CCN(Cc3cnn4ccccc34)CC2)n1,6.748370,26
2,C#CC1=CCC(N(CCC)CCCCn2cc(-c3ccc(-c4ccccc4)cc3)...,6.490481,14
3,C#CCN(CCN1CCN(c2ccccc2)CC1)C1CCc2ccc(O)cc2C1,6.609065,32
4,C1=C(c2ccccc2)CCN(Cc2cnn(-c3ccccc3)c2)C1,7.473269,12
...,...,...,...
262,c1ccc2c(c1)N=C(N1CCNCC1)c1ccccc1S2,7.420216,6
263,c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5[nH]4)cc3)CC...,6.568636,35
264,c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1,6.701147,35
265,c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1,5.931443,12


# Hyperparameter Optimization

In [4]:
def spearman_scorer(clf, X, y):
    """Scoring callable with the ``(estimator, X, y)`` signature.

    The frame handed to ``get_lo_metrics`` is chosen purely by the number of
    rows in ``X``: the global ``train`` frame is tried first, then the global
    ``test`` frame. ``y`` is accepted for signature compatibility but is not
    used here.

    NOTE(review): if ``train`` and ``test`` ever have the same number of rows,
    ``train`` always wins — confirm this cannot happen for the splits in use.

    Returns:
        The ``'spearman'`` entry of the metrics returned by ``get_lo_metrics``.

    Raises:
        ValueError: if ``len(X)`` matches neither ``train`` nor ``test``.
    """
    n_rows = len(X)
    if n_rows == len(train):
        reference = train
    elif n_rows == len(test):
        reference = test
    else:
        raise ValueError

    return get_lo_metrics(reference, clf.predict(X))['spearman']


In [5]:
def run_gb_gridsearch(train_fps, test_fps, n_iter=30, random_state=42):
    """Randomised hyperparameter search for a gradient-boosting regressor.

    A single predefined fold is used: every row of ``train_fps`` is always in
    the training part and every row of ``test_fps`` forms the validation part.
    Candidates are ranked with ``spearman_scorer``; the best configuration is
    then refitted on ``train_fps`` and evaluated on ``test_fps``.

    Targets are read from the ``'value'`` column of the global ``train`` and
    ``test`` frames, so the fingerprints must be row-aligned with them.

    NOTE(review): the validation fold is the same ``test_fps`` used for the
    returned metrics, so those metrics are selected-on and therefore
    optimistic for this split.

    Args:
        train_fps: sequence of training feature vectors, aligned with ``train``.
        test_fps: sequence of test feature vectors, aligned with ``test``.
        n_iter: number of parameter settings sampled by the search.
        random_state: seed passed to both the search and the estimators so a
            re-run reproduces the same candidates and fits; pass ``None`` for
            unseeded behaviour.

    Returns:
        Whatever ``get_lo_metrics(test, predictions)`` returns for the
        refitted best model.

    Raises:
        ValueError: if the fingerprint counts do not match the global frames
            (the scorer dispatches on these lengths).
    """
    if len(train_fps) != len(train) or len(test_fps) != len(test):
        raise ValueError(
            'Fingerprints are not aligned with the global train/test frames: '
            f'got {len(train_fps)}/{len(test_fps)} fingerprints for '
            f'{len(train)}/{len(test)} rows'
        )

    # -1 keeps a sample out of every validation fold; 0 assigns it to fold 0.
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold=split_index)

    X = list(train_fps) + list(test_fps)
    y = train['value'].to_list() + test['value'].to_list()

    param_distributions = {
        'n_estimators': [10, 50, 100, 150, 200, 250, 500],
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0],
        'subsample': [0.4, 0.7, 0.9, 1.0],
        'min_samples_split': [2, 3, 5, 7],
        'min_samples_leaf': [1, 3, 5],
        'max_depth': [2, 3, 4],
        'max_features': [None, 'sqrt'],
    }

    # refit=False: the search would otherwise refit on train + test together.
    search = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=random_state),
        param_distributions,
        cv=pds,
        n_iter=n_iter,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
        random_state=random_state,
    )
    search.fit(X, y)

    best_params = search.best_params_
    print(best_params)

    # Refit the winning configuration on the training rows only.
    best_model = GradientBoostingRegressor(**best_params, random_state=random_state)
    best_model.fit(train_fps, train['value'])

    test_preds = best_model.predict(test_fps)
    return get_lo_metrics(test, test_preds)


In [6]:
# Parse every SMILES into an RDKit molecule, then derive its MACCS keys.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
train_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, train_mols))

test_mols = list(map(Chem.MolFromSmiles, test['smiles']))
test_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, test_mols))

# Run the hyperparameter search on the MACCS features and report the metrics.
test_metrics = run_gb_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 30 candidates, totalling 30 fits
[CV 1/1] END learning_rate=0.7, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100, subsample=1.0;, score=0.184 total time=   0.5s
[CV 1/1] END learning_rate=1.0, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=10, subsample=1.0;, score=0.106 total time=   0.5s




[CV 1/1] END learning_rate=1.0, max_depth=3, max_features=None, min_samples_leaf=5, min_samples_split=7, n_estimators=500, subsample=0.4;, score=0.149 total time=   1.5s
[CV 1/1] END learning_rate=1.0, max_depth=4, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=250, subsample=0.4;, score=-0.023 total time=   0.6s
[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=500, subsample=0.7;, score=0.160 total time=   0.7s
[CV 1/1] END learning_rate=0.3, max_depth=4, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=200, subsample=0.9;, score=0.242 total time=   1.5s
[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=1.0;, score=0.035 total time=   0.8s
[CV 1/1] END learning_rate=0.3, max_depth=4, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=50, subsample=0.4;, score=0.138



[CV 1/1] END learning_rate=0.7, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=250, subsample=0.9;, score=0.166 total time=   0.6s




[CV 1/1] END learning_rate=0.01, max_depth=2, max_features=None, min_samples_leaf=5, min_samples_split=3, n_estimators=150, subsample=0.9;, score=0.035 total time=   0.9s
[CV 1/1] END learning_rate=0.5, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=1.0;, score=0.214 total time=   0.6s
[CV 1/1] END learning_rate=0.01, max_depth=3, max_features=None, min_samples_leaf=3, min_samples_split=7, n_estimators=200, subsample=0.4;, score=0.108 total time=   0.9s
[CV 1/1] END learning_rate=0.5, max_depth=2, max_features=None, min_samples_leaf=1, min_samples_split=3, n_estimators=10, subsample=0.4;, score=0.090 total time=   0.5s
[CV 1/1] END learning_rate=0.3, max_depth=3, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=250, subsample=0.4;, score=0.210 total time=   0.6s
[CV 1/1] END learning_rate=0.1, max_depth=2, max_features=None, min_samples_leaf=3, min_samples_split=5, n_estimators=250, subsample=0.4;, score=0.18

# Final Evaluation

In [7]:
def _maccs_fingerprints(smiles):
    """Return one MACCS-keys fingerprint per SMILES string, in input order."""
    fingerprints = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending string instead of an opaque error from RDKit.
        if mol is None:
            raise ValueError(f'Could not parse SMILES: {smi!r}')
        fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
    return fingerprints


def fit_predict(train, test, random_state=42):
    """Fit the gradient-boosting model on ``train`` and predict both splits.

    Both frames must provide a ``'smiles'`` column; ``train`` must also
    provide the ``'value'`` column used as the regression target. The
    hyperparameters are fixed in the body below.

    Args:
        train: DataFrame with ``'smiles'`` and ``'value'`` columns.
        test: DataFrame with a ``'smiles'`` column.
        random_state: seed for the regressor. The model subsamples rows
            (``subsample=0.9``), so a seed is needed for repeatable
            predictions; pass ``None`` for unseeded behaviour.

    Returns:
        ``(train_result, test_result)``: copies of the inputs with an added
        ``'preds'`` column. The input frames are not modified.

    Raises:
        ValueError: if any SMILES string cannot be parsed.
    """
    train_maccs_fps = _maccs_fingerprints(train['smiles'])
    test_maccs_fps = _maccs_fingerprints(test['smiles'])

    gb = GradientBoostingRegressor(
        n_estimators=200,
        subsample=0.9,
        min_samples_split=3,
        min_samples_leaf=5,
        max_features=None,
        max_depth=4,
        learning_rate=0.3,
        random_state=random_state,
    )
    gb.fit(train_maccs_fps, train['value'])

    # Work on copies so the caller's frames stay untouched.
    train_result = train.copy()
    train_result['preds'] = gb.predict(train_maccs_fps)

    test_result = test.copy()
    test_result['preds'] = gb.predict(test_maccs_fps)

    return train_result, test_result


In [8]:
# Fit and predict on each of the three splits and store the predictions.
# Loop-local names are used on purpose: the global `train`/`test` frames are
# read by the hyperparameter-search cells and must not be rebound here.
for i in [1, 2, 3]:
    # index_col=0 matches how split 1 is loaded earlier and keeps the stored
    # index from turning into an extra "Unnamed: 0" column in the outputs.
    split_train = pd.read_csv(f'../../../../data/lo/drd2/train_{i}.csv', index_col=0)
    split_test = pd.read_csv(f'../../../../data/lo/drd2/test_{i}.csv', index_col=0)

    train_preds, test_preds = fit_predict(split_train, split_test)
    train_preds.to_csv(f'../../../../predictions/lo/drd2/gb_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/drd2/gb_maccs/test_{i}.csv')
