In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.svm import SVR
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
# Put '../../../../code' on the module search path before the import below;
# presumably that is where the project's `metrics` module lives (TODO confirm).
sys.path.append('../../../../code')

from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Read split 1 of the DRD2 "lo" data; the first CSV column becomes the index.
# The generator yields the train frame first, then the test frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/lo/drd2/train_1.csv',
        '../../../../data/lo/drd2/test_1.csv',
    )
)

# Last expression of the cell: render the training frame.
train

Unnamed: 0,smiles,value,cluster
0,Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,5.283913,0
1,Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12,7.437357,0
2,Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1,7.288705,0
3,Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,6.035740,0
4,C#Cc1cccn1C1CCN(Cc2ccccc2)CC1,5.190490,0
...,...,...,...
2201,c1ccc(OCC2CN(Cc3c[nH]c4ccccc34)CCO2)cc1,6.396856,0
2202,c1ccc(OCCCNCCOc2ccccc2)cc1,6.598272,0
2203,c1ccc2c(C3CCNC3)cccc2c1,6.576754,0
2204,c1ccc2c(c1)CCN1CCc3[nH]c4ccccc4c3C21,5.830620,0


# Hyperparameter Optimization

In [4]:
def spearman_scorer(clf, X, y):
    """Score a fitted estimator by the 'spearman' entry of `get_lo_metrics`.

    Uses the scikit-learn scorer-callable signature ``(estimator, X, y)``.
    `y` is not used: the reference frame (module-level `train` or `test`) is
    selected by matching ``len(X)`` and passed to `get_lo_metrics` together
    with the predictions for `X`.

    Caveat: selecting by length cannot distinguish the two frames when they
    have the same number of rows; `train` is used in that case.

    Args:
        clf: fitted estimator exposing ``predict``.
        X: samples to predict on; must be all of `train` or all of `test`,
            in the same row order as the corresponding frame.
        y: ignored (required by the scorer signature).

    Returns:
        The value stored under ``'spearman'`` in the metrics returned by
        `get_lo_metrics`.

    Raises:
        ValueError: if ``len(X)`` matches neither `train` nor `test`, for
            example after those globals were rebound to another split.
    """
    n_rows = len(X)
    if n_rows == len(train):
        reference = train
    elif n_rows == len(test):
        reference = test
    else:
        # Say what was received and what was expected, so a mismatch between
        # the fingerprints and the current `train` / `test` is easy to spot.
        raise ValueError(
            f"spearman_scorer got {n_rows} samples, which matches neither "
            f"train ({len(train)} rows) nor test ({len(test)} rows)"
        )

    y_pred = clf.predict(X)
    metrics = get_lo_metrics(reference, y_pred)
    return metrics['spearman']


In [5]:
def run_svc_gridsearch(train_fps, test_fps):
    """Grid-search the SVR `C` value and report metrics for the best one.

    The search uses a single predefined split: every training fingerprint is
    fit-only and every test fingerprint forms the one validation fold, scored
    with `spearman_scorer`. The winning parameters are printed, a fresh SVR is
    fitted on the training fingerprints with them, and its test predictions
    are passed to `get_lo_metrics`.

    Note that `C` is selected on the same test set the returned metrics are
    computed on, so those metrics are not an independent estimate.

    Args:
        train_fps: list of fingerprints aligned with the rows of the
            module-level `train` frame.
        test_fps: list of fingerprints aligned with the rows of the
            module-level `test` frame.

    Returns:
        Whatever `get_lo_metrics(test, predictions)` returns.
    """
    # Candidate values for the SVR regularisation parameter.
    c_grid = {
        'C': [0.1, 0.5, 1.0, 2.0, 5.0, 7.0, 10.0, 12.0, 15.0, 17.0, 20.0],
    }

    # -1 marks samples that are never held out; 0 marks the single held-out fold.
    fold_of_sample = [-1] * len(train_fps) + [0] * len(test_fps)
    single_split = PredefinedSplit(test_fold=fold_of_sample)

    # GridSearchCV receives one dataset: train rows first, test rows after.
    features = train_fps + test_fps
    targets = train['value'].to_list() + test['value'].to_list()

    search = GridSearchCV(
        SVR(),
        c_grid,
        cv=single_split,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
    )
    search.fit(features, targets)

    best_params = search.best_params_
    print(best_params)

    # refit=False above, so the final model is fitted here on train only.
    final_model = SVR(**best_params)
    final_model.fit(train_fps, train['value'])

    return get_lo_metrics(test, final_model.predict(test_fps))


In [6]:
# Parse the SMILES column of each frame into RDKit molecules.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
test_mols = list(map(Chem.MolFromSmiles, test['smiles']))

# MACCS-key fingerprints, one per molecule, in row order.
train_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, train_mols))
test_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, test_mols))

# Tune C on these fingerprints and print the resulting test metrics.
test_metrics = run_svc_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 11 candidates, totalling 11 fits
[CV 1/1] END .............................C=0.1;, score=0.181 total time=   0.8s
[CV 1/1] END .............................C=0.5;, score=0.178 total time=   0.7s
[CV 1/1] END .............................C=1.0;, score=0.211 total time=   0.8s
[CV 1/1] END .............................C=2.0;, score=0.171 total time=   0.8s
[CV 1/1] END .............................C=5.0;, score=0.200 total time=   0.8s
[CV 1/1] END .............................C=7.0;, score=0.167 total time=   0.8s
[CV 1/1] END ............................C=10.0;, score=0.184 total time=   0.8s
[CV 1/1] END ............................C=12.0;, score=0.170 total time=   0.8s
[CV 1/1] END ............................C=15.0;, score=0.166 total time=   0.9s
[CV 1/1] END ............................C=17.0;, score=0.169 total time=   0.9s
[CV 1/1] END ............................C=20.0;, score=0.172 total time=   0.9s
{'C': 1.0}
{'r2': -0.5670737668511453, 'spearman

# Final Evaluation

In [7]:
def _maccs_fingerprints(smiles):
    """Return the MACCS-key fingerprint of every SMILES string, in input order."""
    fingerprints = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles returns None for a string it cannot parse; fail here
            # with the offending value rather than inside the fingerprint call.
            raise ValueError(
                f"could not parse SMILES at position {position}: {smi!r}"
            )
        fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
    return fingerprints


def fit_predict(train, test, C=1.0):
    """Fit an SVR on MACCS fingerprints of `train` and predict both frames.

    Args:
        train: DataFrame with a 'smiles' column and a 'value' target column.
        test: DataFrame with a 'smiles' column.
        C: SVR regularisation parameter. Defaults to 1.0, the value printed
            as best by the hyperparameter search in this notebook.

    Returns:
        ``(train_result, test_result)``: copies of the two input frames, each
        with an added 'preds' column holding the model's predictions. The
        inputs themselves are not modified.

    Raises:
        ValueError: if a SMILES string in either frame cannot be parsed.
    """
    train_fps = _maccs_fingerprints(train['smiles'])
    test_fps = _maccs_fingerprints(test['smiles'])

    model = SVR(C=C)
    model.fit(train_fps, train['value'])

    # Work on copies so the caller's frames keep their original columns.
    train_result = train.copy()
    train_result['preds'] = model.predict(train_fps)

    test_result = test.copy()
    test_result['preds'] = model.predict(test_fps)

    return train_result, test_result


In [8]:
# Fit on each of the three DRD2 splits and save train/test predictions.
#
# The frames get loop-local names on purpose: `spearman_scorer` and
# `run_svc_gridsearch` read the module-level `train` / `test`, so rebinding
# those here would make a later re-run of the tuning cells score against the
# last split instead of split 1.
for i in [1, 2, 3]:
    # index_col=0 matches how split 1 is loaded earlier in the notebook and
    # keeps the stored index from being written back out as an extra
    # 'Unnamed: 0' data column.
    train_split = pd.read_csv(f'../../../../data/lo/drd2/train_{i}.csv', index_col=0)
    test_split = pd.read_csv(f'../../../../data/lo/drd2/test_{i}.csv', index_col=0)

    train_preds, test_preds = fit_predict(train_split, test_split)
    train_preds.to_csv(f'../../../../predictions/lo/drd2/svr_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/drd2/svr_maccs/test_{i}.csv')
