In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.svm import SVR

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/simon/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load split 1 of the KDR "lo" data. `index_col=0` turns the first CSV column
# into the DataFrame index. `train` and `test` are module-level names that the
# scorer and grid-search helpers below read as globals.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/lo/kdr/train_1.csv',
        '../../../../data/lo/kdr/test_1.csv',
    )
)

train

Unnamed: 0,smiles,value,cluster
0,C/C(=N\OC(C)C)c1ccc2c(c1)c1c3c(c4c(c1n2CC(C)C)...,7.897940,0
1,C/C(=N\OCC(C)C)c1ccc2[nH]c3c4c(c5c(c3c2c1)CNC5...,8.129819,0
2,C=CC(=O)Nc1cc2c(Nc3c(F)cc(Br)cc3F)ncnc2cc1OCC1...,6.826814,0
3,C=CC(=O)Nc1cc2c(Nc3cc(Cl)c(Br)cc3F)ncnc2cc1OCC...,6.376751,0
4,C=CC(=O)Nc1cc2c(Nc3cc(Cl)c(Cl)cc3Cl)ncnc2cc1OC...,6.102373,0
...,...,...,...
495,c1ccc(-c2ccc(Nc3nnc(-c4cccnc4CCc4ccncc4)o3)cc2...,5.579879,0
496,c1ccc(Nc2ncc3c(n2)-c2ccccc2SC3)cc1,5.086133,0
497,c1ccc(Oc2ccc(Nc3ncnc4ccccc34)cc2)cc1,5.565271,0
498,c1ccc2c(c1)c(-c1cncc(-c3ccsc3)c1)cn2CCN1CCOCC1,7.214670,0


# Hyperparameter Optimization

In [4]:
def spearman_scorer(clf, X, y):
    """Scorer callable (estimator, X, y) returning the 'spearman' LO metric.

    The reference DataFrame handed to `get_lo_metrics` is chosen purely by
    the number of rows in `X`: the global `train` when the lengths match,
    otherwise the global `test`. `y` is accepted to satisfy the scorer
    signature but is not used.

    NOTE(review): if `train` and `test` ever have the same length, every call
    is scored against `train`.

    Raises:
        ValueError: if `len(X)` matches neither `train` nor `test`.
    """
    n_rows = len(X)
    if n_rows == len(train):
        reference = train
    elif n_rows == len(test):
        reference = test
    else:
        raise ValueError

    # Predict only after the reference frame is resolved, so an unexpected
    # input size fails before any model call.
    predictions = clf.predict(X)
    return get_lo_metrics(reference, predictions)['spearman']


In [5]:
def run_svc_gridsearch(train_fps, test_fps):
    """Grid-search the SVR `C` parameter and report LO metrics on the test set.

    The search uses one predefined fold: rows from `train_fps` are marked -1
    (never used for validation) and rows from `test_fps` are marked 0 (the
    single validation fold). Candidates are ranked with `spearman_scorer`.
    Targets come from the global `train` / `test` DataFrames ('value' column).

    Args:
        train_fps: list of training fingerprints (concatenated with `+`).
        test_fps: list of test fingerprints.

    Returns:
        Whatever `get_lo_metrics(test, predictions)` returns for an SVR refit
        on the training fingerprints with the best parameters found.
    """
    n_train, n_test = len(train_fps), len(test_fps)
    fold_of_row = [-1 for _ in range(n_train)] + [0 for _ in range(n_test)]
    holdout_split = PredefinedSplit(test_fold=fold_of_row)

    features = train_fps + test_fps
    targets = train['value'].to_list() + test['value'].to_list()

    c_grid = {
        'C': [0.1, 0.5, 1.0, 2.0, 5.0, 7.0, 10.0, 12.0, 15.0, 17.0, 20.0],
    }

    # refit=False: the search only ranks candidates; the final model is
    # fitted explicitly below on the training fingerprints alone.
    searcher = GridSearchCV(
        SVR(),
        c_grid,
        cv=holdout_split,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
    )
    searcher.fit(features, targets)

    best_params = searcher.best_params_
    print(best_params)

    final_model = SVR(**best_params)
    final_model.fit(train_fps, train['value'])

    return get_lo_metrics(test, final_model.predict(test_fps))


In [6]:
# Featurise the currently loaded split: SMILES -> RDKit molecule -> MACCS keys.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
train_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, train_mols))

test_mols = list(map(Chem.MolFromSmiles, test['smiles']))
test_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, test_mols))

# Search over C and print the metrics of the refit model on the test set.
test_metrics = run_svc_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 11 candidates, totalling 11 fits
[CV 1/1] END ............................C=0.1;, score=-0.016 total time=   0.1s
[CV 1/1] END ............................C=0.5;, score=-0.008 total time=   0.1s
[CV 1/1] END ............................C=1.0;, score=-0.007 total time=   0.1s
[CV 1/1] END .............................C=2.0;, score=0.031 total time=   0.2s
[CV 1/1] END .............................C=5.0;, score=0.111 total time=   0.2s
[CV 1/1] END .............................C=7.0;, score=0.117 total time=   0.2s
[CV 1/1] END ............................C=10.0;, score=0.121 total time=   0.2s
[CV 1/1] END ............................C=12.0;, score=0.114 total time=   0.2s
[CV 1/1] END ............................C=15.0;, score=0.110 total time=   0.2s
[CV 1/1] END ............................C=17.0;, score=0.113 total time=   0.2s
[CV 1/1] END ............................C=20.0;, score=0.116 total time=   0.2s
{'C': 10.0}
{'r2': -1.4820625917605594, 'spearma

# Final Evaluation

In [7]:
def _maccs_fingerprints(smiles):
    """Convert an iterable of SMILES strings into a list of MACCS key fingerprints.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings
            (`Chem.MolFromSmiles` returned None).
    """
    fingerprints = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending input instead of an opaque error from
            # the fingerprint generator.
            raise ValueError(
                f'Could not parse SMILES at position {position}: {smi!r}'
            )
        fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
    return fingerprints


def fit_predict(train, test, C=10.0):
    """Fit an SVR on MACCS fingerprints and predict for both splits.

    Args:
        train: DataFrame with a 'smiles' column and a 'value' target column.
        test: DataFrame with a 'smiles' column.
        C: SVR regularisation parameter. Defaults to 10.0, the value printed
            as best by the grid search in this notebook.

    Returns:
        (train_result, test_result): copies of the inputs with an added
        'preds' column holding the model predictions. The input frames are
        not modified.

    Raises:
        ValueError: if any SMILES string cannot be parsed.
    """
    train_maccs_fps = _maccs_fingerprints(train['smiles'])
    test_maccs_fps = _maccs_fingerprints(test['smiles'])

    svr = SVR(C=C)
    svr.fit(train_maccs_fps, train['value'])

    train_result = train.copy()
    train_result['preds'] = svr.predict(train_maccs_fps)

    test_result = test.copy()
    test_result['preds'] = svr.predict(test_maccs_fps)

    return train_result, test_result


In [8]:
# Fit the final model on each of the three KDR splits and save the train/test
# predictions as CSV files.
#
# NOTE: this loop rebinds the module-level `train` and `test`, which
# `spearman_scorer` and `run_svc_gridsearch` read as globals. Re-running the
# hyperparameter cells after this one will use the last split loaded here.
# NOTE(review): unlike the initial load, these reads omit `index_col=0`, so the
# saved files presumably carry the original index as an extra column. Confirm
# that downstream readers expect this.

# `to_csv` does not create missing parent directories, so make sure the
# destination exists before writing anything.
os.makedirs('../../../../predictions/lo/kdr/svr_maccs', exist_ok=True)

for i in [1, 2, 3]:
    train = pd.read_csv(f'../../../../data/lo/kdr/train_{i}.csv')
    test = pd.read_csv(f'../../../../data/lo/kdr/test_{i}.csv')

    train_preds, test_preds = fit_predict(train, test)
    train_preds.to_csv(f'../../../../predictions/lo/kdr/svr_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/kdr/svr_maccs/test_{i}.csv')
