In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_hi_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load the first HI split of the DRD2 data. The first CSV column is used as the
# row index, so the frames keep the identifiers stored in the files.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/hi/drd2/train_1.csv',
        '../../../../data/hi/drd2/test_1.csv',
    )
)

# Bare last expression: show the training frame with the notebook's rich display.
train

Unnamed: 0,smiles,value
383,CC(C)Oc1ccccc1N1CCN(Cc2cccc(C(=O)N3CCCCC3)c2)CC1,True
386,CC(C)Oc1ccccc1N1CCN(Cc2cccc(CN3CCCCC3=O)c2)CC1,True
389,CC(C)Oc1ccccc1N1CCN(Cc2ccccc2CN2CCCCC2=O)CC1,True
2695,COc1ccccc1N1CCN(CC2COCC(c3ccccc3)(c3ccccc3)O2)CC1,True
2995,COc1ccccc1N1CCN(C[C@H]2OCCOC2(c2ccccc2)c2ccccc...,False
...,...,...
5444,O=C1c2ccccc2C(=O)N1CCCCN1CCCN(C(c2ccccc2)c2ccc...,True
4391,O=C(CCC(=O)c1ccccc1)NCCc1c[nH]c2ccccc12,False
4397,O=C(CCCC(=O)c1ccccc1)NCCc1c[nH]c2ccccc12,False
5999,OC12C3C4CC5C6C4C1C6C(C53)N2CC1CCCCC1,False


# Hyperparameter Optimization

In [4]:
def run_svc_gridsearch(train_fps, test_fps, train_df=None, test_df=None):
    """Grid-search the SVC ``C`` parameter on a fixed train/test split.

    Every candidate ``C`` is fitted on the training fingerprints and scored on
    the test fingerprints (a single predefined fold, ``average_precision``
    scoring). The best ``C`` is printed, a fresh ``SVC`` with that value is
    fitted on the training data, and its test predictions are passed to
    ``get_hi_metrics``.

    Parameters
    ----------
    train_fps, test_fps : sequence
        One fingerprint per row of the training / test frame, in the same
        order as the rows. Any iterable is accepted; both are materialised
        as lists.
    train_df, test_df : pandas.DataFrame, optional
        Frames providing the ``value`` label column; ``test_df`` is also
        handed to ``get_hi_metrics``. When omitted, the notebook-level
        ``train`` / ``test`` frames are used, which keeps the original
        two-argument call working.

    Returns
    -------
    The object returned by ``get_hi_metrics`` (in the recorded run, a dict
    with ``roc_auc``, ``bedroc`` and ``prc_auc`` entries).

    Raises
    ------
    ValueError
        If the number of fingerprints does not match the number of rows in
        the corresponding frame.
    """
    # Fall back to the notebook globals only when no frame is passed, so the
    # labels no longer depend silently on whichever cell last rebound them.
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # Materialise as lists: `+` below must concatenate, whereas on NumPy
    # arrays it would add element-wise.
    train_fps = list(train_fps)
    test_fps = list(test_fps)
    if len(train_fps) != len(train_df) or len(test_fps) != len(test_df):
        raise ValueError(
            'Fingerprint/label length mismatch: '
            f'{len(train_fps)} train fps for {len(train_df)} rows, '
            f'{len(test_fps)} test fps for {len(test_df)} rows'
        )

    # -1 keeps a sample in the training part of every split; 0 assigns it to
    # the single validation fold. The "cross-validation" is therefore exactly
    # one train -> test evaluation.
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold=split_index)

    X = train_fps + test_fps
    y = train_df['value'].to_list() + test_df['value'].to_list()

    params = {
        'C': [0.1, 0.5, 1.0, 2.0, 5.0],
    }

    # refit=False: the search would otherwise refit on train + test combined;
    # the final model below is fitted on the training data only.
    # NOTE(review): C is selected on the same test fold whose metrics are
    # reported below, so those metrics are optimistic — confirm this is the
    # intended protocol.
    grid_search = GridSearchCV(
        SVC(),
        params,
        cv=pds,
        refit=False,
        scoring='average_precision',
        verbose=3,
    )
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    print(best_params)

    svc = SVC(**best_params)
    svc.fit(train_fps, train_df['value'])

    # NOTE(review): predict() yields hard class labels, while the metric names
    # seen in the recorded output (roc_auc, bedroc, prc_auc) are ranking
    # metrics — confirm get_hi_metrics is meant to receive labels rather than
    # continuous scores such as svc.decision_function(test_fps).
    test_preds = svc.predict(test_fps)
    test_metrics = get_hi_metrics(test_df, test_preds)
    return test_metrics


In [5]:
# Featurise both frames: parse each SMILES string into an RDKit molecule, then
# compute its MACCS-keys fingerprint.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
train_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, train_mols))

test_mols = list(map(Chem.MolFromSmiles, test['smiles']))
test_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, test_mols))

# Tune C on the MACCS fingerprints and print the resulting test metrics.
test_metrics = run_svc_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 5 candidates, totalling 5 fits
[CV 1/1] END .............................C=0.1;, score=0.636 total time=   1.8s
[CV 1/1] END .............................C=0.5;, score=0.649 total time=   1.7s
[CV 1/1] END .............................C=1.0;, score=0.653 total time=   1.6s
[CV 1/1] END .............................C=2.0;, score=0.660 total time=   1.6s
[CV 1/1] END .............................C=5.0;, score=0.674 total time=   1.6s
{'C': 5.0}
{'roc_auc': 0.5831501831501832, 'bedroc': 0.5765895736056725, 'prc_auc': 0.6609967992684043}


# Final Evaluation

In [6]:
def fit_predict(train, test, C=5.0):
    """Fit an SVC on MACCS-keys fingerprints and predict for both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``smiles`` column. ``train`` must also contain the
        ``value`` column, which is used as the label. Neither frame is
        modified; the results are copies.
    C : float, default 5.0
        Regularisation parameter passed to ``SVC``. The default is the value
        printed by the grid search earlier in this notebook, so existing
        two-argument calls behave as before.

    Returns
    -------
    (train_result, test_result) : tuple of pandas.DataFrame
        Copies of ``train`` and ``test`` with an added ``preds`` column
        holding the output of ``svc.predict``.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    def maccs_fingerprints(frame):
        # One fingerprint per row, in row order.
        fps = []
        for smiles in frame['smiles']:
            mol = Chem.MolFromSmiles(smiles)
            # RDKit reports a parse failure by returning None; fail here with
            # the offending string instead of inside the fingerprint call.
            if mol is None:
                raise ValueError(f'Could not parse SMILES: {smiles!r}')
            fps.append(Chem.MACCSkeys.GenMACCSKeys(mol))
        return fps

    train_maccs_fps = maccs_fingerprints(train)
    test_maccs_fps = maccs_fingerprints(test)

    svc = SVC(C=C)
    svc.fit(train_maccs_fps, train['value'])

    # Work on copies so the caller's frames do not gain a `preds` column.
    train_result = train.copy()
    train_result['preds'] = svc.predict(train_maccs_fps)

    test_result = test.copy()
    test_result['preds'] = svc.predict(test_maccs_fps)

    return train_result, test_result


In [7]:
# Final evaluation: for each of the three HI splits, fit on the training file,
# predict for both files and save the frames (with the added `preds` column).
for i in range(1, 4):
    # NOTE(review): these reads omit the index_col=0 used when split 1 was
    # loaded for tuning, so the first CSV column presumably stays a data column
    # and to_csv below writes a fresh positional index next to it — confirm
    # downstream readers expect that layout.
    # NOTE(review): this rebinds the notebook-level `train` / `test`; cells
    # re-run afterwards will see split 3 rather than split 1.
    train, test = (
        pd.read_csv(csv_path)
        for csv_path in (
            f'../../../../data/hi/drd2/train_{i}.csv',
            f'../../../../data/hi/drd2/test_{i}.csv',
        )
    )

    train_preds, test_preds = fit_predict(train, test)

    # The target directory must already exist; to_csv does not create it.
    train_preds.to_csv(f'../../../../predictions/hi/drd2/svc_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/hi/drd2/svc_maccs/test_{i}.csv')
