In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wandb
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.svm import SVR
from scipy.stats import spearmanr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
import sys
# Extend the module search path so the `metrics` import below can be resolved.
# The entry is a relative path, so it depends on the kernel's working directory.
# NOTE(review): presumably `metrics.py` lives in that `code` directory — confirm.
sys.path.append('../../../../code')

from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load the train/test CSVs numbered 1 from data/lo/kcnh2, using the first CSV
# column as the DataFrame index. `train` and `test` are notebook-level names:
# spearman_scorer and run_svc_gridsearch read them as globals.
train = pd.read_csv('../../../../data/lo/kcnh2/train_1.csv', index_col=0)
test = pd.read_csv('../../../../data/lo/kcnh2/test_1.csv', index_col=0)

# Last expression of the cell: display the training frame.
train

Unnamed: 0,smiles,value,cluster
0,Brc1ccc2c(NC3=NC[C@@]4(CN5CCC4CC5)O3)ncnn12,5.601886,0
1,Brc1cnc2nc(N3CCN4CCC3CC4)oc2c1,5.638083,0
2,C#CCOc1cnc(C(=O)Nc2cc(F)c(F)c([C@@]3(C)N=C(N)S...,5.161088,0
3,C#Cc1cnc(Nc2cnc(C#N)c(O[C@H](C)CN(C)C)n2)cc1NC,5.096856,0
4,C#Cc1cnc(Nc2cnc(C#N)cn2)cc1NC[C@@H]1CNCCO1,5.086133,0
...,...,...,...
3308,c1cnc2c(N3CCN(CCCCc4ccc(OCCCN5CCCCCC5)cc4)CC3)...,5.799727,0
3309,c1cnc2c(N3CCN(CCCc4ccc(OCCCN5CCCCCC5)cc4)CC3)c...,5.999566,0
3310,c1cnc2c(N3CCN(CCc4ccc(OCCCN5CCCCCC5)cc4)CC3)cc...,5.099945,0
3311,c1cncc(-c2ccc(-c3noc(C4CN5CCC4CC5)n3)o2)c1,5.193752,0


# Hyperparameter Optimization

In [4]:
def spearman_scorer(clf, X, y):
    """Score a fitted estimator with the 'spearman' value from ``get_lo_metrics``.

    The ``(estimator, X, y)`` signature lets this be passed as ``scoring=`` to
    ``GridSearchCV``. ``y`` is accepted for that signature but is not used:
    the predictions are handed to ``get_lo_metrics`` together with the
    notebook-level ``train`` or ``test`` frame.

    The frame is chosen by comparing ``len(X)`` with the length of each frame.
    ``train`` is checked first, so it wins if both frames have the same length.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : sized collection of feature vectors; must have as many rows as either
        ``train`` or ``test``.
    y : ignored.

    Returns
    -------
    The ``'spearman'`` entry of the mapping returned by ``get_lo_metrics``.

    Raises
    ------
    ValueError
        If ``len(X)`` matches neither ``len(train)`` nor ``len(test)``.
    """
    n_samples = len(X)
    if n_samples == len(train):
        reference = train
    elif n_samples == len(test):
        reference = test
    else:
        # Report the sizes involved so a mismatch can be diagnosed.
        raise ValueError(
            f'X has {n_samples} rows, which matches neither train '
            f'({len(train)} rows) nor test ({len(test)} rows)'
        )

    y_pred = clf.predict(X)
    return get_lo_metrics(reference, y_pred)['spearman']


In [5]:
def run_svc_gridsearch(train_fps, test_fps):
    """Grid-search the SVR ``C`` parameter, then refit and score on ``test``.

    The search uses a single predefined fold: every training fingerprint is
    always used for fitting (fold id -1) and every test fingerprint forms the
    one validation fold (fold id 0). Candidates are ranked by
    ``spearman_scorer``. The best parameters are printed, a fresh ``SVR`` is
    fitted on ``train_fps`` against the notebook-level ``train['value']``, and
    its predictions for ``test_fps`` are passed to ``get_lo_metrics``.

    NOTE(review): the fold used to pick ``C`` is the same data the returned
    metrics are computed on — confirm this is the intended protocol.

    Parameters
    ----------
    train_fps, test_fps : lists of fingerprints (concatenated with ``+``), in
        the same row order as the notebook-level ``train`` / ``test`` frames.

    Returns
    -------
    Whatever ``get_lo_metrics(test, predictions)`` returns.
    """
    fold_ids = [-1] * len(train_fps) + [0] * len(test_fps)
    single_fold = PredefinedSplit(test_fold=fold_ids)

    features = train_fps + test_fps
    targets = train['value'].to_list() + test['value'].to_list()

    search = GridSearchCV(
        SVR(),
        {'C': [0.1, 0.5, 1.0, 2.0, 5.0, 7.0, 10.0, 12.0, 15.0, 17.0, 20.0]},
        cv=single_fold,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
    )
    search.fit(features, targets)

    best_params = search.best_params_
    print(best_params)

    # refit=False above, so train the final model explicitly on the train rows only.
    final_model = SVR(**best_params)
    final_model.fit(train_fps, train['value'])

    return get_lo_metrics(test, final_model.predict(test_fps))


In [6]:
# Parse the SMILES column of each split into RDKit molecules, then turn every
# molecule into its MACCS-keys fingerprint.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
train_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, train_mols))

test_mols = list(map(Chem.MolFromSmiles, test['smiles']))
test_maccs_fps = list(map(Chem.MACCSkeys.GenMACCSKeys, test_mols))

# Tune C on these fingerprints and report the resulting test metrics.
test_metrics = run_svc_gridsearch(train_maccs_fps, test_maccs_fps)
print(test_metrics)

Fitting 1 folds for each of 11 candidates, totalling 11 fits
[CV 1/1] END .............................C=0.1;, score=0.126 total time=   1.3s
[CV 1/1] END .............................C=0.5;, score=0.071 total time=   1.3s
[CV 1/1] END .............................C=1.0;, score=0.112 total time=   1.3s
[CV 1/1] END .............................C=2.0;, score=0.168 total time=   1.4s
[CV 1/1] END .............................C=5.0;, score=0.152 total time=   1.5s
[CV 1/1] END .............................C=7.0;, score=0.129 total time=   1.6s
[CV 1/1] END ............................C=10.0;, score=0.092 total time=   1.6s
[CV 1/1] END ............................C=12.0;, score=0.089 total time=   1.7s
[CV 1/1] END ............................C=15.0;, score=0.091 total time=   1.7s
[CV 1/1] END ............................C=17.0;, score=0.094 total time=   1.8s
[CV 1/1] END ............................C=20.0;, score=0.096 total time=   1.8s
{'C': 2.0}
{'r2': -1.0152228166440151, 'spearman

# Final Evaluation

In [7]:
def _maccs_fingerprints(smiles):
    """Turn an iterable of SMILES strings into a list of MACCS-keys fingerprints.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for an entry; the message
        names the position and the offending string.
    """
    fingerprints = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail here with the culprit rather than inside GenMACCSKeys.
            raise ValueError(
                f'RDKit could not parse SMILES at position {position}: {smi!r}'
            )
        fingerprints.append(Chem.MACCSkeys.GenMACCSKeys(mol))
    return fingerprints


def fit_predict(train, test, C=2.0):
    """Fit an SVR on MACCS fingerprints of ``train`` and predict both frames.

    Parameters
    ----------
    train, test : DataFrames with a ``'smiles'`` column; ``train`` must also
        have a ``'value'`` column, which is used as the regression target.
    C : SVR regularisation parameter. Defaults to 2.0, the value printed by
        the grid search earlier in this notebook.

    Returns
    -------
    (train_result, test_result) : copies of the input frames with an added
        ``'preds'`` column holding the model's predictions. The inputs are not
        modified.

    Raises
    ------
    ValueError
        If a SMILES string in either frame cannot be parsed.
    """
    train_fps = _maccs_fingerprints(train['smiles'])
    test_fps = _maccs_fingerprints(test['smiles'])

    model = SVR(C=C)
    model.fit(train_fps, train['value'])

    train_result = train.copy()
    train_result['preds'] = model.predict(train_fps)

    test_result = test.copy()
    test_result['preds'] = model.predict(test_fps)

    return train_result, test_result


In [8]:
# For each of the three kcnh2 splits: load the train/test CSVs, fit and predict
# with fit_predict, and write both frames (now carrying a `preds` column) to
# the svr_maccs predictions directory.
# NOTE(review): the loop rebinds the notebook-level `train` / `test`, which
# spearman_scorer and run_svc_gridsearch read as globals; after this cell they
# refer to split 3, not split 1.
# NOTE(review): unlike the split-1 load earlier, these reads do not pass
# index_col=0, so the first CSV column is kept as a regular column and to_csv
# writes a fresh index next to it — confirm downstream readers expect that layout.
for i in [1, 2, 3]:
    train = pd.read_csv(f'../../../../data/lo/kcnh2/train_{i}.csv')
    test = pd.read_csv(f'../../../../data/lo/kcnh2/test_{i}.csv')

    train_preds, test_preds = fit_predict(train, test)
    train_preds.to_csv(f'../../../../predictions/lo/kcnh2/svr_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/kcnh2/svr_maccs/test_{i}.csv')
