In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

from scipy.stats import spearmanr
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import PredefinedSplit, GridSearchCV

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_lo_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/simon/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load split 1 of the KDR "lo" task; the first CSV column is used as the
# DataFrame index for both frames.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../../data/lo/kdr/train_1.csv',
        '../../../../data/lo/kdr/test_1.csv',
    )
)

test

Unnamed: 0,smiles,value,cluster
0,Brc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1,6.419075,51
1,C=CC(=O)Nc1ccc(-c2ccc(NC(=O)Nc3ccc(F)cc3)cc2)cn1,8.047208,32
2,C=CC(=O)Nc1ccc(-c2ccc(NC(=O)Nc3cccc(C(C)C)c3)c...,8.508638,32
3,C=CC(=O)Nc1ccc(-c2ccc(NC(=O)Nc3cccc(Cl)c3)cc2)cn1,8.474955,32
4,C=CC(=O)Nc1cccc(-c2ccc(NC(=O)Nc3c(C)cccc3C)cc2)n1,6.380687,32
...,...,...,...
432,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCNCC6)cc5)cnc34)cc...,6.666150,39
433,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCOCC6)cc5)cnc34)cc...,5.273191,39
434,c1ccc2c(-c3cnn4cc(-c5ccc(OCCN6CCOCC6)cc5)cnc34...,5.616364,39
435,c1ccc2c(-c3nc4cc(-n5ccnc5)ccc4[nH]3)[nH]nc2c1,7.075721,45


In [4]:
def spearman_scorer(clf, X, y):
    """Scorer callable for GridSearchCV that returns the 'spearman' entry of get_lo_metrics.

    The frame handed to ``get_lo_metrics`` is chosen purely by row count:
    ``len(X) == len(train)`` selects the notebook-global ``train`` frame,
    otherwise ``len(X) == len(test)`` selects the global ``test`` frame.
    If both frames have the same length, ``train`` wins.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features to predict on; only its length and ``clf.predict(X)`` are used.
    y : ignored. It is accepted only because the scorer signature requires it;
        the reference values are taken from the selected global frame instead.

    Returns
    -------
    The value stored under ``'spearman'`` in the dict returned by ``get_lo_metrics``.

    Raises
    ------
    ValueError
        If ``len(X)`` matches neither ``len(train)`` nor ``len(test)``.
    """
    n_rows = len(X)
    if n_rows == len(train):
        reference = train
    elif n_rows == len(test):
        reference = test
    else:
        # Without the matching frame the metric cannot be computed, so fail
        # loudly and say which sizes were expected.
        raise ValueError(
            f'spearman_scorer received {n_rows} rows, which matches neither '
            f'len(train)={len(train)} nor len(test)={len(test)}'
        )

    return get_lo_metrics(reference, clf.predict(X))['spearman']


In [5]:
from scipy.spatial.distance import jaccard


def run_knn_gridsearch_tanimoto(train_fps, test_fps):
    """Grid-search a KNN regressor with the scipy ``jaccard`` metric, then score the winner.

    The search uses one predefined fold: the ``train_fps`` samples are marked
    ``-1`` (never held out) and the ``test_fps`` samples are marked ``0`` (the
    single validation fold). Candidates are ranked with ``spearman_scorer``.
    Targets come from the ``'value'`` column of the notebook-global ``train``
    and ``test`` frames, so the fingerprints must be in the same row order.

    Parameters
    ----------
    train_fps, test_fps : sequences of fingerprints; they are concatenated
        with ``+``, so both must support that operation.

    Returns
    -------
    The result of ``get_lo_metrics(test, predictions)`` for a model refitted
    on ``train_fps`` with the best parameters found. The best parameters are
    also printed.
    """
    fold_of_sample = [-1] * len(train_fps) + [0] * len(test_fps)
    single_split = PredefinedSplit(test_fold=fold_of_sample)

    features = train_fps + test_fps
    targets = train['value'].to_list() + test['value'].to_list()

    search = GridSearchCV(
        KNeighborsRegressor(metric=jaccard),
        {
            'n_neighbors': [1, 3, 5, 7, 10, 12],
            'weights': ['uniform', 'distance'],
        },
        cv=single_split,
        refit=False,
        scoring=spearman_scorer,
        verbose=3,
    )
    search.fit(features, targets)

    chosen = search.best_params_
    print(chosen)

    # refit=False above, so the winning configuration is fitted by hand here.
    final_model = KNeighborsRegressor(
        n_neighbors=chosen['n_neighbors'],
        weights=chosen['weights'],
        metric=jaccard,
    )
    final_model.fit(train_fps, train['value'])

    return get_lo_metrics(test, final_model.predict(test_fps))


In [6]:
# Parse the SMILES of both frames into RDKit molecules.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
test_mols = list(map(Chem.MolFromSmiles, test['smiles']))

# Morgan bit-vector fingerprints with radius 2 and 1024 bits.
train_morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in train_mols]
test_morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in test_mols]

# Tune the KNN hyperparameters and report the metrics of the best setting.
test_metrics = run_knn_gridsearch_tanimoto(train_morgan_fps, test_morgan_fps)
print(test_metrics)

Fitting 1 folds for each of 12 candidates, totalling 12 fits




[CV 1/1] END ....n_neighbors=1, weights=uniform;, score=0.000 total time=   1.5s




[CV 1/1] END ..n_neighbors=1, weights=distance;, score=-0.054 total time=   1.6s
[CV 1/1] END ....n_neighbors=3, weights=uniform;, score=0.062 total time=   2.0s
[CV 1/1] END ...n_neighbors=3, weights=distance;, score=0.151 total time=   1.9s
[CV 1/1] END ....n_neighbors=5, weights=uniform;, score=0.029 total time=   2.1s
[CV 1/1] END ...n_neighbors=5, weights=distance;, score=0.086 total time=   2.1s
[CV 1/1] END ....n_neighbors=7, weights=uniform;, score=0.034 total time=   2.2s
[CV 1/1] END ...n_neighbors=7, weights=distance;, score=0.074 total time=   2.0s
[CV 1/1] END ...n_neighbors=10, weights=uniform;, score=0.008 total time=   2.2s
[CV 1/1] END ..n_neighbors=10, weights=distance;, score=0.038 total time=   2.2s
[CV 1/1] END ..n_neighbors=12, weights=uniform;, score=-0.012 total time=   2.2s
[CV 1/1] END ..n_neighbors=12, weights=distance;, score=0.038 total time=   2.1s
{'n_neighbors': 3, 'weights': 'distance'}
{'r2': -1.4841772665695618, 'spearman': 0.15093838393720363, 'mae':

# Final Evaluation

In [7]:
def _smiles_to_morgan_fps(smiles, radius, n_bits):
    """Turn an iterable of SMILES strings into Morgan bit-vector fingerprints.

    Raises ValueError naming the offending string when RDKit cannot parse a
    SMILES (``Chem.MolFromSmiles`` returns ``None`` in that case).
    """
    fps = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f'RDKit could not parse SMILES: {smi!r}')
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits))
    return fps


def fit_predict(train, test, n_neighbors=3, weights='distance', radius=2, n_bits=1024):
    """Fit a KNN regressor on ``train`` and predict for both ``train`` and ``test``.

    Molecules are read from the ``'smiles'`` column, encoded as Morgan
    bit-vector fingerprints, and compared with the scipy ``jaccard`` metric.
    The regression target is the ``'value'`` column of ``train``.

    Parameters
    ----------
    train, test : DataFrames with a ``'smiles'`` column; ``train`` also needs
        a ``'value'`` column. Neither input is modified.
    n_neighbors, weights : forwarded to ``KNeighborsRegressor``. The defaults
        (3, 'distance') are the values this function previously hard-coded.
    radius, n_bits : Morgan fingerprint radius and bit length; the defaults
        (2, 1024) are the values this function previously hard-coded.

    Returns
    -------
    (train_result, test_result) : copies of the inputs with an added
        ``'preds'`` column holding the model predictions.

    Raises
    ------
    ValueError
        If any SMILES string cannot be parsed by RDKit.
    """
    train_fps = _smiles_to_morgan_fps(train['smiles'], radius, n_bits)
    test_fps = _smiles_to_morgan_fps(test['smiles'], radius, n_bits)

    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, metric=jaccard)
    knn.fit(train_fps, train['value'])

    # Work on copies so the caller's frames keep their original columns.
    train_result = train.copy()
    train_result['preds'] = knn.predict(train_fps)

    test_result = test.copy()
    test_result['preds'] = knn.predict(test_fps)

    return train_result, test_result


In [8]:
# Fit on every split and store train/test predictions for later evaluation.
# The frames get loop-local names on purpose: spearman_scorer and
# run_knn_gridsearch_tanimoto read the notebook-global `train` / `test`, so
# rebinding those here would silently change what a re-run of the tuning
# cell operates on.
# NOTE(review): these reads do not pass index_col=0 (the split-1 load used
# for tuning does), so the first CSV column is kept as a regular column and
# written back out. Left unchanged to preserve the prediction file layout;
# confirm against whatever reads these files.
for i in [1, 2, 3]:
    split_train = pd.read_csv(f'../../../../data/lo/kdr/train_{i}.csv')
    split_test = pd.read_csv(f'../../../../data/lo/kdr/test_{i}.csv')

    train_preds, test_preds = fit_predict(split_train, split_test)
    train_preds.to_csv(f'../../../../predictions/lo/kdr/knn_ecfp4/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/lo/kdr/knn_ecfp4/test_{i}.csv')
