In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV

In [4]:
import sys
sys.path.append('../../code')

from metrics import get_hi_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading some PyTorch models, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'torch'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [5]:
train = pd.read_csv('../../data/raw/hiv_naive_train.csv')
test = pd.read_csv('../../data/raw/hiv_naive_test.csv')

train

Unnamed: 0.1,Unnamed: 0,smiles,value
0,0,COC(=O)c1ccccc1C1CN=NC12Cc1cc3c(cc1C2=O)CCC3,0
1,1,CN(C)c1ccc(C=C(C#N)c2cccc(Cl)c2)cc1,0
2,2,CCOc1ccc(N=Cc2c3ccccc3nc3ccccc23)cc1,0
3,3,Clc1ccccc1CSc1cnnc2ccccc12,0
4,4,CCOC(=O)C1ON2OC(OC3CCCCC3c3ccccc3)CC3OC(=O)C1C32,0
...,...,...,...
30840,30840,CC(=O)n1c(=O)c2cc3c(=O)n(-c4cccc(C#N)c4)c(=O)c...,0
30841,30841,COc1cc(C=C2C(=O)N(C(=O)c3ccc(Cl)cc3)N=C2C)cc(O...,0
30842,30842,Cc1ccc(C2=C(C#N)C(=O)N3CCN=C3S2)cc1,0
30843,30843,Cn1c(=O)c2nsnc2n(C)c1=O,0


In [6]:
def get_fingerprints(smiles):
    mols = [Chem.MolFromSmiles(x) for x in smiles]
    return [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in mols]

train_fps = get_fingerprints(train['smiles'])
test_fps = get_fingerprints(test['smiles'])



In [8]:
from scipy.spatial.distance import jaccard


def run_knn_gridsearch_tanimoto(train_fps, test_fps):
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold = split_index)

    X = train_fps + test_fps
    y = train['value'].to_list() + test['value'].to_list()

    params = {
        'n_neighbors': [1, 3, 5, 7],
        'weights': ['distance'],
    }
    knn = KNeighborsClassifier(metric=jaccard)

    grid_search = GridSearchCV(knn, params, cv=pds, refit=False, scoring='average_precision', verbose=3)
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], weights=best_params['weights'], metric=jaccard)
    knn.fit(train_fps, train['value'])

    train_preds = knn.predict_proba(train_fps)[:, 1]
    train_metrics = get_hi_metrics(train, train_preds)

    test_preds = knn.predict_proba(test_fps)[:, 1]
    test_metrics = get_hi_metrics(test, test_preds)
    return train_metrics, test_metrics


In [9]:
train_mols = [Chem.MolFromSmiles(x) for x in train['smiles']]
train_morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in train_mols]

test_mols = [Chem.MolFromSmiles(x) for x in test['smiles']]
test_morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in test_mols]

train_metrics, test_metrics = run_knn_gridsearch_tanimoto(train_morgan_fps, test_morgan_fps)
print(train_metrics)
print(test_metrics)



Fitting 1 folds for each of 4 candidates, totalling 4 fits
[CV 1/1] END ...n_neighbors=1, weights=distance;, score=0.209 total time=56.5min
[CV 1/1] END ...n_neighbors=3, weights=distance;, score=0.409 total time=57.0min
[CV 1/1] END ...n_neighbors=5, weights=distance;, score=0.462 total time=57.2min
[CV 1/1] END ...n_neighbors=7, weights=distance;, score=0.474 total time=57.4min


KeyboardInterrupt: 