In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import PredefinedSplit, GridSearchCV

In [2]:
import sys
sys.path.append('../../../../code')

from metrics import get_hi_metrics

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/steshin/miniconda3/envs/lohi_benchmark/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Load split 1 of the HIV "hi" benchmark; these notebook-level frames are what
# the grid-search cell below works with.
train, test = (
    pd.read_csv('../../../../data/hi/hiv/train_1.csv'),
    pd.read_csv('../../../../data/hi/hiv/test_1.csv'),
)

# Bare last expression -> rich (truncated) display of the training frame.
train

Unnamed: 0.1,Unnamed: 0,smiles,value
0,4,O=S(=O)(O)CCS(=O)(=O)O,0
1,21,CC(C)CCS(=O)(=O)O,0
2,90,O=S(=O)(O)CCO,0
3,106,O=S(=O)(O)CO,0
4,117,O=S(=O)(O)CCCCBr,0
...,...,...,...
15691,40932,COC(=O)c1cc2cc3c(c(O)c2c(=O)o1)OC1(Oc2c(O)c4c(...,0
15692,40973,CCCCC1C(OCOc2ccccc2)COC(=O)N1C(C)c1ccccc1,0
15693,41024,CC(C)=CC1CC(C)C2CCC(C)C3C(=O)C(O)=C(C)C(=O)C123,0
15694,41026,CCOC(=O)C12C(=O)C(C)CCC1C(C)CC2C=C(C)C,0


In [6]:
from scipy.spatial.distance import jaccard


def run_knn_gridsearch_tanimoto(train_fps, test_fps, train_df=None, test_df=None, param_grid=None):
    """Grid-search a Jaccard-distance kNN classifier and score the best one on the test set.

    Every candidate is fitted on the training fingerprints and scored (average
    precision) on the test fingerprints via a single predefined fold. The best
    setting is then refitted on the training data only and evaluated with
    ``get_hi_metrics``.

    NOTE(review): the fold used for model selection *is* the test set, so the
    returned test metrics are selected on the same data they are measured on.
    Confirm this is the intended protocol.

    Parameters
    ----------
    train_fps, test_fps : sequence
        Fingerprints for the training / test molecules, one per row of the
        corresponding frame and in the same order.
    train_df, test_df : pandas.DataFrame, optional
        Frames providing the ``'value'`` label column; ``test_df`` is also
        handed to ``get_hi_metrics``. When omitted, the notebook-level ``train``
        and ``test`` frames are used, which keeps the original two-argument
        call working. Pass them explicitly whenever those globals may have been
        rebound (the final-evaluation loop does exactly that).
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to the ``n_neighbors`` /
        ``weights`` grid this notebook has always used.

    Returns
    -------
    The value returned by ``get_hi_metrics(test_df, test_preds)``.

    Raises
    ------
    ValueError
        If the number of fingerprints does not match the number of rows in the
        corresponding frame (i.e. fingerprints and labels come from different
        splits).
    """
    # Fall back to the notebook globals for backward compatibility.
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # list() makes `+` below a concatenation for any sequence type (for numpy
    # arrays it would otherwise be element-wise addition).
    train_fps = list(train_fps)
    test_fps = list(test_fps)
    if len(train_fps) != len(train_df) or len(test_fps) != len(test_df):
        raise ValueError(
            'Fingerprints and label frames are misaligned: '
            f'{len(train_fps)} train fps vs {len(train_df)} train rows, '
            f'{len(test_fps)} test fps vs {len(test_df)} test rows'
        )

    if param_grid is None:
        param_grid = {
            'n_neighbors': [3, 5, 7, 10, 12, 15],
            'weights': ['uniform', 'distance'],
        }

    # -1 keeps a sample out of every validation fold (always training);
    # 0 puts it into the single validation fold.
    split_index = [-1] * len(train_fps) + [0] * len(test_fps)
    pds = PredefinedSplit(test_fold=split_index)

    X = train_fps + test_fps
    y = train_df['value'].to_list() + test_df['value'].to_list()

    # NOTE(review): a Python callable metric is documented by scikit-learn as
    # less efficient than a built-in metric name; the fits logged in this
    # notebook took ~18 min each. Consider metric='jaccard' on boolean arrays
    # after verifying it yields the same scores.
    knn = KNeighborsClassifier(metric=jaccard)

    # refit=False: a refit would train on X, which contains the test rows.
    grid_search = GridSearchCV(knn, param_grid, cv=pds, refit=False, scoring='average_precision', verbose=3)
    grid_search.fit(X, y)

    # Refit the winning configuration on the training data only.
    best_params = grid_search.best_params_
    knn = KNeighborsClassifier(metric=jaccard, **best_params)
    knn.fit(train_fps, train_df['value'])

    # Column 1 = probability of the second class; assumes binary 0/1 labels.
    test_preds = knn.predict_proba(test_fps)[:, 1]
    test_metrics = get_hi_metrics(test_df, test_preds)
    return test_metrics


In [5]:
# Parse the SMILES of both splits into RDKit molecules.
train_mols = list(map(Chem.MolFromSmiles, train['smiles']))
test_mols = list(map(Chem.MolFromSmiles, test['smiles']))

# MACCS-key fingerprints, one per molecule, in frame order.
train_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, train_mols))
test_maccs_fps = list(map(MACCSkeys.GenMACCSKeys, test_mols))

# Tune the kNN on these fingerprints and report the resulting test metrics.
test_metrics = run_knn_gridsearch_tanimoto(train_maccs_fps, test_maccs_fps)
print(test_metrics)



Fitting 1 folds for each of 12 candidates, totalling 12 fits
[CV 1/1] END ....n_neighbors=3, weights=uniform;, score=0.099 total time=18.3min
[CV 1/1] END ...n_neighbors=3, weights=distance;, score=0.118 total time=18.3min
[CV 1/1] END ....n_neighbors=5, weights=uniform;, score=0.099 total time=18.7min
[CV 1/1] END ...n_neighbors=5, weights=distance;, score=0.121 total time=18.7min
[CV 1/1] END ....n_neighbors=7, weights=uniform;, score=0.094 total time=18.4min
[CV 1/1] END ...n_neighbors=7, weights=distance;, score=0.116 total time=18.6min
[CV 1/1] END ...n_neighbors=10, weights=uniform;, score=0.092 total time=18.6min
[CV 1/1] END ..n_neighbors=10, weights=distance;, score=0.115 total time=18.5min
[CV 1/1] END ...n_neighbors=12, weights=uniform;, score=0.091 total time=18.7min
[CV 1/1] END ..n_neighbors=12, weights=distance;, score=0.113 total time=18.6min
[CV 1/1] END ...n_neighbors=15, weights=uniform;, score=0.086 total time=18.6min


# Final Evaluation

In [4]:
def _maccs_fingerprints(smiles):
    """Return one MACCS-key fingerprint per SMILES string, in input order.

    Raises ValueError naming the offending string when RDKit cannot parse it
    (``MolFromSmiles`` returns ``None`` in that case, which would otherwise
    surface as an opaque argument error inside ``GenMACCSKeys``).
    """
    fps = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f'RDKit could not parse SMILES: {smi!r}')
        fps.append(MACCSkeys.GenMACCSKeys(mol))
    return fps


def fit_predict(train, test, n_neighbors=7, weights='distance', n_jobs=4):
    """Fit a Jaccard-distance kNN on MACCS keys and predict the test split.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``'smiles'`` column; ``train`` must also have the
        ``'value'`` label column.
    n_neighbors, weights, n_jobs
        Forwarded to ``KNeighborsClassifier``. The defaults reproduce the
        previously hard-coded configuration.
        NOTE(review): the grid-search log kept in this notebook for split 1
        scores highest at n_neighbors=5, weights='distance' (0.121 vs 0.116
        for 7) and ends before the last candidate was reported; confirm that
        7 is the intended setting.

    Returns
    -------
    (train_result, test_result) : tuple of pandas.DataFrame
        Copies of the inputs with an added ``'preds'`` column. For the test
        frame it holds the predicted probability of the second class (assumes
        binary 0/1 labels). For the training frame it is a copy of the true
        ``'value'`` labels, not model output.

    Raises
    ------
    ValueError
        If any SMILES string cannot be parsed by RDKit.
    """
    train_maccs_fps = _maccs_fingerprints(train['smiles'])
    test_maccs_fps = _maccs_fingerprints(test['smiles'])

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=jaccard, n_jobs=n_jobs)
    knn.fit(train_maccs_fps, train['value'])

    # Training "predictions" are the labels themselves; the model is not
    # queried on its own training set.
    train_result = train.copy()
    train_result['preds'] = train_result['value']

    test_result = test.copy()
    print('Predicting...')
    test_result['preds'] = knn.predict_proba(test_maccs_fps)[:, 1]

    return train_result, test_result


In [7]:
# Run the fixed kNN configuration on each of the three HIV "hi" splits and
# store the per-split predictions. Note that this rebinds the notebook-level
# `train` / `test` frames on every iteration.
for i in range(1, 4):
    train, test = (
        pd.read_csv(f'../../../../data/hi/hiv/train_{i}.csv'),
        pd.read_csv(f'../../../../data/hi/hiv/test_{i}.csv'),
    )
    train_preds, test_preds = fit_predict(train, test)

    # Persist both frames (index column included, as before).
    train_preds.to_csv(f'../../../../predictions/hi/hiv/knn_maccs/train_{i}.csv')
    test_preds.to_csv(f'../../../../predictions/hi/hiv/knn_maccs/test_{i}.csv')




Predicting...




Predicting...




Predicting...
