In [1]:
#Load the Models:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC


#Load the Metrics:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import json
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
tqdm.pandas()

with open("params.json", 'r') as f: 
    params = json.load(f)

random_seed = 42

In [2]:
def generate_fingerprint(smiles,radius,bits):
    try:
        mol=Chem.MolFromSmiles(smiles)
        fp=AllChem.GetMorganFingerprintAsBitVect(mol,radius,bits)
        return(np.array(fp))
    except:
        print(f'{smiles} failed in RDkit')
        return (np.nan)

In [4]:
datasets=[i for i in os.listdir('data_cleaned') if i[-4:]=='.csv']

data_map={
    'HIV.csv': {'target':'HIV_active','structure':'smiles'},
    'bace.csv':{'target':'active','structure':'mol'},
    'tox21.csv':{'target':'NR-AhR','structure':'smiles'},
    'clintox.csv':{'target':'CT_TOX','structure':'smiles'},
    'sol_del.csv':{'target':'binned_sol','structure':'smiles'},
    'deepchem_Lipophilicity.csv':{'target':'drug_like','structure':'smiles'}   
}

df = pd.read_csv(os.path.join( "data_cleaned",datasets[0]))

radius=2
bits=1024
df=pd.read_csv('data_cleaned/HIV.csv')

X=[generate_fingerprint(mol,2,1024) for mol in tqdm(df['smiles'])]
y=df['HIV_active'].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)



grid = GridSearchCV(eval("SVC")(random_state = random_seed), params["models"]["SVC"]["param_grid"], refit = True, verbose = 3)
grid.fit(X_train, y_train)


100%|██████████| 41127/41127 [00:34<00:00, 1203.32it/s]


Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV 1/5] END ...........C=0.0001, kernel=linear;, score=0.965 total time=  35.2s
[CV 2/5] END ...........C=0.0001, kernel=linear;, score=0.965 total time=  35.9s
[CV 3/5] END ...........C=0.0001, kernel=linear;, score=0.965 total time=  34.6s
[CV 4/5] END ...........C=0.0001, kernel=linear;, score=0.965 total time=  34.3s
[CV 5/5] END ...........C=0.0001, kernel=linear;, score=0.965 total time=  33.7s
[CV 1/5] END ............C=0.001, kernel=linear;, score=0.965 total time=  39.7s
[CV 2/5] END ............C=0.001, kernel=linear;, score=0.965 total time=  39.6s
[CV 3/5] END ............C=0.001, kernel=linear;, score=0.965 total time=  41.5s
[CV 4/5] END ............C=0.001, kernel=linear;, score=0.965 total time=  45.2s
[CV 5/5] END ............C=0.001, kernel=linear;, score=0.965 total time=  42.4s
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.969 total time=  53.5s
[CV 2/5] END ..............C=0.1, kernel=linear

In [9]:
clf.fit()

TypeError: fit() missing 2 required positional arguments: 'X' and 'y'