In [1]:
import deepchem as dc
import pandas as pd
import numpy as np
from rdkit import Chem

import sklearn
import xgboost as xgb
from sklearn.model_selection import GridSearchCV,KFold

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (D:\anaconda\envs\PI3K\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
data = pd.read_csv('../../data/refined_gabaa.csv')

# Feature extraction & Data splitting

In [3]:
featurizer = dc.feat.MACCSKeysFingerprint()
X = featurizer.featurize(data['smiles'])
Y = data['label']
dataset = dc.data.DiskDataset.from_numpy(X=X,y=Y,ids=data['smiles'])

In [4]:
splitter = dc.splits.ScaffoldSplitter()
train_dataset,test_dataset = splitter.train_test_split(dataset=dataset,frac_train=0.7,seed=200)

# GridSearchCV

In [5]:
scoring = {'AUC':make_scorer(roc_auc_score),
           'F1':make_scorer(f1_score),
           'ACC':make_scorer(accuracy_score),
           'Recall':make_scorer(recall_score),
           'Precision':make_scorer(precision_score)}

param_grid = {'n_estimators':[200],
                  'max_depth':[3,5,7,9],
                  'colsample_bytree':[0.5,0.7,0.9],
                 'learning_rate':[0.01,0.05, 0.1,0.15,0.2]
                 }

xgb_classifier = xgb.XGBClassifier()

cv = KFold(n_splits=5, shuffle=True, random_state=200)

gs = GridSearchCV(
                   xgb_classifier,
                   param_grid,
                 scoring = scoring,
                  cv = cv,
                 n_jobs = -1,
               refit = 'AUC',
               return_train_score = True)

gs_fit = gs.fit(train_dataset.X, train_dataset.y.ravel())





In [7]:
val_AUC = gs.best_score_
val_F1 = gs.cv_results_['mean_test_F1'][gs.best_index_]
val_ACC = gs.cv_results_['mean_test_ACC'][gs.best_index_]
val_Recall = gs.cv_results_['mean_test_Recall'][gs.best_index_]
val_Precision = gs.cv_results_['mean_test_Precision'][gs.best_index_]

# GridSearchCV_Result

In [8]:
print('Best parameters: ', gs.best_params_)
print('Best score (AUC): ', gs.best_score_)
print('F1: ', gs.cv_results_['mean_test_F1'][gs.best_index_])
print('ACC: ', gs.cv_results_['mean_test_ACC'][gs.best_index_])
print('Recall: ',gs.cv_results_['mean_test_Recall'][gs.best_index_])
print('Precision: ', gs.cv_results_['mean_test_Precision'][gs.best_index_])

Best parameters:  {'colsample_bytree': 0.5, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best score (AUC):  0.8886828332316705
F1:  0.855545846504142
ACC:  0.8928384833557248
Recall:  0.8653871151591825
Precision:  0.8503860994807748


In [9]:
xgb_model = gs_fit.best_estimator_
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

# Evaluate model

In [10]:
y_test_pred_proba = xgb_model.predict_proba(test_dataset.X)

In [11]:
test_pred_list = []
for test_score in y_test_pred_proba:
    test_score = test_score[1]
    test_pred_list.append(test_score)

In [12]:
test_pred_array = np.array(test_pred_list)

In [13]:
test_AUC = roc_auc_score(test_dataset.y,test_pred_list)
test_F1 = f1_score(test_dataset.y,np.round(test_pred_array))
test_ACC = accuracy_score(test_dataset.y,np.round(test_pred_array))
test_Recall = recall_score(test_dataset.y,np.round(test_pred_array))
test_Precision = precision_score(test_dataset.y,np.round(test_pred_array))

# Finall result

In [14]:
performance_dataset = {
    'AUC':[val_AUC,test_AUC],
    'F1_score':[val_F1,test_F1],
    'ACC':[val_ACC,test_ACC],
    'Recall':[val_Recall,test_Recall],
    'Precision':[val_Precision,test_Precision],
}

In [15]:
performance = pd.DataFrame(performance_dataset,index=['val','test'])
performance

Unnamed: 0,AUC,F1_score,ACC,Recall,Precision
val,0.888683,0.855546,0.892838,0.865387,0.850386
test,0.883884,0.810345,0.734406,0.703242,0.955932
