In [2]:
import deepchem as dc
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV,KFold

from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Read the raw table from disk; later cells use its 'smiles' and 'class' columns.
data = pd.read_csv(filepath_or_buffer='../../data/stan_data.csv')

# Feature extraction & Data splitting

In [4]:
# Turn each SMILES entry into a MACCS-keys fingerprint and pair the resulting
# feature matrix with the 'class' column as the target.
featurizer = dc.feat.MACCSKeysFingerprint()
smiles_column = data['smiles']
label_column = data['class']
features = featurizer.featurize(smiles_column)
dataset = dc.data.NumpyDataset(X=features, y=label_column)

In [5]:
# Random 80/20 train/test partition; the fixed seed keeps the split repeatable.
splitter = dc.splits.RandomSplitter()
split_parts = splitter.train_test_split(
    dataset=dataset,
    frac_train=0.8,
    seed=100,
)
train_dataset, test_dataset = split_parts

# GridSearchCV

In [6]:
def auc_scorer(estimator, X, y):
    """Score a fitted binary classifier by ROC AUC on its positive-class probability.

    Written as a plain scorer callable ``(estimator, X, y) -> float`` rather
    than ``make_scorer(roc_auc_score, needs_proba=True)``: ``needs_proba`` was
    deprecated in scikit-learn 1.4 (scheduled for removal in 1.6), and its
    replacement ``response_method`` is not understood by older releases.
    A callable needs neither keyword.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : feature matrix to score on.
    y : true labels for ``X``.

    Returns
    -------
    float
        ROC AUC computed from column 1 of ``predict_proba(X)`` (the
        probability of the second entry of ``estimator.classes_``).
    """
    positive_proba = estimator.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)


# Metrics collected for every parameter combination; 'AUC' drives model selection.
scoring = {
    'AUC': auc_scorer,
    'ACC': make_scorer(accuracy_score),
    'PRE': make_scorer(precision_score),
    'REC': make_scorer(recall_score),
}

# 5 x 4 x 5 = 100 candidate settings, each evaluated with 5-fold CV.
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9],
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [4, 5, 6, 7, 8],
}

gbdt_classifier = GradientBoostingClassifier(random_state=42)

gs = GridSearchCV(
    gbdt_classifier,
    param_grid,
    scoring=scoring,
    cv=KFold(n_splits=5, shuffle=True, random_state=100),
    n_jobs=-1,
    refit='AUC',  # refit the best-AUC configuration on the full training set
    return_train_score=True,
)

# Labels are flattened to 1-D before fitting.
gs_fit = gs.fit(train_dataset.X, train_dataset.y.ravel())

In [8]:
# Cross-validated scores of the selected configuration: the refit metric
# (AUC) comes from best_score_, the others from the matching cv_results_ row.
best_idx = gs.best_index_
cv_means = gs.cv_results_

val_AUC = gs.best_score_
val_ACC = cv_means['mean_test_ACC'][best_idx]
val_PRE = cv_means['mean_test_PRE'][best_idx]
val_REC = cv_means['mean_test_REC'][best_idx]

# GridSearchCV_Result

In [9]:
# Report the winning parameters and its cross-validated metrics.
best_row = gs.best_index_
cv_table = gs.cv_results_

print('Best parameters: ', gs.best_params_)
print('Best score (AUC): ', gs.best_score_)

# (label, cv_results_ column) pairs for the secondary metrics.
secondary_metrics = (
    ('ACC: ', 'mean_test_ACC'),
    ('PRE: ', 'mean_test_PRE'),
    ('REC: ', 'mean_test_REC'),
)
for tag, column in secondary_metrics:
    print(tag, cv_table[column][best_row])

Best parameters:  {'learning_rate': 0.7, 'max_depth': 6, 'n_estimators': 100}
Best score (AUC):  0.8695587207942476
ACC:  0.778
PRE:  0.7664037526148622
REC:  0.7860210591138943


In [10]:
# Keep a handle on the estimator that the grid search refit with the best
# parameters (gs_fit is the value returned by gs.fit(...)).
gxb_model = gs_fit.best_estimator_
# Bare expression so the notebook displays the estimator's repr.
gxb_model

GradientBoostingClassifier(learning_rate=0.7, max_depth=6, random_state=42)

# Evaluate model

In [11]:
# Score the held-out split once for hard labels and once for class probabilities.
X_test = test_dataset.X
y_test_pred = gxb_model.predict(X_test)
y_test_pred_proba = gxb_model.predict_proba(X_test)

In [12]:
# Flatten the 2x2 confusion matrix into its four cells and print each one.
test_confusion = confusion_matrix(test_dataset.y, y_test_pred)
tn, fp, fn, tp = test_confusion.ravel()

for cell_label, cell_count in (('TN:', tn), ('FP:', fp), ('FN:', fn), ('TP:', tp)):
    print(cell_label, cell_count)

TN: 106
FP: 28
FN: 19
TP: 98


In [13]:
# Keep only the second entry (index 1) of every predict_proba row.
test_pred_list = [row_proba[1] for row_proba in y_test_pred_proba]

In [14]:
# Array form of the per-sample scores, needed for vectorised rounding below.
test_pred_array = np.asarray(test_pred_list)

In [15]:
# Test-set metrics: AUC uses the raw scores, the remaining metrics use the
# scores rounded to the nearest integer as hard labels.
y_true_test = test_dataset.y
rounded_labels = np.round(test_pred_array)

test_AUC = roc_auc_score(y_true_test, test_pred_list)
test_ACC = accuracy_score(y_true_test, rounded_labels)
test_PRE = precision_score(y_true_test, rounded_labels)
test_REC = recall_score(y_true_test, rounded_labels)

# Final result

In [16]:
# One column per metric; each column holds [validation value, test value].
performance_dataset = dict(
    AUC=[val_AUC, test_AUC],
    ACC=[val_ACC, test_ACC],
    PRE=[val_PRE, test_PRE],
    REC=[val_REC, test_REC],
)

In [17]:
# Summary table: rows are the evaluation stage, columns are the metrics.
performance = pd.DataFrame(data=performance_dataset, index=['val', 'test'])
performance

Unnamed: 0,AUC,ACC,PRE,REC
val,0.869559,0.778,0.766404,0.786021
test,0.896989,0.812749,0.777778,0.837607
