In [18]:
import deepchem as dc
import pandas as pd
import numpy as np
from rdkit import Chem

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,KFold

from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef

In [19]:
data = pd.read_csv('../../data/refined_gabaa.csv')

# Feature extraction & Data splitting

In [20]:
featurizer = dc.feat.CircularFingerprint(size=1024,radius=4)
X = featurizer.featurize(data['smiles'])
Y = data['label']
dataset = dc.data.DiskDataset.from_numpy(X=X,y=Y,ids=data['smiles'])

In [21]:
splitter = dc.splits.ScaffoldSplitter()
train_dataset,test_dataset = splitter.train_test_split(dataset=dataset,frac_train=0.7,seed=500)

# GridSearchCV

In [22]:
scoring = {
          'F1':make_scorer(f1_score),
          'AUC':make_scorer(roc_auc_score),
           'BA':make_scorer(balanced_accuracy_score),
           'MCC':make_scorer(matthews_corrcoef)
}

rf_param_grid = {'n_estimators':[150,200,250,300,350,400,450,500],
              'max_depth':[20,30,40,50,60,70,80,90,100,110],
            'max_features':["auto", "sqrt", "log2"]}
    
    
rf_classifier = RandomForestClassifier()
cv = KFold(n_splits=5, shuffle=True, random_state=500)

rf_gs = GridSearchCV(
                    rf_classifier,
                   rf_param_grid,
                 scoring = scoring,
                  cv = cv,
                 n_jobs = -1,
               refit = 'F1',
           return_train_score = True)

rf_gs_fit = rf_gs.fit(train_dataset.X, train_dataset.y.ravel())

In [23]:
val_F1 = rf_gs.best_score_
val_AUC = rf_gs.cv_results_['mean_test_AUC'][rf_gs.best_index_]
val_BA = rf_gs.cv_results_['mean_test_BA'][rf_gs.best_index_]
val_MCC = rf_gs.cv_results_['mean_test_MCC'][rf_gs.best_index_]

# GridSearchCV_Result

In [24]:
print('Best parameters: ', rf_gs.best_params_)
print('Best score (F1): ', rf_gs.best_score_)
print('AUC: ', rf_gs.cv_results_['mean_test_AUC'][rf_gs.best_index_])
print('BA: ', rf_gs.cv_results_['mean_test_BA'][rf_gs.best_index_])
print('MCC: ',rf_gs.cv_results_['mean_test_MCC'][rf_gs.best_index_])

Best parameters:  {'max_depth': 40, 'max_features': 'sqrt', 'n_estimators': 450}
Best score (F1):  0.8155497499438749
AUC:  0.8607233343420584
BA:  0.8607233343420584
MCC:  0.7398846330927293


In [25]:
rf_model = rf_gs_fit.best_estimator_
rf_model

RandomForestClassifier(max_depth=40, max_features='sqrt', n_estimators=450)

# Evaluate model

In [26]:
y_test_pred_proba = rf_model.predict_proba(test_dataset.X)
y_test_pred = rf_model.predict(test_dataset.X)

In [27]:
tn, fp, fn, tp = confusion_matrix(test_dataset.y, y_test_pred).ravel()
print('TN:', tn)
print('FP:', fp)
print('FN:', fn)
print('TP:', tp)

TN: 158
FP: 20
FN: 166
TP: 229


In [28]:
rf_test_pred_list = []
for test_score in y_test_pred_proba:
    test_score = test_score[1]
    rf_test_pred_list.append(test_score)

In [29]:
rf_test_pred_array = np.array(rf_test_pred_list)

In [30]:
test_F1 = f1_score(test_dataset.y,np.round(rf_test_pred_array))
test_AUC = roc_auc_score(test_dataset.y,rf_test_pred_list)
test_BA =  balanced_accuracy_score(test_dataset.y,np.round(rf_test_pred_array))
test_MCC = matthews_corrcoef(test_dataset.y,np.round(rf_test_pred_array))

# Finall result

In [31]:
rf_performance_dataset = {
    'F1':[val_F1,test_F1],
    'AUC':[val_AUC,test_AUC],
    'BA':[val_BA,test_BA],
    'MCC':[val_MCC,test_MCC],
}

In [32]:
rf_performance = pd.DataFrame(rf_performance_dataset,index=['val','test'])
rf_performance

Unnamed: 0,F1,AUC,BA,MCC
val,0.81555,0.860723,0.860723,0.739885
test,0.71118,0.86376,0.733694,0.436328
