In [1]:
import deepchem as dc
import pandas as pd
import numpy as np
from rdkit import Chem

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,KFold

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (D:\anaconda\envs\PI3K\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
data = pd.read_csv('../data/refined_gabaa.csv')

In [3]:
data

Unnamed: 0,smiles,label
0,Cc1ccc(-c2nc(C#N)c(S(=O)(=O)N(C)CCO)o2)cc1,1
1,CCOC(=O)c1cn2c(n1)CCC2,1
2,N#Cc1ccc(CSc2nc(=O)cc(O)[nH]2)cc1,1
3,Fc1ccc(-c2nnnn2-c2ccc(Br)cn2)cc1,1
4,CCc1ccc(NC(=O)c2c(C(=O)Nc3ccc(C(=O)OC)cc3)sc3n...,1
...,...,...
1649,CC(=O)Oc1c(C(C)C)cccc1C(C)C,0
1650,COCCN(CCOC)c1nn2c(-c3ccc(O)cc3)nnc2c2ccccc12,0
1651,CCCCCn1ncc2c(N)c(C(=O)NCC)cnc21,0
1652,NC(=O)N1Cc2c(-c3noc(C4CC4)n3)ncn2-c2ccccc21,0


# Feature extraction & Data splitting

In [4]:
featurizer = dc.feat.CircularFingerprint(size=1024,radius=4)
X = featurizer.featurize(data['smiles'])
Y = data['label']
dataset = dc.data.DiskDataset.from_numpy(X=X,y=Y,ids=data['smiles'])

In [6]:
splitter = dc.splits.ScaffoldSplitter()
train_dataset,test_dataset = splitter.train_test_split(dataset=dataset,frac_train=0.7,seed=200)

# GridSearchCV

In [8]:
scoring = {'AUC':make_scorer(roc_auc_score),
           'F1':make_scorer(f1_score),
           'ACC':make_scorer(accuracy_score),
           'Recall':make_scorer(recall_score),
           'Precision':make_scorer(precision_score)}

rf_param_grid = {'n_estimators':[150,200,250,300,350,400,450,500],
              'max_depth':[20,30,40,50,60,70,80,90,100,110],
            'max_features':["auto", "sqrt", "log2"]}
    
    
rf_classifier = RandomForestClassifier()
cv = KFold(n_splits=5, shuffle=True, random_state=200)

rf_gs = GridSearchCV(
                    rf_classifier,
                   rf_param_grid,
                 scoring = scoring,
                  cv = cv,
                 n_jobs = -1,
               refit = 'AUC',
           return_train_score = True)

rf_gs_fit = rf_gs.fit(train_dataset.X, train_dataset.y.ravel())

In [9]:
val_AUC = rf_gs.best_score_
val_F1 = rf_gs.cv_results_['mean_test_F1'][rf_gs.best_index_]
val_ACC = rf_gs.cv_results_['mean_test_ACC'][rf_gs.best_index_]
val_Recall = rf_gs.cv_results_['mean_test_Recall'][rf_gs.best_index_]
val_Precision = rf_gs.cv_results_['mean_test_Precision'][rf_gs.best_index_]

# GridSearchCV_Result

In [11]:
print('Best parameters: ', rf_gs.best_params_)
print('Best score (AUC): ', rf_gs.best_score_)
print('F1: ', rf_gs.cv_results_['mean_test_F1'][rf_gs.best_index_])
print('ACC: ', rf_gs.cv_results_['mean_test_ACC'][rf_gs.best_index_])
print('Recall: ',rf_gs.cv_results_['mean_test_Recall'][rf_gs.best_index_])
print('Precision: ', rf_gs.cv_results_['mean_test_Precision'][rf_gs.best_index_])

Best parameters:  {'max_depth': 100, 'max_features': 'log2', 'n_estimators': 450}
Best score (AUC):  0.8935055922980878
F1:  0.8635798659030776
ACC:  0.9006232273473653
Recall:  0.8609617870594987
Precision:  0.8701324698263173


In [12]:
rf_model = rf_gs_fit.best_estimator_
rf_model

RandomForestClassifier(max_depth=100, max_features='log2', n_estimators=450)

# Evaluate model

In [13]:
y_test_pred_proba = rf_model.predict_proba(test_dataset.X)

In [14]:
rf_test_pred_list = []
for test_score in y_test_pred_proba:
    test_score = test_score[1]
    rf_test_pred_list.append(test_score)

In [15]:
rf_test_pred_array = np.array(rf_test_pred_list)

In [16]:
test_AUC = roc_auc_score(test_dataset.y,rf_test_pred_list)
test_F1 = f1_score(test_dataset.y,np.round(rf_test_pred_array))
test_ACC = accuracy_score(test_dataset.y,np.round(rf_test_pred_array))
test_Recall = recall_score(test_dataset.y,np.round(rf_test_pred_array))
test_Precision = precision_score(test_dataset.y,np.round(rf_test_pred_array))

# Finall result

In [17]:
rf_performance_dataset = {
    'AUC':[val_AUC,test_AUC],
    'F1_score':[val_F1,test_F1],
    'ACC':[val_ACC,test_ACC],
    'Recall':[val_Recall,test_Recall],
    'Precision':[val_Precision,test_Precision],
}

In [18]:
rf_performance = pd.DataFrame(rf_performance_dataset,index=['val','test'])
rf_performance

Unnamed: 0,AUC,F1_score,ACC,Recall,Precision
val,0.893506,0.86358,0.900623,0.860962,0.870132
test,0.872026,0.752624,0.668008,0.625935,0.943609
