# molecular descriptors for ALDH1 inhibitors:

Different descriptors can be determined on the given molecules to find a (causal) relation between the molecule and the ability to inhibit ALDH1. 

Molecular descriptors are for instance: 
-   molecular mass
-   nr carbon atoms
-   nr hydrogen atoms  
-   nr of bonds
-   nr of branches
-   nr double bindings
-   nr triple bindings
-   cyclic structures
-   Aromaticity (indicated by lower letters)
    -   aromatic nitrogen
-   (tetra hedral) chirality
- nr of rings (e.g. cubane)

### rdkit has automatic implemented descriptors and Fingerprints:

This is used now for the generation of descriptors. also a couple of fingerprint variables can be included. 


In [129]:
import numpy as np
import pandas as pd
from tkinter import filedialog as fd
from rdkit import Chem, DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, Descriptors, AllChem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
IPythonConsole.ipython_useSVG=True



In [130]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib

In [131]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
    
        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [132]:
filename = fd.askopenfilename()
AHDL1Inhibitors = pd.read_csv(filename ,header = None)
print(AHDL1Inhibitors)

                                                      0                 1
0                                                SMILES  ALDH1_inhibition
1     COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...                 1
2                O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1                 1
3     Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...                 1
4                     CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1                 1
...                                                 ...               ...
996              COc1ccc(N2C(=O)CC([NH2+]C3CC3)C2=O)cc1                 0
997                        CCNc1oc(COc2cccc(C)c2)nc1C#N                 0
998                           NC(=O)Cn1cnc(-c2ccccc2)c1                 0
999   Cc1cc(NC(=O)CSc2nc3c(c(=O)n(C)c(=O)n3C)n2C(C)C...                 0
1000          O=C(Cn1nnc2c(cnn2-c2ccccc2)c1=O)NCc1cccs1                 0

[1001 rows x 2 columns]


In [133]:
allTestedMolecules = AHDL1Inhibitors[0] # firts 3 for testing, needs to change for all molecules (remove[0:4])
MolList = allTestedMolecules.values.tolist()
with open('AllTestedMols.txt', 'w') as fp:
    for mol in MolList:
        # write each item on a new line
        fp.write("%s\n" % mol)
    print('Done')


Done


In [134]:
suppl = Chem.SmilesMolSupplier('AllTestedMols.txt')
mols = [m for m in suppl]
# len(mols)
print(mols)



[<rdkit.Chem.rdchem.Mol object at 0x0000020D886C19E0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D886C17B0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D886C1510>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8868A200>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8868A270>, <rdkit.Chem.rdchem.Mol object at 0x0000020DFF686120>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862ADD0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862AEB0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862ACF0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862AB30>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862AC80>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862A350>, <rdkit.Chem.rdchem.Mol object at 0x0000020D8862A660>, <rdkit.Chem.rdchem.Mol object at 0x0000020D88789DD0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D80527660>, <rdkit.Chem.rdchem.Mol object at 0x0000020D885B9EB0>, <rdkit.Chem.rdchem.Mol object at 0x0000020D88628F90>, <rdkit.Chem.rdchem.Mol object at 0x0000020D885BD0B0>, <rdkit.Chem.rdchem.Mol obje



In [135]:
allDescrs = [getMolDescriptors(m) for m in mols]
allDescrsDf = pd.DataFrame(allDescrs)
allDescrsDf.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.083531,13.083531,0.001173,-0.68314,0.520365,463.542,434.31,463.233188,178,0,...,0,0,0,0,1,0,0,0,0,0
1,12.170097,12.170097,0.066966,-0.066966,0.498564,378.457,360.313,378.115047,136,0,...,1,0,0,0,0,0,0,0,0,0
2,10.905837,10.905837,0.016881,-0.016881,0.382043,477.589,444.325,477.260865,184,0,...,0,0,0,0,1,0,0,0,0,0
3,11.562446,11.562446,0.270607,-0.454447,0.795948,330.609,317.505,328.981818,96,0,...,0,0,0,0,0,0,0,0,0,0
4,12.108866,12.108866,0.086947,-3.251317,0.687618,419.553,402.417,419.043204,140,0,...,0,1,0,0,0,1,0,1,0,0


### Generate Fingerprints

In [136]:
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [137]:
fingerp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
x = rdkit_numpy_convert(fingerp)
x

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [138]:
fpgen = AllChem.GetRDKitFPGenerator()
fps = [fpgen.GetFingerprint(x) for x in mols]


In [139]:
fpgen = AllChem.GetMorganGenerator(radius=2)
fp1 = fpgen.GetSparseCountFingerprint(mols[1])
# check imbalance dataset:
y = AHDL1Inhibitors[1][1:].astype(int)
sum(y)/len(y)
# mild imbalance

0.3

Use data for Machine Learning: 
- Random forest
- SVM

In [140]:
# split data in train and test sets. Set the testset size to 20%
seed = 13
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20, random_state=seed)
# create folds for cross- validation
cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=seed)

In [141]:
# print out ids of folds
for i, (train_index, test_index) in enumerate(cv.split(xTrain, yTrain)):
    print("\nFold_" + str(i+1))
    print("TRAIN:", train_index)
    print("TEST:", test_index)


Fold_1
TRAIN: [  0   1   2   3   4   6   8   9  10  11  12  13  14  15  16  17  19  20
  22  23  24  25  26  27  29  30  31  32  33  34  35  36  38  39  40  41
  42  43  44  45  46  48  50  51  52  53  54  55  56  58  59  60  61  62
  63  64  65  66  68  70  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  94  95  96  97  99 101 102 103 104 105
 106 107 108 109 110 111 115 116 117 118 119 120 122 123 125 126 129 131
 134 135 136 137 138 139 140 141 142 143 144 145 146 148 149 150 151 153
 154 155 156 157 158 159 160 161 162 163 165 166 167 168 169 171 172 173
 174 176 177 178 179 181 182 183 185 186 187 189 190 191 192 194 196 197
 198 199 200 201 202 203 205 206 207 208 210 211 212 213 214 215 216 218
 219 221 222 223 224 225 226 228 229 231 233 234 236 237 238 239 240 243
 244 245 247 248 251 252 253 254 255 256 258 259 260 261 262 263 264 265
 269 272 273 274 276 278 279 281 282 283 285 288 289 290 291 292 295 297
 299 300 301 303 304 305 306 307 308

In [142]:
# Scale inputs
scale = StandardScaler().fit(xTrain)
xTrain = scale.transform(xTrain)


In [143]:
# save data for future use
joblib.dump(scale, "Fingerprints.pkl", compress=3)

['Fingerprints.pkl']

In [144]:
# create grid search dictionary
param_grid = {"max_features": [xTrain.shape[1] // 10, xTrain.shape[1] // 7, 
                               xTrain.shape[1] // 5, xTrain.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [145]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [146]:
# run model building
m.fit(xTrain, yTrain)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=13, shuffle=True),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [204, 292, 409, 682],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [147]:
m.best_params_

{'max_features': 204, 'n_estimators': 250}

In [148]:
m.best_score_

0.7137499999999999

In [149]:
m.cv_results_


{'mean_fit_time': array([ 1.05973458,  2.6511025 ,  5.26903687,  1.43228593,  3.51309724,
         7.10380721,  1.89025984,  4.69095268,  9.5289124 ,  2.98476839,
         7.38912883, 14.74378772]),
 'std_fit_time': array([0.02075714, 0.06927321, 0.10368585, 0.06456037, 0.07594137,
        0.20823501, 0.05208668, 0.16573223, 0.35442252, 0.10384411,
        0.27876322, 0.76746608]),
 'mean_score_time': array([0.00937276, 0.02811799, 0.04686384, 0.00624952, 0.01903195,
        0.04425797, 0.01249619, 0.02527361, 0.04061489, 0.00624828,
        0.02459049, 0.04376621]),
 'std_score_time': array([7.65282557e-03, 6.24899873e-03, 5.30983387e-07, 7.65407162e-03,
        6.13058150e-03, 6.56909110e-03, 6.24809266e-03, 7.86933506e-03,
        7.65350691e-03, 7.65255304e-03, 7.36026249e-03, 6.26129270e-03]),
 'param_max_features': masked_array(data=[204, 204, 204, 292, 292, 292, 409, 409, 409, 682, 682,
                    682],
              mask=[False, False, False, False, False, False, False

In [150]:
joblib.dump(m, "RFmodelMorganFingerprint.pkl", compress=3)

['RFmodelMorganFingerprint.pkl']

Load model (also to check if it works)

In [151]:
scale = joblib.load("Fingerprints.pkl")
# scale descriptors of the test set compounds
xTest = scale.transform(xTest)
# predict logBB class
predRF = m.predict(xTest)
predRF

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0])

In [152]:
accuracy_score(yTest, predRF)

0.735

In [153]:
# if the model includes several ones like RF models or consensus models (or for probabilistic models)
# we can calculate consistency of predictions amongs those models and use it for estimation of applicability domain
pred_prob = m.predict_proba(xTest)
pred_prob

array([[0.588, 0.412],
       [0.688, 0.312],
       [0.788, 0.212],
       [0.76 , 0.24 ],
       [0.824, 0.176],
       [0.464, 0.536],
       [0.832, 0.168],
       [0.764, 0.236],
       [0.896, 0.104],
       [0.776, 0.224],
       [0.676, 0.324],
       [0.844, 0.156],
       [0.724, 0.276],
       [0.4  , 0.6  ],
       [0.428, 0.572],
       [0.496, 0.504],
       [0.796, 0.204],
       [0.632, 0.368],
       [0.764, 0.236],
       [0.748, 0.252],
       [0.624, 0.376],
       [0.92 , 0.08 ],
       [0.724, 0.276],
       [0.396, 0.604],
       [0.704, 0.296],
       [0.892, 0.108],
       [0.7  , 0.3  ],
       [0.676, 0.324],
       [0.884, 0.116],
       [0.656, 0.344],
       [0.784, 0.216],
       [0.688, 0.312],
       [0.696, 0.304],
       [0.952, 0.048],
       [0.464, 0.536],
       [0.9  , 0.1  ],
       [0.804, 0.196],
       [0.66 , 0.34 ],
       [0.876, 0.124],
       [0.644, 0.356],
       [0.54 , 0.46 ],
       [0.764, 0.236],
       [0.312, 0.688],
       [0.9

In [154]:
# setup threshold
threshold = 0.8
# calc maximum predicted probability for each row (compound) and compare to the threshold
da = np.amax(pred_prob, axis=1) > threshold
da

array([False, False, False, False,  True, False,  True, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False, False, False,  True, False,  True,
        True, False,  True, False, False, False, False,  True, False,
        True, False,  True,  True, False,  True, False, False,  True,
       False, False,  True, False, False, False, False, False, False,
        True, False, False,  True, False,  True, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False,  True, False, False,  True, False,  True, False,  True,
        True, False, False,  True, False,  True,  True, False, False,
        True, False, False, False, False,  True, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False,

In [155]:
# calc statistics
accuracy_score(np.asarray(yTest)[da], predRF[da])

0.8688524590163934

In [156]:
# calc coverage
sum(da) / len(da)

0.305

SVM approach

In [157]:
# create grid search dictionary
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}
# setup model building
svm = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=2, cv=cv, verbose=1)

In [158]:
# run model building
svm.fit(xTrain, yTrain)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=13, shuffle=True),
             estimator=SVC(probability=True), n_jobs=2,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]},
             verbose=1)

In [159]:
svm.best_params_

{'C': 1, 'gamma': 1e-06}

In [160]:
svm.best_score_

0.69375

In [161]:
# save model
joblib.dump(svm, "SVMmorganFingerprint.pkl", compress=3)

['SVMmorganFingerprint.pkl']

In [162]:
# predict logBB for the test set compounds
pred_svm = svm.predict(xTest)
pred_svm


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [163]:
# calc statistics
print("Accuracy = ", accuracy_score(yTest, pred_svm))
print("MCC = ", matthews_corrcoef(yTest, pred_svm))
print("Kappa = ", cohen_kappa_score(yTest, pred_svm))

Accuracy =  0.725
MCC =  0.0
Kappa =  0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [164]:
# estimate applicability domain and calc stat
pred_prob = svm.predict_proba(xTest)
pred_prob

array([[0.57083948, 0.42916052],
       [0.72284486, 0.27715514],
       [0.72995417, 0.27004583],
       [0.70496736, 0.29503264],
       [0.59535567, 0.40464433],
       [0.53421343, 0.46578657],
       [0.88940693, 0.11059307],
       [0.57943846, 0.42056154],
       [0.81173248, 0.18826752],
       [0.7335703 , 0.2664297 ],
       [0.60825988, 0.39174012],
       [0.79334542, 0.20665458],
       [0.69285544, 0.30714456],
       [0.56137254, 0.43862746],
       [0.66560918, 0.33439082],
       [0.50853042, 0.49146958],
       [0.58919936, 0.41080064],
       [0.68491521, 0.31508479],
       [0.773372  , 0.226628  ],
       [0.71890599, 0.28109401],
       [0.71492432, 0.28507568],
       [0.8126749 , 0.1873251 ],
       [0.66657068, 0.33342932],
       [0.55502751, 0.44497249],
       [0.78833074, 0.21166926],
       [0.81273191, 0.18726809],
       [0.52719708, 0.47280292],
       [0.64635562, 0.35364438],
       [0.83439487, 0.16560513],
       [0.64399142, 0.35600858],
       [0.

In [165]:
da = np.amax(pred_prob, axis=1) > threshold

In [166]:
print("Accuracy = ", accuracy_score(np.asarray(yTest)[da], pred_svm[da]))
print("MCC = ", matthews_corrcoef(np.asarray(yTest)[da], pred_svm[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(yTest)[da], pred_svm[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.9142857142857143
MCC =  0.0
Kappa =  0.0
Coverage =  0.175


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Add the descriptors to the model

In [167]:
xNew = np.concatenate((x,allDescrsDf), axis=1)
xNew.shape

(1000, 2257)

Performing PCA on data

In [168]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [169]:
df = pd.DataFrame(xNew)
df_std = StandardScaler().fit_transform(df)
df_std =  pd.DataFrame(df_std)

pca = PCA()
principalComponents = pca.fit_transform(df_std)
principalDF = pd.DataFrame(data=principalComponents)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
i = 0
sumVar = 0
while sumVar<0.9:
    sumVar = cumulative_variance_ratio[i]
    i +=1

xNew = df_std.iloc[:,:i]


In [170]:
xNtr, xNts, yNtr, yNts = train_test_split(xNew, y, test_size=0.20, random_state=seed)
scale = StandardScaler().fit(xNtr)
xNtr = scale.transform(xNtr)

In [171]:
# create grid search dictionary
param_grid = {"max_features": [xNtr.shape[1] // 10, xNtr.shape[1] // 7, xNtr.shape[1] // 5, xNtr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [172]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [173]:
# run model building
m.fit(xNtr, yNtr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=13, shuffle=True),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [57, 82, 115, 192],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [174]:
m.best_score_

0.7

In [175]:
# scale descriptors of the test set compounds
xNts = scale.transform(xNts)
# predict
pred = m.predict(xNts)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [176]:
# calc statistics
print("Accuracy = ", accuracy_score(yNts, pred))
print("MCC = ", matthews_corrcoef(yNts, pred))
print("Kappa = ", cohen_kappa_score(yNts, pred))

Accuracy =  0.705
MCC =  0.06604131764144666
Kappa =  0.05144694533762062


In [177]:
# estimate applicability domain and calc stat
pred_prob = m.predict_proba(xNts)
da = np.amax(pred_prob, axis=1) > threshold

print("Accuracy = ", accuracy_score(np.asarray(yNts)[da], pred[da]))
print("MCC = ", matthews_corrcoef(np.asarray(yNts)[da], pred[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(yNts)[da], pred[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.7948717948717948
MCC =  0.0
Kappa =  0.0
Coverage =  0.39


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [178]:
# rebuild RF model manually using best parameters to be able to extract additional information from the model
rf = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"], 
                           max_features=m.best_params_["max_features"],
                           random_state=seed)
rf.fit(xNtr, yNtr)

RandomForestClassifier(max_features=57, n_estimators=250, random_state=13)

In [179]:
imp = rf.feature_importances_
imp

array([3.83605612e-04, 6.44033272e-03, 4.21833801e-03, 1.63338457e-03,
       1.97588409e-03, 7.75543524e-03, 6.80941213e-05, 2.15266562e-03,
       1.94343088e-03, 3.90899400e-04, 0.00000000e+00, 1.29717200e-03,
       4.18468818e-04, 3.01269657e-03, 4.90360082e-03, 1.63224215e-05,
       7.83873668e-04, 6.39489859e-05, 2.95171430e-03, 1.96760229e-03,
       3.79467927e-03, 1.93159286e-04, 8.26140158e-05, 0.00000000e+00,
       1.77460628e-03, 2.96146851e-03, 2.97247627e-03, 1.06526314e-03,
       4.90664118e-04, 6.01318553e-04, 1.64050008e-04, 6.85324961e-03,
       1.66878597e-03, 8.89272205e-04, 5.06891943e-03, 2.95521305e-04,
       6.05005397e-05, 1.26404361e-03, 3.87352024e-04, 0.00000000e+00,
       9.11214443e-04, 1.01551950e-02, 1.68184571e-03, 1.13802633e-03,
       0.00000000e+00, 4.16952802e-03, 8.67551714e-04, 0.00000000e+00,
       5.27096424e-04, 7.61447845e-03, 1.01103202e-02, 4.74342908e-05,
       5.75336442e-04, 1.57364119e-05, 1.19762698e-03, 3.03830477e-03,
      

In [180]:
indices = np.argsort(imp)[::-1]

print("Feature ranking:")

# print top 10 features
for i in range(10):
    print("%d. feature %d (%f)" % (i + 1, indices[i], imp[indices[i]]))

Feature ranking:
1. feature 378 (0.012374)
2. feature 242 (0.011482)
3. feature 80 (0.011252)
4. feature 314 (0.011198)
5. feature 41 (0.010155)
6. feature 50 (0.010110)
7. feature 249 (0.009614)
8. feature 474 (0.009301)
9. feature 526 (0.009165)
10. feature 389 (0.009164)


features 1-2028 are different Morgan fingerprints