# molecular descriptors for ALDH1 inhibitors:

Different descriptors can be determined on the given molecules to find a (causal) relation between the molecule and the ability to inhibit ALDH1. 

Molecular descriptors are for instance: 
-   molecular mass
-   nr carbon atoms
-   nr hydrogen atoms  
-   nr of bonds
-   nr of branches
-   nr double bindings
-   nr triple bindings
-   cyclic structures
-   Aromaticity (indicated by lower letters)
    -   aromatic nitrogen
-   (tetra hedral) chirality
- nr of rings (e.g. cubane)

### rdkit has automatic implemented descriptors and Fingerprints:

This is used now for the generation of descriptors. also a couple of fingerprint variables can be included. 


In [23]:
import numpy as np
import pandas as pd
from tkinter import filedialog as fd
from rdkit import Chem, DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, Descriptors, AllChem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
IPythonConsole.ipython_useSVG=True



In [24]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib

In [25]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
    
        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [26]:
filename1 = fd.askopenfilename()
AHDL1Inhibitors1 = pd.read_csv(filename1 ,header = None)
filename2 = fd.askopenfilename()
AHDL1Inhibitors2 = pd.read_csv(filename2 ,header = None)
AHDL1Inhibitors = pd.concat([AHDL1Inhibitors1,AHDL1Inhibitors2])
AHDL1Inhibitors = AHDL1Inhibitors.reset_index(drop=True)
print(AHDL1Inhibitors)

                                                      0                 1
0                                                SMILES  ALDH1_inhibition
1     COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...                 1
2                O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1                 1
3     Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...                 1
4                     CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1                 1
...                                                 ...               ...
1997                    C/C(=N\NC(=S)Nc1ccc(F)cc1)C1CC1                 1
1998                     COC(=O)c1c(NC(C)=O)sc2c1CCCCC2                 1
1999                            O=C(CCl)NC1CCCc2ccccc21                 1
2000    COc1ccc(-n2c(SCC(=O)N3CCCCC3C)nnc2-c2cccnc2)cc1                 1
2001  COc1ccc(NC(=O)C2CCC(N3C(=O)C4C5C=CC(C5)C4C3=O)...                 1

[2002 rows x 2 columns]


In [27]:
allTestedMolecules = AHDL1Inhibitors[0] # firts 3 for testing, needs to change for all molecules (remove[0:4])
MolList = allTestedMolecules.values.tolist()
with open('AllTestedMols.txt', 'w') as fp:
    for mol in MolList:
        # write each item on a new line
        fp.write("%s\n" % mol)
    print('Done')


Done


In [28]:
suppl = Chem.SmilesMolSupplier('AllTestedMols.txt')
mols = [m for m in suppl]
# len(mols)
print(mols)



[<rdkit.Chem.rdchem.Mol object at 0x0000015E34CD20B0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2970>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2A50>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2BA0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2C80>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2D60>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2E40>, <rdkit.Chem.rdchem.Mol object at 0x0000015E34CD2F20>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A501BA0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A501E40>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A501EB0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A501C80>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A5019E0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A518120>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A518BA0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A5182E0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A5184A0>, <rdkit.Chem.rdchem.Mol object at 0x0000015E3A518660>, <rdkit.Chem.rdchem.Mol obje

[19:35:00] SMILES Parse Error: syntax error while parsing: SMILES
[19:35:00] SMILES Parse Error: Failed parsing SMILES 'SMILES' for input: 'SMILES'
[19:35:00] ERROR: Smiles parse error on line 1001
[19:35:00] ERROR: Cannot create molecule from : 'SMILES'


In [29]:
allDescrs = [getMolDescriptors(m) for m in mols]
allDescrsDf = pd.DataFrame(allDescrs)
allDescrsDf.head()

Traceback (most recent call last):
  File "C:\Users\20212807\AppData\Local\Temp\ipykernel_1860\407407914.py", line 10, in getMolDescriptors
    val = fn(mol)
  File "C:\Users\20212807\.conda\envs\group_assignment\lib\site-packages\rdkit\Chem\EState\EState.py", line 91, in MaxAbsEStateIndex
    return max(abs(x) for x in EStateIndices(mol, force))
  File "C:\Users\20212807\.conda\envs\group_assignment\lib\site-packages\rdkit\Chem\EState\EState.py", line 47, in EStateIndices
    nAtoms = mol.GetNumAtoms()
AttributeError: 'NoneType' object has no attribute 'GetNumAtoms'
Traceback (most recent call last):
  File "C:\Users\20212807\AppData\Local\Temp\ipykernel_1860\407407914.py", line 10, in getMolDescriptors
    val = fn(mol)
  File "C:\Users\20212807\.conda\envs\group_assignment\lib\site-packages\rdkit\Chem\EState\EState.py", line 77, in MaxEStateIndex
    return max(EStateIndices(mol, force))
  File "C:\Users\20212807\.conda\envs\group_assignment\lib\site-packages\rdkit\Chem\EState\EStat

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.083531,13.083531,0.001173,-0.68314,0.520365,463.542,434.31,463.233188,178.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,12.170097,12.170097,0.066966,-0.066966,0.498564,378.457,360.313,378.115047,136.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.905837,10.905837,0.016881,-0.016881,0.382043,477.589,444.325,477.260865,184.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,11.562446,11.562446,0.270607,-0.454447,0.795948,330.609,317.505,328.981818,96.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.108866,12.108866,0.086947,-3.251317,0.687618,419.553,402.417,419.043204,140.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### Generate Fingerprints

In [30]:
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [31]:
fingerp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
x = rdkit_numpy_convert(fingerp)
print(x)

ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(NoneType, int)
did not match C++ signature:
    GetMorganFingerprintAsBitVect(class RDKit::ROMol mol, unsigned int radius, unsigned int nBits=2048, class boost::python::api::object invariants=[], class boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, class boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False)

In [None]:
fpgen = AllChem.GetRDKitFPGenerator()
fps = [fpgen.GetFingerprint(x) for x in mols]


In [None]:
fpgen = AllChem.GetMorganGenerator(radius=2)
fp1 = fpgen.GetSparseCountFingerprint(mols[1])
# check imbalance dataset:
y = AHDL1Inhibitors[1][1:].astype(int)
sum(y)/len(y)
# mild imbalance

Use data for Machine Learning: 
- Random forest
- SVM

In [None]:
# split data in train and test sets. Set the testset size to 20%
seed = 13
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20, random_state=seed)
# create folds for cross- validation
cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=seed)

In [None]:
# print out ids of folds
for i, (train_index, test_index) in enumerate(cv.split(xTrain, yTrain)):
    print("\nFold_" + str(i+1))
    print("TRAIN:", train_index)
    print("TEST:", test_index)

In [None]:
# Scale inputs
scale = StandardScaler().fit(xTrain)
xTrain = scale.transform(xTrain)


In [None]:
# save data for future use
joblib.dump(scale, "Fingerprints.pkl", compress=3)

In [None]:
# create grid search dictionary
param_grid = {"max_features": [xTrain.shape[1] // 10, xTrain.shape[1] // 7, 
                               xTrain.shape[1] // 5, xTrain.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [None]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [None]:
# run model building
m.fit(xTrain, yTrain)

In [None]:
m.best_params_

In [None]:
m.best_score_

In [None]:
m.cv_results_


In [None]:
joblib.dump(m, "RFmodelMorganFingerprint.pkl", compress=3)

Load model (also to check if it works)

In [None]:
scale = joblib.load("Fingerprints.pkl")
# scale descriptors of the test set compounds
xTest = scale.transform(xTest)
# predict logBB class
predRF = m.predict(xTest)
predRF

In [None]:
accuracy_score(yTest, predRF)

In [None]:
# if the model includes several ones like RF models or consensus models (or for probabilistic models)
# we can calculate consistency of predictions amongs those models and use it for estimation of applicability domain
pred_prob = m.predict_proba(xTest)
pred_prob

In [None]:
# setup threshold
threshold = 0.8
# calc maximum predicted probability for each row (compound) and compare to the threshold
da = np.amax(pred_prob, axis=1) > threshold
da

In [None]:
# calc statistics
accuracy_score(np.asarray(yTest)[da], predRF[da])

In [None]:
# calc coverage
sum(da) / len(da)

SVM approach

In [None]:
# create grid search dictionary
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}
# setup model building
svm = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=2, cv=cv, verbose=1)

In [None]:
# run model building
svm.fit(xTrain, yTrain)

In [None]:
svm.best_params_

In [None]:
svm.best_score_

In [None]:
# save model
joblib.dump(svm, "SVMmorganFingerprint.pkl", compress=3)

In [None]:
# predict logBB for the test set compounds
pred_svm = svm.predict(xTest)
pred_svm


In [None]:
# calc statistics
print("Accuracy = ", accuracy_score(yTest, pred_svm))
print("MCC = ", matthews_corrcoef(yTest, pred_svm))
print("Kappa = ", cohen_kappa_score(yTest, pred_svm))

In [None]:
# estimate applicability domain and calc stat
pred_prob = svm.predict_proba(xTest)
pred_prob

In [None]:
da = np.amax(pred_prob, axis=1) > threshold

In [None]:
print("Accuracy = ", accuracy_score(np.asarray(yTest)[da], pred_svm[da]))
print("MCC = ", matthews_corrcoef(np.asarray(yTest)[da], pred_svm[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(yTest)[da], pred_svm[da]))
print("Coverage = ", sum(da) / len(da))

Add the descriptors to the model

In [None]:
xNew = np.concatenate((x,allDescrsDf), axis=1)
xNew.shape

Performing PCA on data

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
df = pd.DataFrame(xNew)
df_std = StandardScaler().fit_transform(df)
df_std =  pd.DataFrame(df_std)

pca = PCA()
principalComponents = pca.fit_transform(df_std)
principalDF = pd.DataFrame(data=principalComponents)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
i = 0
sumVar = 0
while sumVar<0.9:
    sumVar = cumulative_variance_ratio[i]
    i +=1

xNew = df_std.iloc[:,:i]


In [None]:
xNtr, xNts, yNtr, yNts = train_test_split(xNew, y, test_size=0.20, random_state=seed)
scale = StandardScaler().fit(xNtr)
xNtr = scale.transform(xNtr)

In [None]:
# create grid search dictionary
param_grid = {"max_features": [xNtr.shape[1] // 10, xNtr.shape[1] // 7, xNtr.shape[1] // 5, xNtr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [None]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [None]:
# run model building
m.fit(xNtr, yNtr)

In [None]:
m.best_score_

In [None]:
# scale descriptors of the test set compounds
xNts = scale.transform(xNts)
# predict
pred = m.predict(xNts)
pred

In [None]:
# calc statistics
print("Accuracy = ", accuracy_score(yNts, pred))
print("MCC = ", matthews_corrcoef(yNts, pred))
print("Kappa = ", cohen_kappa_score(yNts, pred))

In [None]:
# estimate applicability domain and calc stat
pred_prob = m.predict_proba(xNts)
da = np.amax(pred_prob, axis=1) > threshold

print("Accuracy = ", accuracy_score(np.asarray(yNts)[da], pred[da]))
print("MCC = ", matthews_corrcoef(np.asarray(yNts)[da], pred[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(yNts)[da], pred[da]))
print("Coverage = ", sum(da) / len(da))

In [None]:
# rebuild RF model manually using best parameters to be able to extract additional information from the model
rf = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"], 
                           max_features=m.best_params_["max_features"],
                           random_state=seed)
rf.fit(xNtr, yNtr)

In [None]:
imp = rf.feature_importances_
imp

In [None]:
indices = np.argsort(imp)[::-1]

print("Feature ranking:")

# print top 10 features
for i in range(10):
    print("%d. feature %d (%f)" % (i + 1, indices[i], imp[indices[i]]))

features 1-2028 are different Morgan fingerprints