In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from rdkit.Chem import PandasTools, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn import pipeline as pl
from sklearn import preprocessing as pp
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import graphviz 


### Dataset initiation
Initiate and prepare the dataset with descriptors to make them usable for the ML-model. 

In [2]:
dataset_molecules = pd.read_csv('tested_molecules_v2.csv')
dataset_molecules_test = pd.read_csv('tested_molecules-1.csv')
PandasTools.AddMoleculeColumnToFrame(dataset_molecules, smilesCol='SMILES')
PandasTools.AddMoleculeColumnToFrame(dataset_molecules_test, smilesCol='SMILES')
dataset_molecules.head()


Unnamed: 0,SMILES,ALDH1_inhibition,ROMol
0,[NH3+]CCSSCC[NH3+],0,<rdkit.Chem.rdchem.Mol object at 0x0000024BCEA...
1,[NH3+]CCC[NH2+]CCCC[NH2+]CCC[NH3+],0,<rdkit.Chem.rdchem.Mol object at 0x0000024BCEA...
2,[NH3+]CCCCCCCCCC[NH3+],0,<rdkit.Chem.rdchem.Mol object at 0x0000024BCEA...
3,[NH3+]CCSSCC[NH3+],0,<rdkit.Chem.rdchem.Mol object at 0x0000024BCEA...
4,ClCC[NH+](CCCl)CCCl,0,<rdkit.Chem.rdchem.Mol object at 0x0000024BCEA...


In [3]:

descriptors_labels = [n[0] for n in Descriptors._descList[:]]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors_labels)
rdkit_desc = [calc.CalcDescriptors(m) for m in dataset_molecules["ROMol"]]
rdkit_desc_test = [calc.CalcDescriptors(m) for m in dataset_molecules_test["ROMol"]]
dataset_descriptors = pd.DataFrame(rdkit_desc, columns=[descriptors_labels])
dataset_descriptors_test = pd.DataFrame(rdkit_desc_test, columns=[descriptors_labels])
X_train = dataset_descriptors
y_train = dataset_molecules['ALDH1_inhibition']
X_test = dataset_descriptors_test
y_test = dataset_molecules['ALDH1_inhibition']

dataset_descriptors.head()

KeyError: 'ALDH1_inhibition'

### Introducing a Pipeline with minmax scaler and Desciscion tree.

In [None]:
scaler = pp.MinMaxScaler()
pca = PCA()
dtree = tree.DecisionTreeClassifier(max_features=None, max_depth=None, max_leaf_nodes=None)

pipeline = pl.Pipeline(steps=[("sc", scaler), ("pca", pca), ("dtreeCLF", dtree)])
cross_score = cross_val_score(pipeline, X_train, y_train, cv=10)
print("Cross score:") 
print(cross_score)
pipeline.fit(X_train, y_train)
print("score:")
print(pipeline.score(X_test, y_test))

Cross score:
[0.92 1.   0.98 0.98 0.98 0.92 0.98 0.96 0.96 0.92]
score:
0.964


### Creating a physical tree

In [None]:
dot_data = tree.export_graphviz(dtree, out_file=None, feature_names=dataset_descriptors.columns, class_names=['1', '0'])
graph = graphviz.Source(dot_data) 
graph.render("descision_tree") 

'descision_tree.pdf'

### Analysing the model

In [None]:
y_true = y_test
y_pred = pipeline.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
specificity = tn / (tn+fp)
sensitivity = tp/(tp+fp) 
accuracy = (tp+tn)/(tp+tn+fp+fn)
print(" |+\t|-\t|\n+|{:.0f}\t|{:.0f}\t|\n-|{:.0f}\t|{:.0f}\t|".format(tp, fp, fn, tn))
print("Specificiteit: {:.2f} Sensitivity: {:.2f} Accuracy {:.2f}".format(specificity, sensitivity, accuracy))

 |+	|-	|
+|123	|8	|
-|10	|359	|
Specificiteit: 0.98 Sensitivity: 0.94 Accuracy 0.96
