In [1]:
import pandas as pd
import numpy as np
import utils as ut

import matplotlib.pyplot as plt
import seaborn as sns


from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, AllChem, Draw

from numpy.random import seed
from numpy.random import randn
from scipy.stats import mannwhitneyu

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
#from sklearn.metrics import confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest, StackingClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.inspection import permutation_importance

#from lazypredict.Supervised import LazyClassifier

import pickle

from chembl_webresource_client.new_client import new_client

In [2]:
lipinski = pd.read_csv('lox_basic_lipinski.csv')
lipinski.head()

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,target,mol_wt,mol_logp,num_H_don,num_H_acpt
0,0,CHEMBL177598,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(Cl)cc1O,400.0,active,383.831,4.4943,4.0,4.0
1,1,CHEMBL175216,Cc1ccc(C(=O)Nc2ccc(CCc3ccc(O)c(O)c3)cc2)c(O)c1,400.0,active,363.413,4.14932,4.0,4.0
2,2,CHEMBL52,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,4900.0,intermediate,302.37,3.5664,4.0,4.0
3,3,CHEMBL176728,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(-c2ccccc...,25000.0,inactive,425.484,5.5079,4.0,4.0
4,4,CHEMBL172429,COc1cc(CCc2ccc(NC(=O)c3ccc(C)cc3O)cc2)ccc1O,25000.0,inactive,377.44,4.45232,3.0,4.0


In [3]:
lipinski.columns

Index(['Unnamed: 0', 'molecule_chembl_id', 'canonical_smiles',
       'standard_value', 'target', 'mol_wt', 'mol_logp', 'num_H_don',
       'num_H_acpt'],
      dtype='object')

In [11]:
#How to delete the column name "Unnamed: 0"
data_1 = lipinski.iloc[:,1:]

In [12]:
data_1.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,target,mol_wt,mol_logp,num_H_don,num_H_acpt
0,CHEMBL177598,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(Cl)cc1O,400.0,active,383.831,4.4943,4.0,4.0
1,CHEMBL175216,Cc1ccc(C(=O)Nc2ccc(CCc3ccc(O)c(O)c3)cc2)c(O)c1,400.0,active,363.413,4.14932,4.0,4.0
2,CHEMBL52,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,4900.0,intermediate,302.37,3.5664,4.0,4.0
3,CHEMBL176728,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(-c2ccccc...,25000.0,inactive,425.484,5.5079,4.0,4.0
4,CHEMBL172429,COc1cc(CCc2ccc(NC(=O)c3ccc(C)cc3O)cc2)ccc1O,25000.0,inactive,377.44,4.45232,3.0,4.0


In [15]:
data_1.tail()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,target,mol_wt,mol_logp,num_H_don,num_H_acpt
503,CHEMBL1390514,FC(F)(F)c1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,334.366,5.1834,0.0,3.0
504,CHEMBL1520238,Brc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,345.265,4.9271,0.0,3.0
505,CHEMBL1352020,CCc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,294.423,4.727,0.0,3.0
506,CHEMBL4869861,COc1cccc(CNc2ccc(S(=O)(=O)Nc3nc4c(ccc5ccccc54)...,100000.0,inactive,491.594,5.5766,3.0,7.0
507,CHEMBL5089414,CCCCCCCCOC(=O)NS(=O)(=O)Nc1cc(-c2cc3ccccc3[nH]...,33.0,active,473.595,5.5869,3.0,5.0


In [None]:
#get the features using smiles id of ligands 

In [57]:
padel = pd.read_csv('descriptors_output_1.csv')
padel.head()

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL52,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL177598,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL172429,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL367410,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL175216,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
padel.columns

Index(['Name', 'PubchemFP0', 'PubchemFP1', 'PubchemFP2', 'PubchemFP3',
       'PubchemFP4', 'PubchemFP5', 'PubchemFP6', 'PubchemFP7', 'PubchemFP8',
       ...
       'PubchemFP871', 'PubchemFP872', 'PubchemFP873', 'PubchemFP874',
       'PubchemFP875', 'PubchemFP876', 'PubchemFP877', 'PubchemFP878',
       'PubchemFP879', 'PubchemFP880'],
      dtype='object', length=882)

In [61]:
padel.columns = padel.columns.str.replace('Name', 'molecule_chembl_id')

In [62]:
padel

Unnamed: 0,molecule_chembl_id,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL52,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL177598,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL172429,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL367410,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL175216,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,CHEMBL1520238,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
504,CHEMBL1352020,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
505,CHEMBL5089414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
506,CHEMBL1270704,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
data_1

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,target,mol_wt,mol_logp,num_H_don,num_H_acpt
0,CHEMBL177598,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(Cl)cc1O,400.0,active,383.831,4.49430,4.0,4.0
1,CHEMBL175216,Cc1ccc(C(=O)Nc2ccc(CCc3ccc(O)c(O)c3)cc2)c(O)c1,400.0,active,363.413,4.14932,4.0,4.0
2,CHEMBL52,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,4900.0,intermediate,302.370,3.56640,4.0,4.0
3,CHEMBL176728,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(-c2ccccc...,25000.0,inactive,425.484,5.50790,4.0,4.0
4,CHEMBL172429,COc1cc(CCc2ccc(NC(=O)c3ccc(C)cc3O)cc2)ccc1O,25000.0,inactive,377.440,4.45232,3.0,4.0
...,...,...,...,...,...,...,...,...
503,CHEMBL1390514,FC(F)(F)c1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,334.366,5.18340,0.0,3.0
504,CHEMBL1520238,Brc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,345.265,4.92710,0.0,3.0
505,CHEMBL1352020,CCc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,294.423,4.72700,0.0,3.0
506,CHEMBL4869861,COc1cccc(CNc2ccc(S(=O)(=O)Nc3nc4c(ccc5ccccc54)...,100000.0,inactive,491.594,5.57660,3.0,7.0


In [64]:
new_data = pd.merge(padel, data_1, on='molecule_chembl_id')

In [65]:
new_data

Unnamed: 0,molecule_chembl_id,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP878,PubchemFP879,PubchemFP880,canonical_smiles,standard_value,target,mol_wt,mol_logp,num_H_don,num_H_acpt
0,CHEMBL52,1,1,1,0,0,0,0,0,0,...,0,0,0,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,4900.0,intermediate,302.370,3.56640,4.0,4.0
1,CHEMBL177598,1,1,1,0,0,0,0,0,0,...,0,0,0,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccc(Cl)cc1O,400.0,active,383.831,4.49430,4.0,4.0
2,CHEMBL172429,1,1,1,0,0,0,0,0,0,...,0,0,0,COc1cc(CCc2ccc(NC(=O)c3ccc(C)cc3O)cc2)ccc1O,25000.0,inactive,377.440,4.45232,3.0,4.0
3,CHEMBL367410,1,1,1,0,0,0,0,0,0,...,0,0,0,O=C(Nc1ccc(CCc2ccc(O)c(O)c2)cc1)c1ccccc1O,300.0,active,349.386,3.84090,4.0,4.0
4,CHEMBL175216,1,1,1,0,0,0,0,0,0,...,0,0,0,Cc1ccc(C(=O)Nc2ccc(CCc3ccc(O)c(O)c3)cc2)c(O)c1,400.0,active,363.413,4.14932,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,CHEMBL1520238,1,1,0,0,0,0,0,0,0,...,0,0,0,Brc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,345.265,4.92710,0.0,3.0
504,CHEMBL1352020,1,1,1,0,0,0,0,0,0,...,0,0,0,CCc1ccc(CSc2nccn2-c2ccccc2)cc1,50000.0,inactive,294.423,4.72700,0.0,3.0
505,CHEMBL5089414,1,1,1,0,0,0,0,0,0,...,0,0,0,CCCCCCCCOC(=O)NS(=O)(=O)Nc1cc(-c2cc3ccccc3[nH]...,33.0,active,473.595,5.58690,3.0,5.0
506,CHEMBL1270704,1,1,1,0,0,0,0,0,0,...,0,0,0,O=C(OCC#CCSc1nnc(-c2cccc3ccccc23)o1)c1ccc2cccc...,65.0,active,450.519,5.99550,0.0,6.0


In [67]:
new_data_1 = new_data.drop(['canonical_smiles','standard_value'], axis=1)

In [68]:
#Now we combined the features of ligands of finger prints 
new_data_1

Unnamed: 0,molecule_chembl_id,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,target,mol_wt,mol_logp,num_H_don,num_H_acpt
0,CHEMBL52,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,intermediate,302.370,3.56640,4.0,4.0
1,CHEMBL177598,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,active,383.831,4.49430,4.0,4.0
2,CHEMBL172429,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,inactive,377.440,4.45232,3.0,4.0
3,CHEMBL367410,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,active,349.386,3.84090,4.0,4.0
4,CHEMBL175216,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,active,363.413,4.14932,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,CHEMBL1520238,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,inactive,345.265,4.92710,0.0,3.0
504,CHEMBL1352020,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,inactive,294.423,4.72700,0.0,3.0
505,CHEMBL5089414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,active,473.595,5.58690,3.0,5.0
506,CHEMBL1270704,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,active,450.519,5.99550,0.0,6.0


In [69]:
new_data_1.to_csv('new_descriptors_lip_padel.csv', index=False)

In [74]:
#Data visuaization 
data = new_data_1.drop(['target'],axis=1)

In [75]:
data

Unnamed: 0,molecule_chembl_id,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,mol_wt,mol_logp,num_H_don,num_H_acpt
0,CHEMBL52,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,302.370,3.56640,4.0,4.0
1,CHEMBL177598,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,383.831,4.49430,4.0,4.0
2,CHEMBL172429,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,377.440,4.45232,3.0,4.0
3,CHEMBL367410,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,349.386,3.84090,4.0,4.0
4,CHEMBL175216,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,363.413,4.14932,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,CHEMBL1520238,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,345.265,4.92710,0.0,3.0
504,CHEMBL1352020,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,294.423,4.72700,0.0,3.0
505,CHEMBL5089414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,473.595,5.58690,3.0,5.0
506,CHEMBL1270704,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,450.519,5.99550,0.0,6.0


In [76]:
target = new_data_1['target']

In [None]:
target

#Questions : 
I would like to see the dataset of ligands clustered in the space where the target labels are 3 classes!!

In [22]:
padel['Name'][0]

'CHEMBL52'

In [24]:
padel['Name'][0] == data_1['molecule_chembl_id'][2]

True

In [30]:
chembl_id = [i for i in padel['Name']]
chembl_id

['CHEMBL52',
 'CHEMBL177598',
 'CHEMBL172429',
 'CHEMBL367410',
 'CHEMBL175216',
 'CHEMBL177098',
 'CHEMBL176923',
 'CHEMBL176728',
 'CHEMBL176689',
 'CHEMBL175315',
 'CHEMBL173802',
 'CHEMBL177171',
 'CHEMBL177100',
 'CHEMBL177055',
 'CHEMBL88952',
 'CHEMBL176634',
 'CHEMBL92212',
 'CHEMBL92359',
 'CHEMBL413350',
 'CHEMBL89283',
 'CHEMBL173368',
 'CHEMBL89275',
 'CHEMBL7660',
 'CHEMBL316139',
 'CHEMBL91531',
 'CHEMBL314941',
 'CHEMBL79109',
 'CHEMBL81848',
 'CHEMBL79402',
 'CHEMBL274642',
 'CHEMBL81583',
 'CHEMBL421710',
 'CHEMBL186007',
 'CHEMBL420775',
 'CHEMBL57284',
 'CHEMBL185651',
 'CHEMBL79275',
 'CHEMBL184410',
 'CHEMBL157082',
 'CHEMBL187292',
 'CHEMBL186858',
 'CHEMBL111507',
 'CHEMBL17815',
 'CHEMBL362761',
 'CHEMBL186098',
 'CHEMBL185893',
 'CHEMBL150098',
 'CHEMBL186099',
 'CHEMBL187472',
 'CHEMBL148503',
 'CHEMBL185828',
 'CHEMBL185753',
 'CHEMBL184398',
 'CHEMBL184450',
 'CHEMBL204926',
 'CHEMBL202034',
 'CHEMBL201368',
 'CHEMBL370583',
 'CHEMBL201969',
 'CHEMBL204917',

In [40]:
chembl_id[0] == data_1['molecule_chembl_id'][2]

True

In [49]:
for i,j in enumerate(chembl_id):
    if chembl_id[i] == data_1['molecule_chembl_id'][i]:
        print('yes')
    else:
        print('no')
    

no
no
no
no
no
no
no
no
no
yes
no
yes
no
yes
no
yes
no
no
no
no
no
no
no
no
yes
yes
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
yes
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
yes
no
yes
yes
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
yes
no
no
yes
no
no
no
yes
yes
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
no
no
yes
no
yes
no
no
no
no
no
yes
no
no
no
no
no
no
yes
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
yes
no
no
yes
yes
no
no
no
no
no
no
no
no
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
no
yes
yes
yes
no
no
no
yes
yes
yes
no
no
no
no
no
no
yes
no
yes
yes
no
no
no
yes
no
no
no
no
yes
no
no
no
no
yes
no
no
no
no
no
no
no
no
no
no
yes
no
yes
no
no
yes
yes
yes
no
no
yes
no
yes
no
no
yes
yes
no
no
no
no
no
no
no
yes
no
yes
yes
yes


In [47]:
data_1['molecule_chembl_id'][0]

'CHEMBL177598'

In [48]:
chembl_id[0]

'CHEMBL52'