In [1]:
import pandas as pd
import numpy as py
import seaborn as sb

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors 
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole


In [3]:
# Location of the SDF export that PandasTools reads into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this exact file exists.
sdf_path = r"C:\Users\sneha\Desktop\Comp Chem\vscode\HMG_CoA_ChEMBL_Database.sdf"
dataset = PandasTools.LoadSDF(sdf_path)

# Preview the first five records.
dataset.head(n=5)

[18:01:33] skipping block at line 182674: 'BEGIN BOND'
[18:01:33] skipping block at line 182708: 'BEGIN BOND'
[18:01:33] skipping block at line 198422: 'BEGIN BOND'
[18:01:33] skipping block at line 198450: 'BEGIN BOND'


Unnamed: 0,Target_Name,Target_Organism,Total_Molweight,Smiles,ID,ROMol
0,HMG-CoA reductase,Homo sapiens,292.139,Brc(c1nccnc1cc1)c1NC1=NCCN1,Compound 1,<rdkit.Chem.rdchem.Mol object at 0x000001EDC9D...
1,HMG-CoA reductase,Homo sapiens,292.139,Brc(c1nccnc1cc1)c1NC1=NCCN1,Compound 2,<rdkit.Chem.rdchem.Mol object at 0x000001EDD2A...
2,HMG-CoA reductase,Homo sapiens,157.01,Brc1ccccc1,Compound 3,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
3,HMG-CoA reductase,Homo sapiens,157.01,Brc1ccccc1,Compound 4,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
4,HMG-CoA reductase,Homo sapiens,368.522,C(/C=C/c1ccccc1)N(CC1)CCN1C(c1ccccc1)c1ccccc1,Compound 5,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...


In [5]:
# Keep only the first row for every distinct 'Smiles' string.
# NOTE(review): the comparison is on the raw text of the column, and the later
# cells in this notebook keep operating on `dataset`, not on `dataset_new`.
dataset_new = dataset.drop_duplicates(subset='Smiles', keep='first')

# Number of rows that survive the de-duplication.
dataset_new.shape[0]

1282

In [6]:
# Render the de-duplicated frame; the notebook output elides the middle rows.
dataset_new

Unnamed: 0,Target_Name,Target_Organism,Total_Molweight,Smiles,ID,ROMol
0,HMG-CoA reductase,Homo sapiens,292.139,Brc(c1nccnc1cc1)c1NC1=NCCN1,Compound 1,<rdkit.Chem.rdchem.Mol object at 0x000001EDC9D...
2,HMG-CoA reductase,Homo sapiens,157.01,Brc1ccccc1,Compound 3,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
4,HMG-CoA reductase,Homo sapiens,368.522,C(/C=C/c1ccccc1)N(CC1)CCN1C(c1ccccc1)c1ccccc1,Compound 5,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
6,HMG-CoA reductase,Homo sapiens,277.494,C(C(C1CCCCC1)C1CCCCC1)C1NCCCC1,Compound 7,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
8,HMG-CoA reductase,Homo sapiens,234.385,C(CC1)C[C@H]2N1C[C@@H]1[C@H](CCCC3)N3C[C@H]2C1,Compound 9,<rdkit.Chem.rdchem.Mol object at 0x000001EDD19...
...,...,...,...,...,...,...
2420,HMG-CoA reductase,Homo sapiens,332.496,c(cc1)cc2c1sc(SSc1nc(cccc3)c3s1)n2,Compound 2421,<rdkit.Chem.rdchem.Mol object at 0x000001EDD5B...
2422,HMG-CoA reductase,Homo sapiens,260.339,c(cc1)ccc1Nc(cc1)ccc1Nc1ccccc1,Compound 2423,<rdkit.Chem.rdchem.Mol object at 0x000001EDD5B...
2424,HMG-CoA reductase,Homo sapiens,201.253,c1c(-c2nc(cccc3)c3[nH]2)ncs1,Compound 2425,<rdkit.Chem.rdchem.Mol object at 0x000001EDD5B...
2426,HMG-CoA reductase,Homo sapiens,252.315,c1cc2c(cccc3ccc4ccc5)c3c4c5c2cc1,Compound 2427,<rdkit.Chem.rdchem.Mol object at 0x000001EDD5B...


In [8]:
# Build molecule objects from the 'Smiles' column and store them in a new
# 'Structure' column of `dataset` (the frame is modified in place).
PandasTools.AddMoleculeColumnToFrame(
    frame=dataset,
    smilesCol='Smiles',
    molCol='Structure',
    includeFingerprints=True,
)

In [10]:
# Names of every property the RDKit `Properties` calculator offers, and a
# calculator instance configured to compute all of them in that same order.
properties_cls = rdMolDescriptors.Properties
descriptor_names = list(properties_cls.GetAvailableProperties())
get_descriptors = properties_cls(descriptor_names)

# Report how many descriptors are available, then list their names.
print(len(descriptor_names))
descriptor_names

43


['exactmw',
 'amw',
 'lipinskiHBA',
 'lipinskiHBD',
 'NumRotatableBonds',
 'NumHBD',
 'NumHBA',
 'NumHeavyAtoms',
 'NumAtoms',
 'NumHeteroatoms',
 'NumAmideBonds',
 'FractionCSP3',
 'NumRings',
 'NumAromaticRings',
 'NumAliphaticRings',
 'NumSaturatedRings',
 'NumHeterocycles',
 'NumAromaticHeterocycles',
 'NumSaturatedHeterocycles',
 'NumAliphaticHeterocycles',
 'NumSpiroAtoms',
 'NumBridgeheadAtoms',
 'NumAtomStereoCenters',
 'NumUnspecifiedAtomStereoCenters',
 'labuteASA',
 'tpsa',
 'CrippenClogP',
 'CrippenMR',
 'chi0v',
 'chi1v',
 'chi2v',
 'chi3v',
 'chi4v',
 'chi0n',
 'chi1n',
 'chi2n',
 'chi3n',
 'chi4n',
 'hallKierAlpha',
 'kappa1',
 'kappa2',
 'kappa3',
 'Phi']

In [12]:
def smi_to_descriptors(smile):
    """Compute the configured RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smile : str
        SMILES representation of a molecule. Any non-string value (for
        example a missing entry) is treated as unparsable.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``descriptor_names``, in that order, as
        returned by ``get_descriptors.ComputeProperties``. When the SMILES
        cannot be turned into a molecule, an array of NaN of the same length
        is returned instead, so that every row has the same width and the
        results can be expanded into one column per descriptor.
    """
    # Only hand real strings to the parser; anything else counts as a failure.
    mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
    if mol is None:
        # Fixed-width NaN placeholder instead of an empty list: a ragged
        # column cannot be assigned to the `descriptor_names` columns later.
        return py.full(len(descriptor_names), py.nan)
    return py.array(get_descriptors.ComputeProperties(mol))
# Run the descriptor calculation on every entry of the 'Smiles' column and
# keep the per-molecule result in a single 'descriptors' column.
smiles_column = dataset['Smiles']
dataset['descriptors'] = smiles_column.apply(smi_to_descriptors)

In [13]:
# Spread the per-row descriptor vectors into one column per descriptor name.
# This requires every entry of 'descriptors' to have len(descriptor_names) values.
descriptor_rows = dataset['descriptors'].tolist()
dataset[descriptor_names] = descriptor_rows

# Show only the newly created descriptor columns.
dataset.loc[:, descriptor_names]

Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,291.011957,292.140,5.0,2.0,1.0,2.0,5.0,17.0,27.0,6.0,...,8.943220,5.289999,2.506022,2.506022,1.708586,-1.69,10.463099,4.289689,1.954369,2.640202
1,291.011957,292.140,5.0,2.0,1.0,2.0,5.0,17.0,27.0,6.0,...,8.943220,5.289999,2.506022,2.506022,1.708586,-1.69,10.463099,4.289689,1.954369,2.640202
2,155.957462,157.010,0.0,0.0,0.0,0.0,0.0,7.0,12.0,1.0,...,3.764716,2.099666,0.733111,0.733111,0.414667,-0.30,4.849254,2.123680,1.316124,1.471181
3,155.957462,157.010,0.0,0.0,0.0,0.0,0.0,7.0,12.0,1.0,...,3.764716,2.099666,0.733111,0.733111,0.414667,-0.30,4.849254,2.123680,1.316124,1.471181
4,368.225249,368.524,2.0,0.0,6.0,0.0,2.0,28.0,56.0,2.0,...,16.322266,10.178996,5.476133,5.476133,3.990870,-2.68,18.672581,9.495928,4.776071,6.332624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,201.036068,201.254,3.0,1.0,1.0,1.0,3.0,14.0,21.0,4.0,...,7.266777,4.266449,2.064315,2.064315,1.403585,-1.68,7.698702,2.919836,1.117699,1.605639
2426,252.093900,252.316,0.0,0.0,0.0,0.0,0.0,20.0,32.0,0.0,...,10.928203,6.976068,4.393233,4.393233,3.557443,-2.60,10.219023,3.486522,1.119859,1.781442
2427,252.093900,252.316,0.0,0.0,0.0,0.0,0.0,20.0,32.0,0.0,...,10.928203,6.976068,4.393233,4.393233,3.557443,-2.60,10.219023,3.486522,1.119859,1.781442
2428,252.093900,252.316,0.0,0.0,0.0,0.0,0.0,20.0,32.0,0.0,...,10.928203,6.970085,4.370904,4.370904,3.489494,-2.60,10.219023,3.486522,1.163009,1.781442
