In [1]:
import pandas as pd
from rdkit.Chem import PandasTools, Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem import Descriptors
import seaborn as sns

In [2]:
def butina_cluster(mol_list, cutoff=0.35, radius=3, n_bits=2048):
    """Assign a Butina cluster id to every molecule in ``mol_list``.

    Each molecule is converted to a Morgan bit-vector fingerprint, pairwise
    distances are computed as ``1 - Tanimoto similarity``, and the distances
    are handed to ``Butina.ClusterData``.

    Parameters
    ----------
    mol_list : sequence of RDKit molecules
        Entries are passed unchanged to ``GetMorganFingerprintAsBitVect``.
    cutoff : float, default 0.35
        Threshold forwarded to ``Butina.ClusterData``; it is applied to the
        ``1 - Tanimoto`` distances built here.
    radius : int, default 3
        Morgan fingerprint radius.
    n_bits : int, default 2048
        Length of the fingerprint bit vector.

    Returns
    -------
    list of int
        One cluster id per input molecule, in input order. Ids start at 1 and
        follow the order in which ``Butina.ClusterData`` returns its clusters.
    """
    fp_list = [
        rdmd.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        for mol in mol_list
    ]
    n_mols = len(fp_list)

    # Flattened lower triangle of the distance matrix, row by row:
    # (1,0), (2,0), (2,1), (3,0), ...
    dists = [
        1 - sim
        for i in range(1, n_mols)
        for sim in DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
    ]

    mol_clusters = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    # Invert "cluster -> member indices" into "member index -> cluster id".
    cluster_id_list = [0] * n_mols
    for cluster_id, members in enumerate(mol_clusters, 1):
        for member in members:
            cluster_id_list[member] = cluster_id
    return cluster_id_list

In [3]:
# Load the compound table (presumably the DUD-E ERK2/MK01 set, judging by the
# file name). The path is relative, so it resolves against the working directory.
df = pd.read_csv("../data/dude_erk2_mk01.csv")

In [4]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,SMILES,Name,is_active
0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCC...,168691,1
1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2...,86358,1
2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575087,1
3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575065,1
4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575047,1


In [5]:
# Parse each SMILES string into an RDKit molecule, stored in-place in a new
# "ROMol" column of df (the column name the later cells read from).
PandasTools.AddMoleculeColumnToFrame(df, smilesCol="SMILES", molCol="ROMol")

In [6]:
# Preview again to confirm the ROMol column was added.
df.head()

Unnamed: 0,SMILES,Name,is_active,ROMol
0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCC...,168691,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2...,86358,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575087,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575065,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575047,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."


In [7]:
# Cluster all molecules and store one cluster id per row.
df["Cluster"] = butina_cluster(df["ROMol"].to_numpy())

In [8]:
# Preview to check the new Cluster column.
df.head()

Unnamed: 0,SMILES,Name,is_active,ROMol,Cluster
0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCC...,168691,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",3918
1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2...,86358,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",3917
2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575087,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2
3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575065,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2
4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575047,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2


In [9]:
# Compute logP for every molecule with Descriptors.MolLogP.
df["logP"] = df["ROMol"].apply(Descriptors.MolLogP)

In [10]:
# Preview to check the new logP column.
df.head()

Unnamed: 0,SMILES,Name,is_active,ROMol,Cluster,logP
0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCC...,168691,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",3918,5.49788
1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2...,86358,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",3917,4.354
2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575087,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2,4.96202
3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575065,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2,4.36922
4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3...,575047,1,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",2,5.12922


In [11]:
# Order rows by cluster id, then by ascending logP within each cluster.
# Rebinding the sorted copy (instead of inplace=True) keeps the cell
# idempotent and follows pandas' recommended style.
df = df.sort_values(["Cluster", "logP"])

In [12]:
# Preview the table after sorting by Cluster, then logP.
df.head()

Unnamed: 0,SMILES,Name,is_active,ROMol,Cluster,logP
1142,C=CCNS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@H]3[C@H...,C20681887,0,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",1,3.3886
1193,C=CCNS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@H]3[C@H]...,C20681890,0,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",1,3.3886
387,CC(C)NS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@@H]3[C...,C13639961,0,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",1,3.611
1137,CC(C)NS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@@H]3[C@...,C13639963,0,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",1,3.611
991,CC(C)(C)NS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@@H]3...,C13639902,0,"<img src=""data:image/png;base64,iVBORw0KGgoAAA...",1,4.0011


In [17]:
# Keep the first row of every cluster. Assuming df is still ordered by
# (Cluster, logP) from the sort cell, that is each cluster's lowest-logP member.
df_unique = df.drop_duplicates(subset="Cluster", keep="first")

In [18]:
# Preview the per-cluster representatives.
# NOTE(review): the recorded output for this cell shows an `ID` column and an
# extra `Unnamed: 0` column that the earlier previews do not have, and the
# execution counts jump from 12 to 17 -- the saved output looks stale.
# Re-run with Restart Kernel -> Run All before relying on it.
df_unique.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol,Cluster,logP
1142,1063,C=CCNS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@H]3[C@H](N2)c4ccc(cc4O)O,C20681887,0,,1,3.3886
72,72,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@@H](CO)c3ccccc3)Nc4ccccc4,574871,1,,2,3.98712
28,28,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCCO5,574915,1,,3,4.41172
62,62,c1ccc(cc1)[C@@H](C(=O)N)NC(=O)c2cc(c[nH]2)c3c(cn[nH]3)c4cccc(c4)Cl,370131,1,,4,3.6816
2779,2700,C[C@@]12CC[C@H]([C@@]([C@H]1Cc3c(nc(s3)NC)[C@H]2CC(=O)NC4CCCCC4)(C)CO)O,C04222506,0,,5,3.4393
