In [11]:
import pandas as pd
from rdkit.Chem import PandasTools, Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem import Descriptors
import seaborn as sns

In [4]:
def butina_cluster(mol_list,cutoff=0.35):
    """Assign a Butina cluster id to every molecule in ``mol_list``.

    Each molecule is turned into a 2048-bit Morgan fingerprint (radius 3),
    pairwise Tanimoto distances (1 - similarity) are collected, and the
    distances are handed to ``Butina.ClusterData``.

    Parameters
    ----------
    mol_list : sequence of RDKit molecules
        Molecules to cluster; the output preserves this order.
    cutoff : float, optional
        Distance threshold passed straight to ``Butina.ClusterData``.

    Returns
    -------
    list of int
        One cluster id per input molecule. Ids start at 1 and follow the
        order in which ``Butina.ClusterData`` returns its clusters.
    """
    fingerprints = []
    for mol in mol_list:
        fingerprints.append(rdmd.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048))
    n_mols = len(fingerprints)

    # Flattened lower triangle of the distance matrix: for each molecule after
    # the first, the distances to all molecules that precede it.
    distances = []
    for row, fp in enumerate(fingerprints):
        if row == 0:
            continue
        similarities = DataStructs.BulkTanimotoSimilarity(fp, fingerprints[:row])
        distances += [1 - sim for sim in similarities]

    clusters = Butina.ClusterData(distances, n_mols, cutoff, isDistData=True)

    # Invert "cluster -> member indices" into "member index -> cluster id".
    labels = [0] * n_mols
    for cluster_id, members in enumerate(clusters, start=1):
        for mol_idx in members:
            labels[mol_idx] = cluster_id
    return labels

In [5]:
# Load the compound table from a CSV in the working directory (relative path).
# Presumably the DUD-E ERK2 / MK01 set, judging by the file name -- confirm the source.
df = pd.read_csv("dude_erk2_mk01.csv")

In [6]:
# Preview the first rows to check which columns were read from the CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active
0,0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCCN5CCOCC5)OC,168691,1
1,1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2c7ccccc7c6c8c5C(=O)NC8)NC)OC,86358,1
2,2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OC(O5)(F)F,575087,1
3,3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCO5,575065,1
4,4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4CCC5,575047,1


In [7]:
# Build molecule objects from the "SMILES" column; df is modified in place
# (no reassignment) and gains the molecule column shown as "ROMol" in the next preview.
PandasTools.AddMoleculeColumnToFrame(df,smilesCol="SMILES")

In [8]:
# Confirm the new ROMol column is present.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol
0,0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCCN5CCOCC5)OC,168691,1,
1,1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2c7ccccc7c6c8c5C(=O)NC8)NC)OC,86358,1,
2,2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OC(O5)(F)F,575087,1,
3,3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCO5,575065,1,
4,4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4CCC5,575047,1,


In [9]:
# Cluster all molecules with the default cutoff and store the ids, in row order, as a new column.
df["Cluster"] = butina_cluster(df["ROMol"].values)

In [10]:
# Check the cluster ids assigned to the first few molecules.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol,Cluster
0,0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCCN5CCOCC5)OC,168691,1,,3918
1,1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2c7ccccc7c6c8c5C(=O)NC8)NC)OC,86358,1,,3917
2,2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OC(O5)(F)F,575087,1,,2
3,3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCO5,575065,1,,2
4,4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4CCC5,575047,1,,2


In [12]:
# Calculated logP for every molecule, evaluated in row order and stored as a new column.
df["logP"] = list(map(Descriptors.MolLogP, df["ROMol"]))

In [13]:
# Check the computed logP values next to the cluster ids.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol,Cluster,logP
0,0,Cn1ccnc1Sc2ccc(cc2Cl)Nc3c4cc(c(cc4ncc3C#N)OCCCN5CCOCC5)OC,168691,1,,3918,5.49788
1,1,C[C@@]12[C@@H]([C@@H](CC(O1)n3c4ccccc4c5c3c6n2c7ccccc7c6c8c5C(=O)NC8)NC)OC,86358,1,,3917,4.354
2,2,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OC(O5)(F)F,575087,1,,2,4.96202
3,3,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCO5,575065,1,,2,4.36922
4,4,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4CCC5,575047,1,,2,5.12922


In [15]:
# Order rows by cluster id, then by ascending logP within each cluster, so the
# first row of every cluster is its lowest-logP member.
# The sorted copy is rebound to `df` rather than sorting with inplace=True:
# the result is the same for later cells, but the frame object displayed by
# earlier cells is no longer mutated behind their back.
df = df.sort_values(["Cluster","logP"])

In [16]:
# After the sort, the head starts with the lowest cluster id and its lowest-logP members.
df.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol,Cluster,logP
1142,1063,C=CCNS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@H]3[C@H](N2)c4ccc(cc4O)O,C20681887,0,,1,3.3886
1193,1114,C=CCNS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@H]3[C@H](N2)c4ccc(cc4O)O,C20681890,0,,1,3.3886
387,308,CC(C)NS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@@H]3[C@@H](N2)c4ccc(cc4O)O,C13639961,0,,1,3.611
1137,1058,CC(C)NS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@@H]3[C@@H](N2)c4ccc(cc4O)O,C13639963,0,,1,3.611
3740,3661,CC[C@H](C)NS(=O)(=O)c1ccc2c(c1)[C@H]3C=CC[C@H]3[C@H](N2)c4ccc(cc4O)O,C20682285,0,,1,4.0011


In [17]:
# Keep only the first row of each cluster. Because df was sorted by
# Cluster and then logP, that row is the cluster's lowest-logP member.
df_unique = df.drop_duplicates(subset="Cluster", keep="first")

In [18]:
# Preview the per-cluster representatives (one row per cluster id).
df_unique.head()

Unnamed: 0.1,Unnamed: 0,SMILES,ID,is_active,ROMol,Cluster,logP
1142,1063,C=CCNS(=O)(=O)c1ccc2c(c1)[C@@H]3C=CC[C@H]3[C@H](N2)c4ccc(cc4O)O,C20681887,0,,1,3.3886
72,72,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@@H](CO)c3ccccc3)Nc4ccccc4,574871,1,,2,3.98712
28,28,Cc1cnc(nc1c2cc([nH]c2)C(=O)N[C@H](CO)c3cccc(c3)Cl)Nc4cccc5c4OCCO5,574915,1,,3,4.41172
62,62,c1ccc(cc1)[C@@H](C(=O)N)NC(=O)c2cc(c[nH]2)c3c(cn[nH]3)c4cccc(c4)Cl,370131,1,,4,3.6816
2779,2700,C[C@@]12CC[C@H]([C@@]([C@H]1Cc3c(nc(s3)NC)[C@H]2CC(=O)NC4CCCCC4)(C)CO)O,C04222506,0,,5,3.4393
