In [1]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import MolsToGridImage
import mols2grid

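A simple function to calculate the Tanimoto similarity between the fingerprints of two molecules supplied as SMILES.  When comparing fragments to leads, it's important to use count fingerprints.  Otherwise the similarity will be dramatically overestimated in cases where the lead is highly symmetric, because a binary fingerprint records a repeated substructure only once no matter how many times it occurs.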

In [2]:
def smiles_similarity(smi_1, smi_2, use_counts=False):
    """Return the Tanimoto similarity between the fingerprints of two SMILES.

    Both SMILES are parsed with ``Chem.MolFromSmiles`` and fingerprinted with
    RDKit's bulk helpers ``rdFingerprintGenerator.GetFPs`` (bit vectors) or
    ``rdFingerprintGenerator.GetCountFPs`` (count vectors). The helpers are
    called without an explicit ``fpType``, so RDKit's default fingerprint
    settings for them apply.

    Parameters
    ----------
    smi_1, smi_2 : str
        SMILES strings of the two molecules to compare.
    use_counts : bool, default False
        If True, compare count fingerprints, so a substructure that occurs
        several times contributes several times. If False, compare bit
        fingerprints, which only record presence/absence.

    Returns
    -------
    float
        The value of ``DataStructs.TanimotoSimilarity`` for the two fingerprints.

    Raises
    ------
    ValueError
        If either SMILES cannot be parsed by RDKit.
    """
    # NOTE(review): a Morgan generator (radius=2, fpSize=2048,
    # countSimulation=True) used to be built here on every call but was never
    # passed to the fingerprint helpers, so it had no effect on the result.
    # If those settings are actually wanted, compute the fingerprints with the
    # generator's own GetFingerprint / GetCountFingerprint methods and
    # re-validate the numbers.
    mols = []
    for smi in (smi_1, smi_2):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of deeper inside RDKit.
            raise ValueError(f"Could not parse SMILES: {smi!r}")
        mols.append(mol)

    if use_counts:
        make_fps = rdFingerprintGenerator.GetCountFPs
    else:
        make_fps = rdFingerprintGenerator.GetFPs
    fp_1, fp_2 = make_fps(mols)
    return DataStructs.TanimotoSimilarity(fp_1, fp_2)

Read the data 

In [3]:
# Load the fragment/lead pair table; later cells read its "Fragment" and "Lead" columns.
df = pd.read_csv(filepath_or_buffer="data/fragment_lead_pairs.csv")

Calculate the similarity without considering counts

In [4]:
# Bit-fingerprint similarity (use_counts=False) for every fragment/lead pair.
df['fp_sim'] = [
    smiles_similarity(fragment_smi, lead_smi, use_counts=False)
    for fragment_smi, lead_smi in zip(df["Fragment"], df["Lead"])
]

Calculate the similarity considering counts

In [5]:
# Count-fingerprint similarity (use_counts=True) for every fragment/lead pair.
df['count_sim'] = [
    smiles_similarity(fragment_smi, lead_smi, use_counts=True)
    for fragment_smi, lead_smi in zip(df["Fragment"], df["Lead"])
]

Sort the data by **fp_sim** to show cases where binary fingerprint similarity overestimates the fragment-to-lead similarity. Note the first two examples in the table below, where **fp_sim** is much higher than **count_sim**.

In [6]:
# Order the pairs from highest to lowest bit-fingerprint similarity.
# Rebinding the sorted frame (instead of inplace=True) keeps the statement
# chainable and follows the pandas recommendation to avoid in-place mutation.
df = df.sort_values(by="fp_sim", ascending=False)

In [7]:
# Display the sorted table; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,Year,Table_Entry,Fragment,Lead,fp_sim,count_sim
128,2017,19,Cn1cc(cc1C(=O)N)C(=O)c2cccc(Cl)c2Cl,Cn1cc(cc1C(=O)NCCNC(=O)c2cc(cn2C)C(=O)c3cccc(C...,0.695652,0.400000
107,2018,24,O=S1(=O)NCN(C2CC2)c3ccccc31,O=S1(=O)NCN(C2CC2)c3cc(CCc4ccc5c(c4)N(CNS5(=O)...,0.625000,0.360825
125,2017,16,Cc1cc(C)c2oc(N)nc2c1,Cc1cc(c2ccccc2)c3oc(N)nc3c1,0.583333,0.490909
42,2021,28,CCOC(=O)N1CCC(CC1)N2CCC(CC2)C(=O)N,CCOC(=O)N1CCC[C@@H](CC1)N2CCC(CC2)C(=O)NC3(C)CCC3,0.560000,0.612500
45,2020,3,C1CNCC(C1)Nc2cccc3cnccc23,Nc1ccc(c(F)c1)c2cc(N[C@H]3CCCCNC3)c4ccncc4c2,0.550000,0.548780
...,...,...,...,...,...,...
16,2021,2,Nc1ncnc2[nH]cnc12,C[C@@H]1CCCN1c2c(C#N)c3c(N)nc(Nc4cnn([C@H]5CCO...,0.088608,0.103774
144,2016,5,c1ccc2cnccc2c1,CNc1cc2ccc(C#CCOc3ccc(CN4CCNCC4)c(c3)C(F)(F)F)...,0.084507,0.146789
93,2018,10,O=c1[nH]cnc2sccc12,C[C@H](CC(=O)N1CCC(O)(Cn2cnc3c(c4ccc(CN)cc4)n(...,0.083333,0.087302
4,2022,5,CN(C)C(=O)C(N)Cc1ccc(F)cc1,Clc1ccc(cc1)[C@H]2CN[C@H](C2)C(=O)N3CCN(CC3)c4...,0.076923,0.116071


In [8]:
# Draw the lead structures on an interactive grid, labelling each one with
# both similarity scores formatted to two decimal places.
mols2grid.display(
    df,
    smiles_col="Lead",
    subset=["img", "fp_sim", "count_sim"],
    transform={
        score_col: (lambda score: f"{score:.2f}")
        for score_col in ("fp_sim", "count_sim")
    },
)

MolGridWidget()