In [1]:
import pandas as pd
import swifter
from rdkit import Chem
from rdkit.DataStructs import TanimotoSimilarity, CosineSimilarity

def pairwise_distance(input_str, other_str):
    """Return the cosine distance between the RDKit fingerprints of two SMILES.

    Args:
        input_str: SMILES string of the first molecule.
        other_str: SMILES string of the second molecule.

    Returns:
        ``1 - CosineSimilarity(fp1, fp2)`` where the fingerprints come from
        ``Chem.RDKFingerprint``, or ``None`` if either SMILES yields no
        molecule from ``Chem.MolFromSmiles``.
    """
    # Both strings are parsed before the validity check, in argument order.
    parsed = [Chem.MolFromSmiles(smiles) for smiles in (input_str, other_str)]
    if any(mol is None for mol in parsed):
        return None
    first_fp, second_fp = [Chem.RDKFingerprint(mol) for mol in parsed]
    return 1 - CosineSimilarity(first_fp, second_fp)

def tanimoto_similarity(input_str, other_str):
    """Return the Tanimoto similarity of two molecules given as SMILES.

    Args:
        input_str: SMILES string of the first molecule.
        other_str: SMILES string of the second molecule.

    Returns:
        ``TanimotoSimilarity`` of the two ``Chem.RDKFingerprint`` results, or
        ``None`` if either SMILES yields no molecule from
        ``Chem.MolFromSmiles``.
    """
    query_mol = Chem.MolFromSmiles(input_str)
    target_mol = Chem.MolFromSmiles(other_str)
    # Only fingerprint when both molecules were successfully built.
    if query_mol is not None and target_mol is not None:
        query_fp = Chem.RDKFingerprint(query_mol)
        target_fp = Chem.RDKFingerprint(target_mol)
        return TanimotoSimilarity(query_fp, target_fp)
    return None

def combined_metric(tanimoto_coefficient, pairwise_distance):
    """Average a Tanimoto coefficient with a distance rescaled to a similarity.

    The distance is mapped through ``1 / (1 + distance)`` (so a distance of 0
    becomes 1) and the result is the arithmetic mean of that value and the
    Tanimoto coefficient.

    Args:
        tanimoto_coefficient: Similarity value to average.
        pairwise_distance: Distance value; note that inside this function the
            parameter shadows the module-level ``pairwise_distance`` function.

    Returns:
        ``0.5 * (tanimoto_coefficient + 1 / (1 + pairwise_distance))``.
    """
    distance_as_similarity = 1 / (1 + pairwise_distance)
    return 0.5 * (tanimoto_coefficient + distance_as_similarity)

def get_avg_similarity(input_str, ref_df):
    """Average similarity and distance of one molecule against a reference set.

    Uses the same fingerprints and metrics as ``tanimoto_similarity`` and
    ``pairwise_distance`` (``Chem.RDKFingerprint`` with ``TanimotoSimilarity``
    and ``1 - CosineSimilarity``), but parses and fingerprints the query only
    once and each reference only once instead of repeating that work for every
    metric on every row.

    Args:
        input_str: SMILES string of the query molecule.
        ref_df: Object exposing the reference SMILES strings as
            ``ref_df.SMILES.values`` (e.g. a DataFrame with a ``SMILES``
            column).

    Returns:
        Tuple ``(mean_tanimoto_similarity, mean_cosine_distance)`` computed
        over the reference SMILES that RDKit could parse.

    Raises:
        ValueError: If ``input_str`` cannot be parsed, or if no reference
            SMILES can be parsed (which includes an empty reference set).
    """
    query_mol = Chem.MolFromSmiles(input_str)
    if query_mol is None:
        raise ValueError(f"Could not parse query SMILES: {input_str!r}")
    # Loop-invariant: the query fingerprint does not depend on the reference.
    query_fp = Chem.RDKFingerprint(query_mol)

    similarity_total = 0.0
    distance_total = 0.0
    n_valid = 0
    for ref_smiles in ref_df.SMILES.values:
        ref_mol = Chem.MolFromSmiles(ref_smiles)
        if ref_mol is None:
            # Unparsable reference entries are skipped rather than poisoning
            # the running sums with None.
            continue
        ref_fp = Chem.RDKFingerprint(ref_mol)
        similarity_total += TanimotoSimilarity(query_fp, ref_fp)
        distance_total += 1 - CosineSimilarity(query_fp, ref_fp)
        n_valid += 1

    if n_valid == 0:
        raise ValueError("ref_df contains no parsable reference SMILES")
    # Divide by the number of molecules actually compared so skipped rows do
    # not bias the averages towards zero.
    return similarity_total / n_valid, distance_total / n_valid

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the reference set by stacking the covalent and non-covalent training
# CSVs (in that order); the original row indices of each file are kept.
ref_df = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            "./data/SMILES_training/trainingset_covalent_smiles.csv",
            "./data/SMILES_training/trainingset_noncovalent_smiles.csv",
        )
    ]
)
# Round-trip every SMILES through RDKit (parse, then write back out) and
# overwrite the column with the result; swifter drives the apply.
ref_df["SMILES"] = ref_df["SMILES"].swifter.apply(
    lambda smiles: Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
)

Pandas Apply: 100%|██████████| 52227/52227 [00:10<00:00, 4760.61it/s]


In [3]:
# Query molecule, written as a SMILES string with explicit hydrogens.
test_mol = "[H]C#CN([H])C(=O)c1nc(N2C([H])([H])C([H])([H])N(C([H])([H])c3c([H])c([H])c(O[H])c(C([H])=O)c3[H])C([H])([H])C2([H])[H])nc(N([H])c2nn([H])c(C3([H])C([H])([H])C3([H])[H])c2[H])c1[H]"
# Last expression of the cell, so the returned tuple of averages against
# ref_df is what the notebook displays.
get_avg_similarity(input_str=test_mol, ref_df=ref_df)

(0.3451725584548221, 0.4822043934436243)