In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
import pandas as pd

In [2]:
# Load the chemical basic-information table into a DataFrame
df = pd.read_csv(filepath_or_buffer='DEDuCT_ChemicalBasicInformation.csv')

In [4]:
# Check every SMILES string and collect its canonical form
df_smiles = df['SMILES (Canonical)']
c_smiles = []
for ds in df_smiles:
    try:
        cs = Chem.CanonSmiles(ds)
    except Exception:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit still stop the cell while ordinary failures are reported.
        # NOTE: a reported entry is skipped, so c_smiles becomes shorter than
        # df_smiles and no longer lines up with the rows of df.
        print('Invalid SMILES:', ds)
    else:
        c_smiles.append(cs)
# The two counts are equal only when no entry was skipped
print(len(c_smiles), len(df_smiles))

686 686


In [5]:
# Build one molecule object per canonical SMILES
ms = [Chem.MolFromSmiles(x) for x in c_smiles]

# Build one fingerprint (fp) per molecule, in the same order as c_smiles
fps = [FingerprintMols.FingerprintMol(x) for x in ms]

# Reference molecule of interest, e.g. Bisphenol A (BPA) or estradiol.
# To change the reference, pass a different SMILES to MolFromSmiles below.
estradiol_SMILES = 'CC12CCC3C(C1CCC2O)CCC4=C3C=CC(=C4)O'
BPA_SMILES = 'CC(C)(C1=CC=C(C=C1)O)C2=CC=C(C=C2)O'
query_ms = Chem.MolFromSmiles(BPA_SMILES)
if query_ms is None:
    # Fail early with a readable message instead of an opaque error from the
    # fingerprint call when the reference SMILES cannot be turned into a molecule.
    raise ValueError('Reference SMILES could not be parsed by RDKit')
query_fp = FingerprintMols.FingerprintMol(query_ms)

# ta is the list of targets; sim holds the Tanimoto similarity between each
# target and the reference molecule chosen above (same order as ta).
ta = list(c_smiles)
# One bulk call replaces the per-index loop; it returns a list of floats.
sim = DataStructs.BulkTanimotoSimilarity(query_fp, fps)
print(ta, sim)

# Build the dataframe and sort it by similarity to the reference, highest first
df_scores = pd.DataFrame(data={'Target': ta, 'Similarity': sim})
df_final = df_scores.sort_values('Similarity', ascending=False)
print(df_final)

# Save the ranked targets as csv
df_final.to_csv('Classified.csv', index=False, sep=',')


['ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl', 'Nc1n[nH]c(=S)s1.Nc1n[nH]c(=S)s1.[Cu]', 'ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl', 'COc1cc(-c2cc(=O)c3ccccc3o2)ccc1[N+](=O)[O-]', 'O=P(O)(O)OCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F', 'Cc1cc(O)c2c(c1)C(=O)c1cccc(O)c1C2=O', '[Cu+2].[S-]c1nnc(NCNc2nnc([S-])s2)s1', 'N#CC(Cc1ccc(O)cc1)c1ccc(O)cc1', 'Oc1ccc(C2(c3ccc(O)cc3)CC3CC2C2CCCC32)cc1', '[Cl-].[Cl-].[Mn+2]', 'CC(C)C(C(=O)OC(C#N)c1cccc(Oc2ccccc2)c1)c1ccc(Cl)cc1', 'NC(=O)c1cnccn1', 'CC12CCC3c4ccc(O)cc4CC(CCCCCCCCCS(=O)CCCC(F)(F)C(F)(F)F)C3C1CCC2O', 'FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)I', 'Brc1cc(-c2cc(Br)c(Br)c(Br)c2)cc(Br)c1Br', 'Oc1ccc(-c2c(Cl)cc(Cl)cc2Cl)cc1', 'O=S(=O)(O)O.[Fe]', 'CN(C)c1ccc(C(O)(c2ccccc2)c2ccc(N(C)C)cc2)cc1', 'CNc1c(C)n(C)n(-c2ccccc2)c1=O', 'CC(=O)C1(O)CCC2C3CC(C)C4=CC(=O)CCC4(C)C3CCC21C', 'Clc1cc2c(Cl)c(Cl)c(Cl)c(Cl)c2cc1Cl', 'CS(=O)(=O)c1cc(Cl)cc(-c2cc(Cl)c(Cl)cc2Cl)c1Cl', 'NC(Cc1ccc(O)c(-c2cc(CC(N)C(=O)O)ccc2O)c1)C(=O)O', 'Brc1ccc(-c2ccc(Br)c(Br)c2)cc

In [None]:
# Write a copy of the Chemical Basic Information csv with a 'Similarity' column,
# ordered from the highest to the lowest similarity.
# `sim` comes from the similarity cell and is attached to the rows by position.
df_classified = (
    pd.read_csv('DEDuCT_ChemicalBasicInformation.csv')
    .assign(Similarity=sim)
    .sort_values(by='Similarity', ascending=False)
)
df_classified.to_csv(
    'DEDuCT_Classified_ChemicalBasicInformation.csv',
    sep=',',
    index=False,
)