In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
import pandas as pd

In [2]:
# Load the chemical basic-information table from disk into a dataframe
df = pd.read_csv(filepath_or_buffer='DEDuCT_ChemicalBasicInformation.csv')

In [4]:
# Validate every SMILES string in the table and collect its RDKit-canonical form.
# NOTE: entries that fail are skipped, which makes c_smiles shorter than the
# dataframe; code that later assigns per-row results back to the table by
# position assumes that nothing was skipped (compare the two counts printed below).
df_smiles = df['SMILES (Canonical)']
c_smiles = []
for ds in df_smiles:
    try:
        cs = Chem.CanonSmiles(ds)
    except Exception:
        # Catch ordinary errors only, so that Ctrl-C / kernel interrupts
        # still propagate instead of being reported as an invalid SMILES.
        print('Invalid SMILES:', ds)
    else:
        c_smiles.append(cs)
# Number of SMILES accepted vs. number of rows read
print(len(c_smiles), len(df_smiles))

686 686


In [7]:
# Parse the canonical SMILES back into RDKit molecule objects
ms = [Chem.MolFromSmiles(x) for x in c_smiles]

# Compute one fingerprint (fp) per molecule
fps = [FingerprintMols.FingerprintMol(x) for x in ms]

# Build the query fingerprint for the reference molecule of interest,
# e.g. Bisphenol A (BPA) or estradiol
estradiol_SMILES = 'CC12CCC3C(C1CCC2O)CCC4=C3C=CC(=C4)O'
BPA_SMILES = 'CC(C)(C1=CC=C(C=C1)O)C2=CC=C(C=C2)O'
query_ms = Chem.MolFromSmiles(BPA_SMILES)
if query_ms is None:
    # MolFromSmiles signals a parse failure by returning None; fail here with a
    # clear message rather than inside the fingerprinting call.
    raise ValueError('Reference SMILES could not be parsed')
query_fp = FingerprintMols.FingerprintMol(query_ms)

# ta is the list of targets (same order as c_smiles); sim holds the Tanimoto
# similarity between each target and the reference molecule chosen above.
# The bulk call scores all fingerprints in one pass instead of one call per item.
ta = list(c_smiles)
sim = DataStructs.BulkTanimotoSimilarity(query_fp, fps)

# Build the dataframe and sort it by similarity to the reference molecule,
# most similar first
d = {'Target': ta, 'Similarity': sim}
df_scores = pd.DataFrame(data=d)
df_final = df_scores.sort_values('Similarity', ascending=False)

# Save as csv
df_final.to_csv('Classified.csv', index=False, sep=',')


In [None]:
# Ranked copy of the Chemical Basic Information csv file: re-read the table,
# attach the similarity scores as a new column, order rows from most to least
# similar, and write the result out.
# `sim` is matched to rows by position, so it must hold one score per csv row.
df_classified = (
    pd.read_csv('DEDuCT_ChemicalBasicInformation.csv')
    .assign(Similarity=sim)
    .sort_values('Similarity', ascending=False)
)
df_classified.to_csv('DEDuCT_Classified_ChemicalBasicInformation.csv', index=False, sep=',')