In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from tqdm import tqdm

In [2]:
# Hook tqdm into pandas; the `.progress_apply` calls in later cells rely on this.
tqdm.pandas()


In [5]:
# Load the CSV export for document CHEMBL1142817; the file is semicolon-delimited.
df = pd.read_csv("./ChEMBL_doc_CHEMBL1142817/CHEMBL1142817.csv", sep=";")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value
0,CHEMBL839,CARTEOLOL,4.0,292.38,0.0,1.7,Carteolol,CC(C)(C)NCC(O)COc1cccc2c1CCC(=O)N2,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
1,CHEMBL1423,PIMOZIDE,4.0,461.56,1.0,5.86,SID56422165,O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc...,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
2,CHEMBL114,SAQUINAVIR,4.0,670.86,1.0,3.09,Saquinavir,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
3,CHEMBL389621,HYDROCORTISONE,4.0,362.47,0.0,1.78,Cortisol,C[C@]12CCC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[C...,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
4,CHEMBL328250,ADINAZOLAM,2.0,351.84,0.0,3.33,Adinazolam,CN(C)Cc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1)=NC2,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
5,CHEMBL108,CARBAMAZEPINE,4.0,236.27,0.0,3.39,Carbamazepine,NC(=O)N1c2ccccc2C=Cc2ccccc21,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
6,CHEMBL629,AMITRIPTYLINE,4.0,277.41,0.0,4.17,Amitriptyline,CN(C)CCC=C1c2ccccc2CCc2ccccc21,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
7,CHEMBL549,CITALOPRAM,4.0,324.4,0.0,3.81,Citalopram,CN(C)CCCC1(c2ccc(F)cc2)OCc2cc(C#N)ccc21,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
8,CHEMBL24646,PIMOBENDAN,2.0,334.38,0.0,3.1,Pimobendan,COc1ccc(-c2nc3cc(C4=NNC(=O)CC4C)ccc3[nH]2)cc1,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,
9,CHEMBL115,INDINAVIR,4.0,613.8,1.0,2.87,Indinavir,CC(C)(C)NC(=O)[C@@H]1CN(Cc2cccnc2)CCN1C[C@@H](...,Log 1/Km,'=',...,SINGLE PROTEIN,CHEMBL1142817,1,Scientific Literature,Bioorg Med Chem Lett,2005,,,,


In [6]:
# standardisation of SMILES
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs
import mols2grid

In [7]:

# adapted from Pat Walters' standardisation method
def standardize_smiles(smiles):
    """Return a standardised canonical SMILES for ``smiles``, or ``pd.NA`` on failure.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described **excellently** (by Greg) in
    https://www.youtube.com/watch?v=eWTApNX8dJQ

    The aim is to represent molecules from different sources in a (single)
    standard way, for use in ML, catalogues, etc. No attempt is made at
    reionization after neutralisation, nor at ionization at some pH
    (rdkit has no pKa calculator).

    Parameters
    ----------
    smiles : str
        Input SMILES. Anything that is not a string (e.g. a missing cell that
        arrives as NaN, None or pd.NA) yields ``pd.NA``.

    Returns
    -------
    str or pd.NA
        SMILES of the cleaned, parent-fragment, uncharged, canonical-tautomer
        molecule, or ``pd.NA`` if the input cannot be parsed or any
        standardisation step raises.
    """
    # Missing cells are not text, so there is nothing to parse.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if mol is None:
            return pd.NA

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        te = rdMolStandardize.TautomerEnumerator()  # idem
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)
    except Exception:
        # Best-effort: any failing step maps to a missing value. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt /
        # SystemExit through, so a long apply can still be interrupted.
        return pd.NA

In [8]:

# Standardise every raw SMILES into a new column. The work runs inside
# BlockLogs(), presumably to keep RDKit log messages out of the cell output.
with BlockLogs():
    df["CANONICAL_SMILES"] = df["Smiles"].progress_apply(standardize_smiles)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [00:01<00:00, 37.82it/s]


In [9]:
def smi_to_inchikey(smiles):
    """Return the InChIKey for a SMILES string, or ``pd.NA`` on failure.

    Parameters
    ----------
    smiles : str
        Input SMILES. Anything that is not a string (e.g. ``pd.NA`` left by a
        failed standardisation, None or NaN) yields ``pd.NA``.

    Returns
    -------
    str or pd.NA
        The value of ``Chem.MolToInchiKey`` for the parsed molecule, or
        ``pd.NA`` if the SMILES cannot be parsed or the conversion raises.
    """
    # Missing values are not text, so there is nothing to parse.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if mol is None:
            return pd.NA
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Best-effort: conversion failures map to a missing value. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt /
        # SystemExit through.
        return pd.NA

In [10]:
# Derive an InChIKey for each standardised SMILES. The work runs inside
# BlockLogs(), presumably to keep RDKit log messages out of the cell output.
with BlockLogs():
    df["INCHIKEY"] = df["CANONICAL_SMILES"].progress_apply(smi_to_inchikey)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [00:00<00:00, 3008.34it/s]


In [11]:
# not much more to do really: tag every row with constant metadata columns
# (the same value is broadcast to all rows) before the frame is written out.
df["dataset"] = "CHEMBL1142817"
df["data_type"] = "assay" # assumption: these records are treated as assay data
df["action_type"] = "substrate"

In [12]:
# Write the annotated table. Create the output folder first: to_csv does not
# create missing directories, so a fresh checkout without "processed/" would
# otherwise fail here.
output_path = Path("processed/CHEMBL1142817.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)