In [2]:
import chembl_downloader
import pandas as pd
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from tqdm import tqdm

In [13]:
def standardize(mol):
    # follows the steps in
    # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    # as described **excellently** (by Greg) in
    # https://www.youtube.com/watch?v=eWTApNX8dJQ
     
    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol) 
     
    # if many fragments, get the "parent" (the actual mol we are interested in) 
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
         
    # try to neutralize molecule
    uncharger = rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
     
    # note that no attempt is made at reionization at this step
    # nor at ionization at some pH (rdkit has no pKa caculator)
    # the main aim to to represent all molecules from different sources
    # in a (single) standard way, for use in ML, catalogue, etc.
     
    te = rdMolStandardize.TautomerEnumerator() # idem
    taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)
     
    return taut_uncharged_parent_clean_mol

def standardize_and_catch(mol):
    try:
        std_mol = standardize(mol)
    except:
        std_mol = None
    return std_mol

def add_analysis_columns(df,smiles_col,year_col,name_col):
    # add an RDKit molecule column
    df['mol'] = df[smiles_col].progress_apply(Chem.MolFromSmiles)
    # standardize the molecule
    df['std_mol'] = df.mol.progress_apply(standardize_and_catch)
    # generate a standard smiles, useful for duplicate removal
    df['std_smiles'] = df.std_mol.progress_apply(Chem.MolToSmiles)
    # add a fingerprint column
    df['fp'] = df.std_mol.progress_apply(uru.mol2morgan_fp)
    # add a year with a standard name
    df['year'] = df[year_col]
    # clean the name and create a column
    df['drug_name'] = [x.strip().upper() for x in df[name_col]]

In [14]:
tqdm.pandas()

In [15]:
chembl_drug_query = """select cs.canonical_smiles, md.*
       from molecule_dictionary md
         join compound_structures cs on md.molregno = cs.molregno
         where max_phase == 4"""

In [16]:
chembl_drug_df = chembl_downloader.query(chembl_drug_query)

In [17]:
add_analysis_columns(chembl_drug_df,"canonical_smiles","first_approval","pref_name")

100%|████████████████████████████████████████████████████████████| 3447/3447 [00:00<00:00, 5378.05it/s]
  1%|▉                                                              | 49/3447 [00:00<00:32, 103.39it/s][15:37:47] Can't kekulize mol.  Unkekulized atoms: 4 9
  4%|██▎                                                           | 127/3447 [00:01<00:26, 126.25it/s][15:37:48] Can't kekulize mol.  Unkekulized atoms: 3 7
 13%|████████▏                                                     | 455/3447 [00:03<00:18, 164.35it/s][15:37:52] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
 15%|█████████▏                                                     | 503/3447 [00:05<01:10, 41.61it/s][15:37:53] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
 17%|██████████▋                                                    | 588/3447 [00:06<00:35, 79.52it/s][15:37:55] Tautomer enumeration stopped at 1000 tautomers: max tautomers reached
[15:37:56] Tautomer enumerat

ArgumentError: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)

In [13]:
paper_df = pd.read_csv("data/41570_2022_451_MOESM2_ESM.csv")

In [14]:
paper_df

Unnamed: 0,NAME,APPROVAL_YEAR,DRUG_UID,SMILES,MW,N_HBA,N_HBD,LOGP,RO5_violations,FRACTION_CSP3,N_HEAVY,N_ROT_BONDS,TPSA,LOGD_74
0,Daridorexant,2022,DB15031,COc1ccc(-n2nccn2)c(C(=O)N2CCCC2(C)c2nc3c(C)c(C...,450.921,8,1,3.602,0,0.304,32,4,88.93,3.530
1,Nirmatrelvir,2022,DB16691,CC(C)(C)C(NC(=O)C(F)(F)F)C(=O)N1CC2C(C1C(=O)NC...,499.526,9,3,3.173,0,0.783,35,9,131.40,3.130
2,Pacritinib,2022,DB11697,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc...,472.579,7,1,4.435,0,0.357,35,4,68.74,2.912
3,Abrocitinib,2022,DB14973,CCCS(=O)(=O)NC1CC(N(C)c2ncnc3[nH]ccc23)C1,323.414,7,2,1.825,0,0.571,22,6,99.36,1.816
4,Mitapivat,2022,DB16236,O=C(c1ccc(NS(=O)(=O)c2cccc3cccnc23)cc1)N1CCN(C...,450.553,7,1,2.337,0,0.333,32,6,90.99,2.164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,Sarecycline,2018,DB12035,CON(C)Cc1ccc(O)c2c1CC1CC3C(N(C)C)C(O)=C(C(N)=O...,487.502,11,6,1.154,2,0.458,35,5,173.86,-1.757
113,Tafenoquine,2018,DB06608,COc1cc(C)c2c(Oc3cccc(C(F)(F)F)c3)c(OC)cc(NC(C)...,463.493,6,3,4.840,0,0.375,33,10,78.63,2.054
114,Benznidazole,2018,DB11989,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1,260.249,7,1,0.985,0,0.167,19,5,92.74,0.985
115,Lusutrombopag,2018,DB13125,CCCCCCOC(C)c1cccc(-c2csc(NC(=O)c3cc(Cl)c(/C=C(...,591.546,7,2,6.695,2,0.345,39,13,125.99,3.496
