In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from tqdm import tqdm

In [2]:
# Register tqdm with pandas so that `progress_apply` (used in later cells)
# is available on Series/DataFrame objects.
tqdm.pandas()


In [3]:
# Tab-separated export holding the raw compound list (name + SMILES).
# NOTE(review): the rendered table shows an `Unnamed: 0` column, presumably a
# saved index; consider `index_col=0` if it should not reach the output CSV.
raw_table_path = "./Flockheart/2704859600940388722.txt"
df = pd.read_csv(raw_table_path, sep="\t")

In [4]:
# Display the freshly loaded table (bare last expression -> rich display).
df

Unnamed: 0,common_name,SMILES
0,adagrasib,CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=C...
1,amlodipine,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...
2,aprepitant,C[C@H](C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)O[C@@H]...
3,atomoxetine,CC1=CC=CC=C1O[C@H](CCNC)C2=CC=CC=C2
4,boceprevir,CC1([C@@H]2[C@H]1[C@H](N(C2)C(=O)[C@H](C(C)(C)...
5,ceritinib,CC1=CC(=C(C=C1C2CCNCC2)OC(C)C)NC3=NC=C(C(=N3)N...
6,chloramphenicol,C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N...
7,cimetidine,CC1=C(N=CN1)CSCCNC(=NC)NC#N
8,ciprofloxacin,C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)N4CCNCC4)F)C(=O)O
9,clarithromycin,CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]...


In [5]:
# standardisation of SMILES
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs
import mols2grid

In [6]:

# Adapted from Pat Walters' standardisation method
def standardize_smiles(smiles):
    """Return a standardised canonical SMILES for ``smiles``, or ``pd.NA`` on failure.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described **excellently** (by Greg) in
    https://www.youtube.com/watch?v=eWTApNX8dJQ

    The aim is to represent all molecules from different sources in a
    (single) standard way, for use in ML, catalogues, etc. No attempt is made
    at reionization after neutralising, nor at ionization at some pH
    (rdkit has no pKa calculator).

    Parameters
    ----------
    smiles : str
        Input SMILES string. Any non-string value (``None``, ``NaN``,
        ``pd.NA``, ...) is treated as missing.

    Returns
    -------
    str or pd.NA
        The canonical SMILES of the cleaned, neutralised, tautomer-canonical
        parent molecule, or ``pd.NA`` if the input is missing, cannot be
        parsed, or any standardisation step raises.
    """
    # Missing values can never be parsed; answer directly instead of relying
    # on RDKit raising for a non-string argument.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if mol is None:
            return pd.NA

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        te = rdMolStandardize.TautomerEnumerator()  # idem
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)
    except Exception:
        # Best-effort: one bad structure must not abort the whole column.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        return pd.NA

In [7]:
def smi_to_inchikey(smiles):
    """Return the InChIKey for a SMILES string, or ``pd.NA`` when unavailable.

    Parameters
    ----------
    smiles : str
        Input SMILES string. Any non-string value (``None``, ``NaN``,
        ``pd.NA``, ...) is treated as missing.

    Returns
    -------
    str or pd.NA
        The InChIKey, or ``pd.NA`` if the input is missing, cannot be parsed,
        RDKit raises, or no key is produced.
    """
    # Failed standardisations arrive here as pd.NA; answer directly instead
    # of relying on RDKit raising for a non-string argument.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if mol is None:
            return pd.NA
        inchikey = Chem.MolToInchiKey(mol)
    except Exception:
        # `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit
        # propagating while still tolerating per-molecule failures.
        return pd.NA

    # Normalise an empty/None key to the same missing marker used everywhere
    # else, so the column has a single representation of "no key".
    return inchikey if inchikey else pd.NA

In [8]:

# Standardise every raw SMILES with a progress bar; BlockLogs is held for the
# duration of the block to keep RDKit's logging out of the cell output.
with BlockLogs():
    raw_smiles = df["SMILES"]
    df["CANONICAL_SMILES"] = raw_smiles.progress_apply(standardize_smiles)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 42.40it/s]


In [9]:
# Derive an InChIKey from each standardised SMILES (rows whose standardisation
# failed carry pd.NA and are passed through to smi_to_inchikey as-is).
with BlockLogs():
    canonical_smiles = df["CANONICAL_SMILES"]
    df["INCHIKEY"] = canonical_smiles.progress_apply(smi_to_inchikey)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 2910.69it/s]


In [10]:
# Tag every row with its provenance and activity labels (constant columns).
df = df.assign(dataset="flockheart", action_type="inhibitor", active=True)

# Preview only: `drop` returns a new frame and the result is not assigned, so
# `df` itself still contains the SMILES column afterwards.
# NOTE(review): if the saved CSV is meant to omit SMILES, assign this result.
df.drop(columns=["SMILES"])

Unnamed: 0,common_name,CANONICAL_SMILES,INCHIKEY,dataset,action_type,active
0,adagrasib,C=C(F)C(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN...,PEMUGDMSUDYLHU-ZEQRLZLVSA-N,flockheart,inhibitor,True
1,amlodipine,CCOC(=O)C1=C(COCCN)N=C(C)C(C(=O)OC)C1c1ccccc1Cl,YMDXSGBNCBQYGC-UHFFFAOYSA-N,flockheart,inhibitor,True
2,aprepitant,C[C@@H](O[C@H]1OCCN(Cc2n[nH]c(=O)[nH]2)[C@H]1c...,ATALOFNDEOCMKK-OITMNORJSA-N,flockheart,inhibitor,True
3,atomoxetine,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,VHGCDTVCOLNTBX-QGZVFWFLSA-N,flockheart,inhibitor,True
4,boceprevir,CC(C)(C)NC(=O)NC(C(=O)N1C[C@H]2C(C1C(=O)NC(CC1...,LHHCSNFAOIFYRV-CCLIWJKGSA-N,flockheart,inhibitor,True
5,ceritinib,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c...,VERWOWGGCGHDQE-UHFFFAOYSA-N,flockheart,inhibitor,True
6,chloramphenicol,O=C(N[C@H](CO)[C@H](O)c1ccc([N+](=O)[O-])cc1)C...,WIIZWVCIJKGZOK-RKDXNWHRSA-N,flockheart,inhibitor,True
7,cimetidine,CN=C(NC#N)NCCSCc1[nH]cnc1C,AQIXAKUUQRKLND-UHFFFAOYSA-N,flockheart,inhibitor,True
8,ciprofloxacin,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,MYSWGUAQZAJSOK-UHFFFAOYSA-N,flockheart,inhibitor,True
9,clarithromycin,CC[C@H]1OC(=O)C(C)[C@@H](O[C@H]2C[C@@](C)(OC)[...,AGOYDEPGAOXOCK-WLLSWQJKSA-N,flockheart,inhibitor,True


In [11]:
# Write the annotated table; create the output directory first so the cell
# also succeeds on a fresh checkout where `processed/` does not exist yet.
output_path = Path("processed/flockheart.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)