In [24]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from tqdm import tqdm

In [25]:
# Register tqdm's pandas integration so Series/DataFrame objects gain
# `progress_apply` (an `apply` that shows a progress bar).
tqdm.pandas()

In [8]:
def read_2col(f):
    """Parse a whitespace-delimited "name SMILES" text file into a dict.

    Every non-blank line is split on whitespace:

    * one token        -> the name is stored with a value of ``None``
      (no structure available);
    * two or more      -> the last token is the SMILES and all preceding
      tokens, joined with single spaces, form the name.

    Multi-word names are therefore kept whole instead of being truncated to
    their first word, and lines with more than three tokens are no longer
    dropped. Blank lines are skipped. If the same name appears more than
    once, the last occurrence wins.

    Parameters
    ----------
    f : str or path-like
        Path of the text file to read.

    Returns
    -------
    dict
        Mapping of name -> SMILES string (or ``None``), in file order.
    """
    nmap = {}
    with open(f, "r") as r:
        # Iterate the handle directly; no need to materialise all lines first.
        for line in r:
            toks = line.split()
            if not toks:
                continue
            if len(toks) == 1:
                nmap[toks[0]] = None
            else:
                # The structure is always the final token; everything before
                # it belongs to the (possibly multi-word) name.
                nmap[" ".join(toks[:-1])] = toks[-1]
    return nmap
                

In [10]:
# Load the DrugBank export into a name -> SMILES dict.
dct = read_2col("./drugbank_DBCAT002646/names_+_smiles.txt")

In [16]:
# Split the mapping into two parallel lists (same order): names and SMILES.
names, vals = [*dct], [*dct.values()]

In [17]:
# Column-oriented payload for the DataFrame: one list per column.
data = dict(common_name=names, SMILES=vals)

In [19]:
# Build the working table with columns "common_name" and "SMILES".
df = pd.DataFrame(data)

In [20]:
# Inspect the raw name / SMILES table.
df

Unnamed: 0,common_name,SMILES
0,Indinavir,CC(C)(C)NC(=O)[C@@H]1CN(CCN1C[C@H](C[C@@H](CC2...
1,Lovastatin,CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([...
2,Nevirapine,CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4
3,Diltiazem,CC(=O)O[C@@H]1[C@@H](SC2=CC=CC=C2N(C1=O)CCN(C)...
4,Amlodipine,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...
...,...,...
821,Sofpironium,CCOC(=O)C[N+]1(CC[C@H](C1)OC(=O)[C@@](C2CCCC2)...
822,Seladelpar,CCO[C@H](COC1=CC=C(C=C1)C(F)(F)F)CSC2=CC(=C(C=...
823,Vorasidenib,C[C@H](C(F)(F)F)NC1=NC(=NC(=N1)C2=NC(=CC=C2)Cl...
824,Lazertinib,CN(C)CC1=CN(N=C1C2=CC=CC=C2)C3=NC(=NC=C3)NC4=C...


In [23]:
# standardisation of SMILES
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs
import mols2grid

In [26]:

# from Pat Walters method
def standardize_smiles(smiles):
    """Return a standardized SMILES for ``smiles``, or ``pd.NA`` on failure.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described **excellently** (by Greg) in
    https://www.youtube.com/watch?v=eWTApNX8dJQ

    Pipeline: parse -> ``Cleanup`` -> ``FragmentParent`` -> ``Uncharger`` ->
    canonical tautomer -> ``MolToSmiles``.

    No attempt is made at reionization after uncharging, nor at ionization
    at some pH (RDKit has no pKa calculator). The aim is to represent all
    molecules from different sources in a (single) standard way, for use in
    ML, catalogues, etc.

    Parameters
    ----------
    smiles : str
        Input SMILES. Non-string values (``None``, ``NaN``, ``pd.NA``) are
        treated as missing.

    Returns
    -------
    str or pandas.NA
        The standardized SMILES, or ``pd.NA`` when the input is missing,
        cannot be parsed, or any standardization step raises.
    """
    # Missing entries cannot be parsed; answer explicitly instead of relying
    # on RDKit raising for a non-string argument.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparseable string by returning None.
        if mol is None:
            return pd.NA

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        te = rdMolStandardize.TautomerEnumerator()  # idem
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)
    except Exception:
        # Best-effort: a molecule that fails standardization becomes pd.NA.
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still stop a long-running apply.
        return pd.NA

In [31]:
def smi_to_inchikey(smiles):
    """Return the InChIKey for ``smiles``, or ``pd.NA`` when it cannot be computed.

    Parameters
    ----------
    smiles : str
        Input SMILES. Non-string values (``None``, ``NaN``, ``pd.NA``) are
        treated as missing.

    Returns
    -------
    str or pandas.NA
        Result of ``Chem.MolToInchiKey`` on the parsed molecule, or ``pd.NA``
        when the input is missing, cannot be parsed, or RDKit raises.
    """
    # Missing entries (e.g. rows whose standardization failed) stay missing.
    if not isinstance(smiles, str):
        return pd.NA

    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparseable string by returning None.
        if mol is None:
            return pd.NA
        return Chem.MolToInchiKey(mol)
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return pd.NA

In [32]:

with BlockLogs():
    # Standardize every input SMILES; pass the function directly to
    # progress_apply (no wrapper needed).
    canonical = df["SMILES"].progress_apply(standardize_smiles)
    df["CANONICAL_SMILES"] = canonical

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 826/826 [00:10<00:00, 82.00it/s]


In [33]:
# compute inchikeys
with BlockLogs():
    # Derive the InChIKey from the standardized SMILES of each row.
    inchikeys = df["CANONICAL_SMILES"].progress_apply(smi_to_inchikey)
    df["INCHIKEY"] = inchikeys

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 826/826 [00:00<00:00, 3605.06it/s]


In [34]:
# Inspect the table after adding CANONICAL_SMILES and INCHIKEY.
df

Unnamed: 0,common_name,SMILES,CANONICAL_SMILES,INCHIKEY
0,Indinavir,CC(C)(C)NC(=O)[C@@H]1CN(CCN1C[C@H](C[C@@H](CC2...,CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1C[C@@H](O)CC(...,CBVCZFGXHXORBI-IXVTTXLYSA-N
1,Lovastatin,CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([...,CCC(C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C)[C@...,PCZOHLXUXFIOCF-SGFUJYLFSA-N
2,Nevirapine,CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4,Cc1ccnc2c1NC(=O)c1cccnc1N2C1CC1,NQDJXKOVJZTUJA-UHFFFAOYSA-N
3,Diltiazem,CC(=O)O[C@@H]1[C@@H](SC2=CC=CC=C2N(C1=O)CCN(C)...,COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)...,HSUGRBWQSSZJOP-LBAQZLPGSA-N
4,Amlodipine,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...,CCOC(=O)C1=C(COCCN)N=C(C)C(C(=O)OC)C1c1ccccc1Cl,YMDXSGBNCBQYGC-UHFFFAOYSA-N
...,...,...,...,...
821,Sofpironium,CCOC(=O)C[N+]1(CC[C@H](C1)OC(=O)[C@@](C2CCCC2)...,CCOC(=O)C[N+]1(C)CC[C@@H](OC(=O)[C@](O)(c2cccc...,SEVCTUCCZYBJER-BSJAROSPSA-N
822,Seladelpar,CCO[C@H](COC1=CC=C(C=C1)C(F)(F)F)CSC2=CC(=C(C=...,CCO[C@H](COc1ccc(C(F)(F)F)cc1)CSc1ccc(OCC(=O)O...,JWHYSEDOYMYMNM-QGZVFWFLSA-N
823,Vorasidenib,C[C@H](C(F)(F)F)NC1=NC(=NC(=N1)C2=NC(=CC=C2)Cl...,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,QCZAWDGAVJMPTA-RNFRBKRXSA-N
824,Lazertinib,CN(C)CC1=CN(N=C1C2=CC=CC=C2)C3=NC(=NC=C3)NC4=C...,C=CC(=O)Nc1cc(Nc2nccc(-n3cc(CN(C)C)c(-c4ccccc4...,RRMJMHOQSALEJJ-UHFFFAOYSA-N


In [36]:
# Tag every row with its source dataset and its action type (constant columns).
df["dataset"], df["action_type"] = "drugbank", "substrate"

In [37]:
# Inspect the table with the dataset / action_type annotations.
df

Unnamed: 0,common_name,SMILES,CANONICAL_SMILES,INCHIKEY,dataset,action_type
0,Indinavir,CC(C)(C)NC(=O)[C@@H]1CN(CCN1C[C@H](C[C@@H](CC2...,CC(C)(C)NC(=O)C1CN(Cc2cccnc2)CCN1C[C@@H](O)CC(...,CBVCZFGXHXORBI-IXVTTXLYSA-N,drugbank,substrate
1,Lovastatin,CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([...,CCC(C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C)[C@...,PCZOHLXUXFIOCF-SGFUJYLFSA-N,drugbank,substrate
2,Nevirapine,CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4,Cc1ccnc2c1NC(=O)c1cccnc1N2C1CC1,NQDJXKOVJZTUJA-UHFFFAOYSA-N,drugbank,substrate
3,Diltiazem,CC(=O)O[C@@H]1[C@@H](SC2=CC=CC=C2N(C1=O)CCN(C)...,COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)...,HSUGRBWQSSZJOP-LBAQZLPGSA-N,drugbank,substrate
4,Amlodipine,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...,CCOC(=O)C1=C(COCCN)N=C(C)C(C(=O)OC)C1c1ccccc1Cl,YMDXSGBNCBQYGC-UHFFFAOYSA-N,drugbank,substrate
...,...,...,...,...,...,...
821,Sofpironium,CCOC(=O)C[N+]1(CC[C@H](C1)OC(=O)[C@@](C2CCCC2)...,CCOC(=O)C[N+]1(C)CC[C@@H](OC(=O)[C@](O)(c2cccc...,SEVCTUCCZYBJER-BSJAROSPSA-N,drugbank,substrate
822,Seladelpar,CCO[C@H](COC1=CC=C(C=C1)C(F)(F)F)CSC2=CC(=C(C=...,CCO[C@H](COc1ccc(C(F)(F)F)cc1)CSc1ccc(OCC(=O)O...,JWHYSEDOYMYMNM-QGZVFWFLSA-N,drugbank,substrate
823,Vorasidenib,C[C@H](C(F)(F)F)NC1=NC(=NC(=N1)C2=NC(=CC=C2)Cl...,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,QCZAWDGAVJMPTA-RNFRBKRXSA-N,drugbank,substrate
824,Lazertinib,CN(C)CC1=CN(N=C1C2=CC=CC=C2)C3=NC(=NC=C3)NC4=C...,C=CC(=O)Nc1cc(Nc2nccc(-n3cc(CN(C)C)c(-c4ccccc4...,RRMJMHOQSALEJJ-UHFFFAOYSA-N,drugbank,substrate


In [38]:
# Flag every row as active (constant boolean column).
df["active"] = True

In [39]:
# Persist the annotated table. Create the output folder first so the write
# does not fail on a fresh checkout where "processed/" does not exist yet.
out_path = Path("processed/drugbank_processed.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)