In [1]:
import pandas as pd
from rdkit import Chem
import seaborn as sns
from tqdm import tqdm
import numpy as np

# Source data: DrugBank CYP1A2 list (PDF)
https://s3-us-west-2.amazonaws.com/drugbank/cite_this/attachments/files/000/001/580/original/cyp1a2.pdf?1537393556

# Extended with the compounds from this DrugBank category and processed with PubChem
https://go.drugbank.com/categories/DBCAT002609

In [2]:
# Register tqdm's pandas integration so that `Series.progress_apply`
# (used in the processing cells below) is available and shows a progress bar.
tqdm.pandas()

In [3]:
# Load the two DrugBank inputs: the curated CYP1A2 table first, then the
# extended SMILES-only list.
df, df_extended = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./drugbank/drugbank_1a2.csv",
        "./drugbank/drugbank_extended_smiles.csv",
    )
)

In [4]:
# Display the frame loaded from drugbank_1a2.csv for a visual sanity check.
df

Unnamed: 0,SMILES,active,common_name,action_type
0,CC(=O)CC(c1ccc([N+](=O)[O-])cc1)c1c(O)c2ccccc2...,True,Acenocoumarol,substrate
1,Cn1c(=O)c2[nH]cnc2n(C)c1=O.Cn1c(=O)c2[nH]cnc2n...,True,Aminophylline,substrate
2,CC(C)NCC(O)COc1ccc(CCOCC2CC2)cc1,True,Betaxolol,substrate
3,Cn1c(=O)c2c(ncn2C)n(C)c1=O,True,Caffeine,substrate
4,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,True,Clomipramine,substrate
5,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,True,Clozapine,substrate
6,CN(C)CCC=C1c2ccccc2C=Cc2ccccc21,True,Cyclobenzaprine,substrate
7,CN(C)/N=N/c1[nH]cnc1C(N)=O,True,Dacarbazine,substrate
8,CN(C)CCC=C1c2ccccc2COc2ccccc21,True,Doxepin,substrate
9,CNCC[C@H](Oc1cccc2ccccc12)c1cccs1,True,Duloxetine,substrate


In [5]:
# Display the frame loaded from drugbank_extended_smiles.csv (SMILES only at this point).
df_extended

Unnamed: 0,SMILES
0,B([C@H](CC(C)C)NC(=O)CNC(=O)C1=C(C=CC(=C1)Cl)C...
1,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...
2,C#CC1=CC(=CC=C1)NC2=NC=NC3=CC4=C(C=C32)OCCOCCO...
3,C#CCN[C@@H]1CCC2=CC=CC=C12
4,C(CCl)NC(=O)N(CCCl)N=O
...,...
212,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...
213,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O)CCC4...
214,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O)CCC4...
215,C[N+]1(CCC(C1)OC(=O)C(C2CCCC2)(C3=CC=CC=C3)O)C


In [6]:
from rdkit.rdBase import BlockLogs
from utils.utils import get_common_name

# Look up a common name for every SMILES in the extended set and store it in
# a new "common_name" column. RDKit log output is blocked while this runs.
# NOTE(review): the recorded run took over 3 minutes for 217 rows, so
# get_common_name is presumably a remote lookup — confirm, and consider caching.
with BlockLogs():
    extended_names = df_extended["SMILES"].progress_apply(get_common_name)
    df_extended["common_name"] = extended_names


100%|██████████| 217/217 [03:18<00:00,  1.09it/s]


In [7]:
# Stack the curated table and the extended set into a single frame.
# Both inputs carry their own 0..n row labels, so ignore_index=True is used to
# give the combined frame a fresh, unique index; without it the labels repeat
# and any later label-based selection or alignment becomes ambiguous.
# Columns that exist in only one input (e.g. "active" and "action_type") are
# filled with NaN for rows coming from the other input.
df_combined = pd.concat([df, df_extended], axis=0, ignore_index=True)

In [8]:
from rdkit.rdBase import BlockLogs
from utils.utils import standardize_smiles

# Run every input SMILES through standardize_smiles and keep the result in a
# new "CANONICAL_SMILES" column. RDKit log output is blocked while this runs.
with BlockLogs():
    canonical_smiles = df_combined["SMILES"].progress_apply(standardize_smiles)
    df_combined["CANONICAL_SMILES"] = canonical_smiles

100%|██████████| 244/244 [00:01<00:00, 184.92it/s]


In [9]:
from rdkit.rdBase import BlockLogs
from utils.utils import smi_to_inchikey

# Derive an InChIKey for each standardized SMILES and store it in a new
# "INCHIKEY" column. RDKit log output is blocked while this runs.
with BlockLogs():
    inchikeys = df_combined["CANONICAL_SMILES"].progress_apply(smi_to_inchikey)
    df_combined["INCHIKEY"] = inchikeys

100%|██████████| 244/244 [00:00<00:00, 5681.82it/s]


In [10]:
import os

# Tag every row with its provenance and label before exporting.
# NOTE(review): these assignments overwrite whatever "action_type" / "active"
# values came from drugbank_1a2.csv and fill the rows from the extended set,
# which had no such columns. Confirm that every compound in both inputs is
# really an active substrate.
df_combined["dataset"] = "drugbank"
df_combined["action_type"] = "substrate"
df_combined["active"] = True

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise a fresh checkout fails at this cell.
os.makedirs("processed", exist_ok=True)
df_combined.to_csv("processed/drugbank.csv", index=False)