## Remove entries in DrugBank that are similar to CYP450 dataset

In [18]:
import sys
sys.path.append("..")
import itertools

import pandas as pd
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from tqdm import tqdm

### Load datasets

In [5]:
cyp_data_path = "../data/fromraw_cid_inchi_smiles_fp_labels_onehots.csv"
cyp_df = pd.read_csv(cyp_data_path)

In [7]:
drugbank_data_path = "../data/DrugBank_smiles_fp.csv"
drugbank_df = pd.read_csv(drugbank_data_path)

In [8]:
cyp_df.head()

Unnamed: 0.1,Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_CID,InChI,canonical_SMILES,isomeric_SMILES,ECFP,Label,1a2,3a4,2c9,2c19,2d6,onehot_label
0,0,1,6602638,InChI=1S/C16H24ClN3O.ClH/c1-3-5-16(21)18-13-6-...,CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl,CCCC(=O)NC1=CC(=C(C=C1)N2CCN(CC2)CC)Cl.Cl,0000000000000010000000000000000000100000010000...,00_0_,0,0,_,0,_,
1,1,2,644510,InChI=1S/C23H23FN6O4/c24-18-4-2-15(3-5-18)11-3...,O=c1[nH]c2cc3c(cc2cc1CN(CCCO)Cc1nnnn1Cc1ccc(F)...,C1OC2=C(O1)C=C3C(=C2)C=C(C(=O)N3)CN(CCCO)CC4=N...,0000000000000000000000000001000000000000000000...,11___,1,1,_,_,_,
2,2,3,1960010,InChI=1S/C17H18N2O3S2/c1-3-18-12-8-5-4-7-11(12...,CCN1C(=O)C(=C2SC(=S)N(CCCOC)C2=O)c2ccccc21,CCN1C2=CC=CC=C2/C(=C/3\C(=O)N(C(=S)S3)CCCOC)/C1=O,0000100000000100000000000000000000100000000000...,1_1_0,1,_,1,_,0,
3,3,4,644675,"InChI=1S/C13H9NO5S3/c1-8(15)14(22(17,18)12-3-2...",CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1,CC(=O)N(C1=CC2=C(C=C1)OC(=O)S2)S(=O)(=O)C3=CC=CS3,0000000000000010000000000000000000000000000000...,11_10,1,1,_,1,0,
4,4,5,644851,InChI=1S/C14H10ClN3/c15-12-9-5-4-8-11(12)14-16...,Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1,C1=CC=C(C=C1)C2=NNC(=N2)C3=CC=CC=C3Cl,0100000000000000000000000000000000000000000000...,1_11_,1,_,1,1,_,


In [9]:
drugbank_df.head()

Unnamed: 0.1,Unnamed: 0,DrugBank ID,SMILES,ECFP
0,0,DB07361,ClC1=CC=CC(NC(=O)NC2=NC=C(CCNC3=NC=NC4=C3SC=C4...,0000000000000010000000000000000000000000000001...
1,1,DB13157,[Na+].CCCCCCCCCCCCOC(=O)CS([O-])(=O)=O,0000000000000100000000000000000000001000000000...
2,2,DB05577,OC(=O)CNC(=O)C1=C(O)C2=CC=CC=C2C(Cl)=N1,0000000000000000000000000000000000000000000000...
3,3,DB05667,CC(=O)N1CCN(CC1)C1=CC=C(OC[C@@H]2CO[C@](CN3C=C...,0000000000000000000000000000000000000100001000...
4,4,DB00104,[H][C@]1(NC(=O)[C@H](CCCCN)NC(=O)[C@@H](CC2=CN...,0100000000000000000000000000000010000000000000...


### Generate ECFPs

In [12]:
cyp_fps = list()
for smiles in cyp_df["isomeric_SMILES"]:
    mol = Chem.MolFromSmiles(smiles)
    fp = GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    cyp_fps.append(fp)

In [17]:
drugbank_fps = list()
for smiles in drugbank_df["SMILES"]:
    mol = Chem.MolFromSmiles(smiles)
    fp = GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    drugbank_fps.append(fp)

### Find the chemicals in DrugBank with a Tanimoto score higher than 0.85 to any chemicals in CYP450 dataset 

In [19]:
db_rm_indices = list()
for cyp_fp in tqdm(cyp_fps):
    for i, db_fp in enumerate(drugbank_fps):
        score = DataStructs.FingerprintSimilarity(cyp_fp, db_fp)
        if score > 0.85:
            db_rm_indices.append(i)

100%|████████████████████████████████████████████████████████████████████████████| 17121/17121 [08:14<00:00, 34.61it/s]


### Remove similar chemicals

In [22]:
db_rm_indices_set = set(db_rm_indices)

In [28]:
filtered_drugbank = drugbank_df.drop(db_rm_indices_set)

### Save the filtered dataset

In [None]:
filtered_drugbank.to_csv("../data/DrugBank_smiles_fp_fil.csv")