# Get ECFP from DrugBank dataset

In [20]:
import os

import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

### Read data

In [4]:
data_path = "../data/drugbank_smallmolecules.txt"

data = pd.read_csv(data_path, sep="\t")

### Print the basic information

In [11]:
data.columns

Index(['DrugBank ID', 'Small molecule', 'InChI Key', 'SMILES', 'Name'], dtype='object')

In [12]:
data.shape

(9089, 5)

In [24]:
data.head()

Unnamed: 0,DrugBank ID,Small molecule,InChI Key,SMILES,Name
0,DB07361,True,FAYAUAZLLLJJGH-UHFFFAOYSA-N,ClC1=CC=CC(NC(=O)NC2=NC=C(CCNC3=NC=NC4=C3SC=C4...,"1-(3-chlorophenyl)-3-{5-[2-(thieno[3,2-d]pyrim..."
1,DB13157,True,UAJTZZNRJCKXJN-UHFFFAOYSA-M,[Na+].CCCCCCCCCCCCOC(=O)CS([O-])(=O)=O,Sodium lauryl sulfoacetate
2,DB05577,True,OUQVKRKGTAUJQA-UHFFFAOYSA-N,OC(=O)CNC(=O)C1=C(O)C2=CC=CC=C2C(Cl)=N1,FG-2216
3,DB05667,True,XMAYWYJOQHXEEK-ZEQKJWHPSA-N,CC(=O)N1CCN(CC1)C1=CC=C(OC[C@@H]2CO[C@](CN3C=C...,Levoketoconazole
4,DB00104,True,DEQANNDTNATYII-OULOTJBUSA-N,[H][C@]1(NC(=O)[C@H](CCCCN)NC(=O)[C@@H](CC2=CN...,Octreotide


### Extract SMILES from the raw data and convert to ECFP

In [26]:
def smiles2ecfp(smiles, radius=4, bits=2048):
    mol = MolFromSmiles(smiles)
    if mol is None:
        return ""
    fp = GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
    return "".join(map(str, list(fp)))

In [27]:
extracted_data = data[["DrugBank ID", "SMILES"]]
extracted_data["ECFP"] = extracted_data["SMILES"].map(smiles2ecfp)
extracted_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,DrugBank ID,SMILES,ECFP
0,DB07361,ClC1=CC=CC(NC(=O)NC2=NC=C(CCNC3=NC=NC4=C3SC=C4...,0000000000000010000000000000000000000000000001...
1,DB13157,[Na+].CCCCCCCCCCCCOC(=O)CS([O-])(=O)=O,0000000000000100000000000000000000001000000000...
2,DB05577,OC(=O)CNC(=O)C1=C(O)C2=CC=CC=C2C(Cl)=N1,0000000000000000000000000000000000000000000000...
3,DB05667,CC(=O)N1CCN(CC1)C1=CC=C(OC[C@@H]2CO[C@](CN3C=C...,0000000000000000000000000000000000000100001000...
4,DB00104,[H][C@]1(NC(=O)[C@H](CCCCN)NC(=O)[C@@H](CC2=CN...,0100000000000000000000000000000010000000000000...


### Drop the data that the SMILES strings cannot be loaded by RDKit

In [37]:
extracted_data.drop(extracted_data[extracted_data["ECFP"]==""].index, inplace=True)
extracted_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(9072, 3)

### Write data out

In [38]:
extracted_data.to_csv("../data/DrugBank_smiles_fp.csv")

## The dataset has 90