# Extract SMILES from ChEMBL24 all compounds dataset and create ECFPs for the entries

In [2]:
import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

### Read data into Pandas dataframe

In [3]:
path = "../data/ChEMBL24_all_compounds.csv.gz"
data = pd.read_csv(path)

  interactivity=interactivity, compiler=compiler, result=result)


### Print basic information

In [5]:
data.shape

(1739196, 20)

In [7]:
data.columns

Index(['ChEMBL_ID', ' Molregno', ' InChIKey', ' SMILES', ' MolWeight',
       ' AlogP', ' HBAcc', ' HBDonor', ' PSArea', ' RotaBonds', ' Ro3Pass',
       ' Ro5Viol', ' aPKA', ' bPKA', ' logP', ' logD', ' MolType', ' Aromatic',
       ' Hatoms', ' QED'],
      dtype='object')

In [4]:
data.head()

Unnamed: 0,ChEMBL_ID,Molregno,InChIKey,SMILES,MolWeight,AlogP,HBAcc,HBDonor,PSArea,RotaBonds,Ro3Pass,Ro5Viol,aPKA,bPKA,logP,logD,MolType,Aromatic,Hatoms,QED
0,CHEMBL6329,1,OWRSAHYFSSNENM-UHFFFAOYSA-N,Cc1cc(ccc1C(=O)c2ccccc2Cl)N3N=CC(=O)NC3=O,341.75,2.11,5,1,84.82,3,N,0,6.44,,3.19,2.22,ACID,3,24,0.74
1,CHEMBL6328,2,ZJYUMURGSZQFMH-UHFFFAOYSA-N,Cc1cc(ccc1C(=O)c2ccc(cc2)C#N)N3N=CC(=O)NC3=O,332.32,1.33,6,1,108.61,3,N,0,6.44,,2.64,1.68,ACID,3,25,0.73
2,CHEMBL265667,3,YOMWDCALSDWFSV-UHFFFAOYSA-N,Cc1cc(cc(C)c1C(O)c2ccc(Cl)cc2)N3N=CC(=O)NC3=O,357.8,2.27,5,2,87.98,3,N,0,6.59,0.0,3.04,2.2,NEUTRAL,3,25,0.75
3,CHEMBL6362,4,PSOPUAQFGCRDIP-UHFFFAOYSA-N,Cc1ccc(cc1)C(=O)c2ccc(cc2)N3N=CC(=O)NC3=O,307.31,1.46,5,1,84.82,3,N,0,6.12,,2.98,1.76,ACID,3,23,0.74
4,CHEMBL267864,5,KEZNSCMBVRNOHO-UHFFFAOYSA-N,Cc1cc(ccc1C(=O)c2ccc(Cl)cc2)N3N=CC(=O)NC3=O,341.75,2.11,5,1,84.82,3,N,0,6.44,,3.97,3.01,ACID,3,24,0.74


### Extract SMILES from the raw data and convert to ECFP

In [8]:
def smiles2ecfp(smiles, radius=4, bits=2048):
    mol = MolFromSmiles(smiles)
    if mol is None:
        return ""
    fp = GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
    return "".join(map(str, list(fp)))

In [11]:
extracted_data = data[["ChEMBL_ID", " SMILES"]]
extracted_data["ECFP"] = extracted_data[" SMILES"].map(smiles2ecfp)
extracted_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,ChEMBL_ID,SMILES,ECFP
0,CHEMBL6329,Cc1cc(ccc1C(=O)c2ccccc2Cl)N3N=CC(=O)NC3=O,0100000001000000000000000000000100000000000100...
1,CHEMBL6328,Cc1cc(ccc1C(=O)c2ccc(cc2)C#N)N3N=CC(=O)NC3=O,0000000001000000000000000000000100100000000100...
2,CHEMBL265667,Cc1cc(cc(C)c1C(O)c2ccc(Cl)cc2)N3N=CC(=O)NC3=O,0100000000000000000000000000000000000000000100...
3,CHEMBL6362,Cc1ccc(cc1)C(=O)c2ccc(cc2)N3N=CC(=O)NC3=O,0000000001000000000000000000000000000000000100...
4,CHEMBL267864,Cc1cc(ccc1C(=O)c2ccc(Cl)cc2)N3N=CC(=O)NC3=O,0000000001000000000000000000000100000000000100...


In [12]:
extracted_data.loc[extracted_data["ECFP"]==""]

Unnamed: 0,ChEMBL_ID,SMILES,ECFP
1070237,CHEMBL1965222,Clp1(Cl)np(Cl)(Cl)np2(NNP(=O)(NN2)Oc3ccccc3)n1,
1092487,CHEMBL1989375,Clp1(Cl)np(Cl)(Cl)np2(NNP(=S)(NN2)Oc3ccccc3)n1,
1099106,CHEMBL1996554,N1c2ccccc2p3(c4ccccc14)c5ccccc5nc6ccccc36,
1108484,CHEMBL2006679,S=P1(NNp2(NN1)np(np(n2)(N3CC3)N4CC4)(N5CC5)N6...,
1114774,CHEMBL2022084,FC1(F)COp2(OCC1(F)F)np(Cl)(Cl)np(Cl)(Cl)n2,
1114776,CHEMBL2022086,CC1(C)COp2(OC1)np(Cl)(Cl)np(Cl)(Cl)n2,
1114777,CHEMBL2022087,Clp1(Cl)np(Nc2ccccc2)(Nc3ccccc3)np4(NCCCN4CCC...,
1114778,CHEMBL2022088,Clp1(Cl)np(Sc2ccccc2)(Sc3ccccc3)np4(NCCCN4CCC...,
1115936,CHEMBL2023245,FC1(F)COp2(OCC1(F)F)np(Cl)(Cl)np3(NCCCN3CCCCN...,
1115937,CHEMBL2023246,FC1(F)COp2(Cl)np(Cl)(OCC1(F)F)np3(NCCCN3CCCCN...,


### Drop the data that the SMILES strings cannot be loaded by RDKit

In [13]:
extracted_data.drop(extracted_data[extracted_data["ECFP"]==""].index, inplace=True)
extracted_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(1739165, 3)

### Write data out

In [14]:
extracted_data.to_csv("../data/ChEMBL24_smiles_fp.csv")