In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# 1. read datasets

In [2]:
# Load the two raw activity exports from ./data.
# The ChEMBL file is semicolon-delimited; the BindingDB file is tab-delimited.
CheMBL_raw_Data = pd.read_csv('./data/PPAR alpha CheMBL.csv', delimiter=';')
BindingDB_raw_Data = pd.read_table('./data/PPAR alpha BingdingDB.tsv')

In [3]:
# Show the column names available in the ChEMBL export.
CheMBL_raw_Data.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type', 'Standard Text Value'],
 

# 2. Missing values and duplicates

## 2.1 CheMBL

In [4]:
# Keep only the identifier, structure and activity columns, renamed so that
# they match the column names used for the BindingDB table.
# NOTE(review): 'Standard Value' is relabelled as 'IC50 (nM)' without checking
# 'Standard Type' / 'Standard Units' - confirm the export only contains IC50 in nM.
chembl_column_names = {
    'Molecule ChEMBL ID': 'ID',
    'Standard Value': 'IC50 (nM)',
}
useful_CheMBL = (
    CheMBL_raw_Data[['Molecule ChEMBL ID', 'Smiles', 'Standard Value']]
    .rename(columns=chembl_column_names)
)

# Per-column count of missing values, then of distinct values.
chembl_missing_counts = useful_CheMBL.isna().sum()
chembl_distinct_counts = useful_CheMBL.nunique()
print('number of nan \n:', chembl_missing_counts, '\n')
print('unique \n:', chembl_distinct_counts)

number of nan 
: ID            0
Smiles        0
IC50 (nM)    55
dtype: int64 

unique 
: ID           1037
Smiles       1037
IC50 (nM)     653
dtype: int64


In [5]:
def canonical_smiles(dataframe):
    """Return a copy of ``dataframe`` with an added ``canonical_smiles`` column.

    Each entry of ``dataframe['Smiles']`` is parsed with RDKit and written back
    as canonical SMILES. When that string has several '.'-separated components
    (mixtures, salts), only the component with the longest SMILES string is
    kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Smiles' column.

    Returns
    -------
    pandas.DataFrame
        The input columns plus 'canonical_smiles', row-aligned with the input.
        Missing values and strings RDKit cannot parse yield an empty string ''.
    """
    cleaned = []
    for raw in dataframe['Smiles']:
        # MolFromSmiles returns None for an unparsable string; treat that the
        # same as a missing value instead of letting MolToSmiles raise.
        mol = None if pd.isna(raw) else Chem.MolFromSmiles(raw)
        if mol is None:
            cleaned.append('')
            continue
        # Longest component string is used as a proxy for the main compound.
        components = Chem.MolToSmiles(mol).split('.')
        cleaned.append(max(components, key=len))

    # Reuse the caller's index so concat pairs rows positionally even when the
    # frame was filtered beforehand and no longer has a 0..n-1 index.
    smiles_column = pd.Series(cleaned, index=dataframe.index, name='canonical_smiles')
    return pd.concat([dataframe, smiles_column], axis=1)

# Add the canonical_smiles column to the ChEMBL subset.
useful_CheMBL = canonical_smiles(useful_CheMBL)

In [6]:
# Drop rows without an activity value, then keep the first row seen for each
# canonical structure and renumber the rows from zero.
useful_CheMBL = (
    useful_CheMBL
    .dropna(subset='IC50 (nM)')
    .drop_duplicates(subset='canonical_smiles')
    .reset_index(drop=True)
)
useful_CheMBL

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,CHEMBL556073,Cc1c(Cc2cccc(OCC(=O)O)c2)c2cc(OC(F)(F)F)ccc2n1...,15000.0,Cc1c(Cc2cccc(OCC(=O)O)c2)c2cc(OC(F)(F)F)ccc2n1...
1,CHEMBL118756,CCCc1cc(C2CCCCC2)ccc1OCCCOc1cccc(C2OC(=O)NC2=O)c1,290.0,CCCc1cc(C2CCCCC2)ccc1OCCCOc1cccc(C2OC(=O)NC2=O)c1
2,CHEMBL300629,CCCc1cc(Oc2ccccc2)ccc1OCCCOc1ccc(C2SC(=O)NC2=O...,50000.0,CCCc1cc(Oc2ccccc2)ccc1OCCCOc1ccc(C2SC(=O)NC2=O...
3,CHEMBL209563,CCCc1c(OCCCN(C)c2ccc(CC(=O)O)cc2C)ccc2c(C(F)(F...,600.0,CCCc1c(OCCCN(C)c2ccc(CC(=O)O)cc2C)ccc2c(C(F)(F...
4,CHEMBL392950,O=C(O)COc1ccc(SCc2ccc(OCc3ccc(F)cc3C(F)(F)F)cc...,3481.0,O=C(O)COc1ccc(SCc2ccc(OCc3ccc(F)cc3C(F)(F)F)cc...
...,...,...,...,...
978,CHEMBL4575562,CCOc1ccc(-c2ccc(CCCc3nn(-c4ccc(S(C)(=O)=O)cc4)...,5040.0,CCOc1ccc(-c2ccc(CCCc3nn(-c4ccc(S(C)(=O)=O)cc4)...
979,CHEMBL3678128,Cc1c(C)n(Cc2ccc(Cl)c(O[C@@H](C)C(=O)O)c2)c2ccc...,935.0,Cc1c(C)n(Cc2ccc(Cl)c(O[C@@H](C)C(=O)O)c2)c2ccc...
980,CHEMBL510698,O=C(Nc1ccncc1)c1cc([N+](=O)[O-])ccc1Cl,2950.0,O=C(Nc1ccncc1)c1cc([N+](=O)[O-])ccc1Cl
981,CHEMBL5286386,CC(C)(O)c1ccc(NC(=O)c2cc(C(=O)NCc3ccc(F)c(F)c3...,16000.0,CC(C)(O)c1ccc(NC(=O)c2cc(C(=O)NCc3ccc(F)c(F)c3...


Lipinski's Rule of Five states that an orally active drug-like compound generally has:
* Molecular weight < 500 Dalton
* Octanol-water partition coefficient (LogP) ≤ 5
* Hydrogen bond donors ≤ 5
* Hydrogen bond acceptors ≤ 10

In [7]:
def Lipinski_descriptors(smiles):
    """Compute the four Lipinski descriptors for a sequence of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings (e.g. a DataFrame column).

    Returns
    -------
    pandas.DataFrame
        One row per input, in input order with a fresh 0..n-1 index, and the
        columns 'canonical_smiles' (the input string), 'Molecular Weight',
        'LogP', 'num_H_acceptors' and 'num_H_Donors'. Descriptor columns are
        numeric; entries that are not strings or that RDKit cannot parse get
        NaN in every descriptor column.
    """
    smiles_list = list(smiles)
    mols = [Chem.MolFromSmiles(x) if isinstance(x, str) else None for x in smiles_list]

    def per_molecule(descriptor):
        # NaN marks an unparsable entry instead of aborting the whole batch.
        return [descriptor(mol) if mol is not None else np.nan for mol in mols]

    # Build the frame column by column: stacking the SMILES strings and the
    # numbers into one array would coerce every descriptor to a string, which
    # breaks numeric comparisons such as `df['Molecular Weight'] < 500`.
    return pd.DataFrame({
        'canonical_smiles': smiles_list,
        'Molecular Weight': per_molecule(Descriptors.MolWt),
        'LogP': per_molecule(Descriptors.MolLogP),
        'num_H_acceptors': per_molecule(Lipinski.NumHAcceptors),
        'num_H_Donors': per_molecule(Lipinski.NumHDonors),
    })

# Compute the descriptor table for the de-duplicated ChEMBL structures and display it.
Lipinski_CheMBL = Lipinski_descriptors(useful_CheMBL['canonical_smiles'])
Lipinski_CheMBL

Unnamed: 0,canonical_smiles,Molecular Weight,LogP,num_H_acceptors,num_H_Donors
0,Cc1c(Cc2cccc(OCC(=O)O)c2)c2cc(OC(F)(F)F)ccc2n1...,517.887,6.24442,5,1
1,CCCc1cc(C2CCCCC2)ccc1OCCCOc1cccc(C2OC(=O)NC2=O)c1,451.563,5.8422,5,1
2,CCCc1cc(Oc2ccccc2)ccc1OCCCOc1ccc(C2SC(=O)NC2=O...,477.582,6.3034,6,1
3,CCCc1c(OCCCN(C)c2ccc(CC(=O)O)cc2C)ccc2c(C(F)(F...,464.484,5.63992,5,1
4,O=C(O)COc1ccc(SCc2ccc(OCc3ccc(F)cc3C(F)(F)F)cc...,506.517,6.6679,4,1
...,...,...,...,...,...
978,CCOc1ccc(-c2ccc(CCCc3nn(-c4ccc(S(C)(=O)=O)cc4)...,629.71,5.4339,8,1
979,Cc1c(C)n(Cc2ccc(Cl)c(O[C@@H](C)C(=O)O)c2)c2ccc...,561.122,7.60024,4,2
980,O=C(Nc1ccncc1)c1cc([N+](=O)[O-])ccc1Cl,277.667,2.8955,4,1
981,CC(C)(O)c1ccc(NC(=O)c2cc(C(=O)NCc3ccc(F)c(F)c3...,476.882,5.167,3,3


In [8]:
# Attach the descriptor columns to each compound by matching on canonical SMILES.
# Run this cell once per kernel session: merging again into the already-merged
# frame would duplicate the descriptor columns with _x/_y suffixes.
useful_CheMBL = useful_CheMBL.merge(Lipinski_CheMBL, on='canonical_smiles')

# Rule-of-five filter, currently disabled; uncomment to apply it.
# useful_CheMBL = useful_CheMBL[useful_CheMBL['Molecular Weight'] < 500]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['LogP'] < 5]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['num_H_Donors'] < 5]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['num_H_acceptors'] < 10].reset_index(drop=True)
# useful_CheMBL

In [9]:
# Write the cleaned ChEMBL table without the row index.
# `index` is documented as a bool, so pass False rather than None.
useful_CheMBL.to_csv('./data/CheMBL_clean.csv', index=False)

## 2.2 BindingDB

In [10]:
# Show the column names available in the BindingDB export.
BindingDB_raw_Data.columns

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)',
       'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI',
       'BindingDB Entry DOI', 'PMID', 'PubChem AID', 'Patent Number',
       'Authors', 'Institution', 'Link to Ligand in BindingDB',
       'Link to Target in BindingDB',
       'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB',
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID of Ligand',
       'PubChem SID of Ligand', 'ChEBI ID of Ligand', 'ChEMBL ID of Ligand',
       'DrugBank ID of Ligand', 'IUPHAR_GRAC ID of Ligand',
       'KEGG ID of Ligand', 'ZINC ID of Ligand',
       'Number of Protein Chains in Target (>1 implies a multichain complex)',
       'BindingDB Target Chain Sequ

In [11]:
# Keep only the identifier, structure and IC50 columns, renamed so that they
# match the column names used for the ChEMBL table.
bindingdb_column_names = {
    'BindingDB Reactant_set_id': 'ID',
    'Ligand SMILES': 'Smiles',
}
useful_BindingDB = (
    BindingDB_raw_Data[['BindingDB Reactant_set_id', 'Ligand SMILES', 'IC50 (nM)']]
    .rename(columns=bindingdb_column_names)
)

# Per-column count of missing values, then of distinct values.
bindingdb_missing_counts = useful_BindingDB.isna().sum()
bindingdb_distinct_counts = useful_BindingDB.nunique()
print('number of nan \n:', bindingdb_missing_counts, '\n')
print('unique \n:', bindingdb_distinct_counts)

number of nan 
: ID              0
Smiles          0
IC50 (nM)    4022
dtype: int64 

unique 
: ID           5310
Smiles       3515
IC50 (nM)     709
dtype: int64


In [12]:
# Add the canonical_smiles column to the BindingDB subset and display the result.
useful_BindingDB = canonical_smiles(useful_BindingDB)
useful_BindingDB

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,50952815,COc1ccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)C...,0.28,COc1ccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)C...
1,50952818,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,0.46,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...
2,50952816,COc1cccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)...,0.59,COc1cccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)...
3,50952817,COc1ccccc1NC(=O)N(CCCCC1CCCCC1)CCc1ccc(SC(C)(C...,1.9,COc1ccccc1NC(=O)N(CCCCC1CCCCC1)CCc1ccc(SC(C)(C...
4,50332806,CC[C@@](C)(Cc1ccc(OCCCOc2ccc(cc2Cl)C2CCCCC2)cc...,2,CC[C@@](C)(Cc1ccc(OCCCOc2ccc(C3CCCCC3)cc2Cl)cc...
...,...,...,...,...
5305,51319617,CCC(NC(C)=O)c1nc(no1)-c1ccc(Oc2ccc(OCc3ccccc3)...,,CCC(NC(C)=O)c1nc(-c2ccc(Oc3ccc(OCc4ccccc4)cc3)...
5306,51319621,CCCCCCCC\C=C/CCCCCCCC(=O)NCCO,,CCCCCCCC/C=C\CCCCCCCC(=O)NCCO
5307,51319622,CC(NC(=O)C(F)(F)F)c1nc(no1)-c1ccc(Oc2ccc(OCc3c...,,CC(NC(=O)C(F)(F)F)c1nc(-c2ccc(Oc3ccc(OCc4ccccc...
5308,51323320,COc1ccc(cc1)\N=N\c1ccc(OCCc2ccc(OC(C)(C)C(O)=O...,,COc1ccc(/N=N/c2ccc(OCCc3ccc(OC(C)(C)C(=O)O)cc3...


In [13]:
# Drop rows without an IC50 value, then keep the first row seen for each
# canonical structure and renumber the rows from zero.
useful_BindingDB = (
    useful_BindingDB
    .dropna(subset='IC50 (nM)')
    .drop_duplicates(subset='canonical_smiles')
    .reset_index(drop=True)
)
useful_BindingDB

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,50952815,COc1ccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)C...,0.28,COc1ccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)C...
1,50952818,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,0.46,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...
2,50952816,COc1cccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)...,0.59,COc1cccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)...
3,50952817,COc1ccccc1NC(=O)N(CCCCC1CCCCC1)CCc1ccc(SC(C)(C...,1.9,COc1ccccc1NC(=O)N(CCCCC1CCCCC1)CCc1ccc(SC(C)(C...
4,50332806,CC[C@@](C)(Cc1ccc(OCCCOc2ccc(cc2Cl)C2CCCCC2)cc...,2,CC[C@@](C)(Cc1ccc(OCCCOc2ccc(C3CCCCC3)cc2Cl)cc...
...,...,...,...,...
997,50386748,Cc1nc(sc1COc1ccc2n(CC(O)=O)ccc2c1)C(F)(F)F,550,Cc1nc(C(F)(F)F)sc1COc1ccc2c(ccn2CC(=O)O)c1
998,50386738,OC(=O)Cn1ccc2cc(OCc3ccc(Oc4ccc(cc4)C(F)(F)F)cc...,562,O=C(O)Cn1ccc2cc(OCc3ccc(Oc4ccc(C(F)(F)F)cc4)cc...
999,50938877,CO[C@@H](Cc1ccc(OCCCOc2ccc(cc2)-c2ccccc2)cc1)C...,567.54,CO[C@@H](Cc1ccc(OCCCOc2ccc(-c3ccccc3)cc2)cc1)C...
1000,51241764,CCOc1ccc(cc1CC(O)=O)-c1ccc(CCCc2nn(-c3ccc(cc3)...,569,CCOc1ccc(-c2ccc(CCCc3nn(-c4ccc(C(F)(F)F)cc4)c(...


In [14]:
# Compute the descriptor table for the de-duplicated BindingDB structures and display it.
Lipinski_BindingDB = Lipinski_descriptors(useful_BindingDB['canonical_smiles'])
Lipinski_BindingDB

Unnamed: 0,canonical_smiles,Molecular Weight,LogP,num_H_acceptors,num_H_Donors
0,COc1ccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)C...,526.743,7.4777,4,2
1,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,502.765,7.2793,3,2
2,COc1cccc(NC(=O)N(CCCCC2CCCCC2)CCc2ccc(SC(C)(C)...,526.743,7.4777,4,2
3,COc1ccccc1NC(=O)N(CCCCC1CCCCC1)CCc1ccc(SC(C)(C...,526.743,7.4777,4,2
4,CC[C@@](C)(Cc1ccc(OCCCOc2ccc(C3CCCCC3)cc2Cl)cc...,459.026,7.279,3,1
...,...,...,...,...,...
997,Cc1nc(C(F)(F)F)sc1COc1ccc2c(ccn2CC(=O)O)c1,370.352,4.08862,5,1
998,O=C(O)Cn1ccc2cc(OCc3ccc(Oc4ccc(C(F)(F)F)cc4)cc...,441.405,6.116,4,1
999,CO[C@@H](Cc1ccc(OCCCOc2ccc(-c3ccccc3)cc2)cc1)C...,406.478,4.8436,4,1
1000,CCOc1ccc(-c2ccc(CCCc3nn(-c4ccc(C(F)(F)F)cc4)c(...,602.613,6.3051,7,1


In [15]:
# Attach the descriptor columns to each compound by matching on canonical SMILES.
# Run this cell once per kernel session: merging again into the already-merged
# frame would duplicate the descriptor columns with _x/_y suffixes.
useful_BindingDB = useful_BindingDB.merge(Lipinski_BindingDB, on='canonical_smiles')

# Rule-of-five filter, currently disabled; uncomment to apply it.
# useful_BindingDB = useful_BindingDB[useful_BindingDB['Molecular Weight'] < 500]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['LogP'] < 5]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['num_H_Donors'] < 5]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['num_H_acceptors'] < 10].reset_index(drop=True)
# useful_BindingDB

In [16]:
# Convert the BindingDB IC50 entries to floats.
# Some entries carry a censoring qualifier (e.g. '>10000' or '<1'); the
# qualifier is stripped and only the numeric bound is kept, so censored values
# are treated as if they were exact measurements downstream.
# Going through str makes the cell safe to re-run once the column is numeric.
ic50_text = useful_BindingDB['IC50 (nM)'].astype(str).str.strip()
ic50_numbers = ic50_text.str.lstrip('<>=').str.strip()

# to_numeric raises on anything that is still not a number, so malformed
# entries are reported instead of being silently written to the output file.
useful_BindingDB['IC50 (nM)'] = pd.to_numeric(ic50_numbers)

In [17]:
# Write the cleaned BindingDB table without the row index.
# `index` is documented as a bool, so pass False rather than None.
useful_BindingDB.to_csv('./data/BindingDB clean.csv', index=False)

# 3. merge datasets

In [18]:
# Stack the two cleaned tables row-wise (BindingDB rows first, then ChEMBL).
# ignore_index gives the result a single 0..n-1 index; without it both halves
# keep their own 0-based labels and label-based lookups return two rows.
# NOTE(review): duplicates were removed within each source only - a compound
# present in both sources will appear twice here; confirm whether that is intended.
merged_data = pd.concat([useful_BindingDB, useful_CheMBL], axis=0, ignore_index=True)
merged_data.to_csv('./data/merged_data clean.csv', index=False)