In [39]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# 1. read datasets

In [40]:
# Load the raw PD-L1 activity exports: the ChEMBL file is ';'-delimited,
# the BindingDB file is tab-delimited.
CheMBL_raw_Data = pd.read_csv('./data/PD-L1 CheMBL.csv', delimiter=';')
BindingDB_raw_Data = pd.read_csv('./data/PD-L1 BindingDB.tsv', delimiter='\t')

In [41]:
# Show the column names available in the raw ChEMBL table.
CheMBL_raw_Data.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type', 'Standard Text Value'],
 

# 2. Missing values and duplicates

## 2.1 CheMBL

In [42]:
# Keep only the identifier, structure and activity columns and rename them to
# the shared schema ('ID', 'Smiles', 'IC50 (nM)').
# NOTE(review): 'Standard Value' is relabelled as 'IC50 (nM)' without looking at
# 'Standard Type' / 'Standard Units' -- presumably the export was already
# restricted to IC50 values in nM; confirm.
chembl_columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value']
chembl_renames = {'Molecule ChEMBL ID': 'ID', 'Standard Value': 'IC50 (nM)'}
useful_CheMBL = CheMBL_raw_Data[chembl_columns].rename(columns=chembl_renames)

# Report the number of missing values and of distinct values per column.
print('number of nan \n:', useful_CheMBL.isna().sum(), '\n')
print('unique \n:', useful_CheMBL.nunique())

number of nan 
: ID           0
Smiles       0
IC50 (nM)    9
dtype: int64 

unique 
: ID           895
Smiles       895
IC50 (nM)    560
dtype: int64


In [43]:
def canonical_smiles(dataframe):
    """Return ``dataframe`` with a ``canonical_smiles`` column added.

    Each entry of ``dataframe['Smiles']`` is parsed once with RDKit and written
    back in canonical form. For multi-component entries (fragments separated by
    ``'.'``) only the fragment with the longest SMILES string is kept, to avoid
    mixture/multi-component systems. SMILES that RDKit cannot parse are stored
    as the empty string ``''``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'Smiles'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``dataframe``. The
        ``canonical_smiles`` column is appended as the last column, or
        overwritten in place if it already exists (so re-running is safe).
    """
    smiles = []
    for raw_smiles in dataframe['Smiles']:
        mol = Chem.MolFromSmiles(raw_smiles)  # None when the SMILES is invalid
        if mol is None:
            smiles.append('')
            continue
        # select the longest fragment (avoid mixture/multi-component systems)
        fragments = Chem.MolToSmiles(mol).split('.')
        smiles.append(max(fragments, key=len))

    # assign() attaches the list positionally, so rows stay matched even when
    # the frame's index is not 0..n-1 (pd.concat(axis=1) would align on labels).
    return dataframe.assign(canonical_smiles=smiles)

# Add the canonical (longest-fragment) SMILES column to the ChEMBL table.
useful_CheMBL = useful_CheMBL.pipe(canonical_smiles)

In [44]:
# Drop measurements that have no IC50 value.
useful_CheMBL = useful_CheMBL.dropna(subset=['IC50 (nM)'])
# canonical_smiles() stores '' for SMILES that RDKit could not parse; such rows
# have no usable structure, so remove them rather than keeping one of them as
# a "unique" compound.
useful_CheMBL = useful_CheMBL[useful_CheMBL['canonical_smiles'] != '']
# Keep one row per compound (the first measurement encountered wins).
useful_CheMBL = useful_CheMBL.drop_duplicates(subset='canonical_smiles').reset_index(drop=True)
useful_CheMBL

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,CHEMBL4862360,C[C@@](CO)(NCc1cc(Cl)c(O[C@H]2CCc3c(-c4cccc5c4...,0.058,C[C@@](CO)(NCc1cc(Cl)c(O[C@H]2CCc3c(-c4cccc5c4...
1,CHEMBL4869359,Cc1c(Nc2cccc(-c3ccc(CNCCO)cc3)n2)cccc1-c1ccc2c...,1000.000,Cc1c(Nc2cccc(-c3ccc(CNCCO)cc3)n2)cccc1-c1ccc2c...
2,CHEMBL4877288,Cc1c(-c2ccc3c(c2)OCCO3)cccc1N1Cc2cc(CNCCO)ccc2...,1000.000,Cc1c(-c2ccc3c(c2)OCCO3)cccc1N1Cc2cc(CNCCO)ccc2...
3,CHEMBL4868972,COc1cc(-c2cccc(-c3cccc(-c4ccc(CNCC5CCC(=O)N5)c...,139.000,COc1cc(-c2cccc(-c3cccc(-c4ccc(CNCC5CCC(=O)N5)c...
4,CHEMBL4861800,CC(=O)NCCNCc1cc(Cl)c(OCc2cccc(-c3cccc4c3CCN4CC...,58.000,CC(=O)NCCNCc1cc(Cl)c(OCc2cccc(-c3cccc4c3CCN4CC...
...,...,...,...,...
881,CHEMBL5071400,COC(=O)c1ccc(-c2cccc(-c3ccccc3)c2C)nc1OC,326.100,COC(=O)c1ccc(-c2cccc(-c3ccccc3)c2C)nc1OC
882,CHEMBL5081208,O=C(O)[C@@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3...,3.500,O=C(O)[C@@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3...
883,CHEMBL5176631,N#Cc1cccc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...,500.000,N#Cc1cccc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...
884,CHEMBL5206885,N#Cc1cncc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...,5.370,N#Cc1cncc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...


Lipinski's Rule of Five states that an orally active drug generally has:
* Molecular weight < 500 Dalton
* Octanol-water partition coefficient (LogP) ≤ 5
* Hydrogen bond donors ≤ 5
* Hydrogen bond acceptors ≤ 10

In [45]:
def Lipinski_descriptors(smiles):
    """Compute the four Lipinski descriptors for every SMILES in ``smiles``.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, e.g. a pandas Series.

    Returns
    -------
    pandas.DataFrame
        One row per input, in input order, with the columns
        ``canonical_smiles`` (the input value, unchanged),
        ``Molecular Weight``, ``LogP``, ``num_H_acceptors`` and
        ``num_H_Donors``. The descriptor columns are numeric. Inputs that are
        not strings, or that RDKit cannot parse, get NaN descriptors instead
        of raising.
    """
    columns = ['canonical_smiles', 'Molecular Weight', 'LogP', 'num_H_acceptors', 'num_H_Donors']

    # Build the table row by row instead of np.vstack-ing the SMILES together
    # with the numbers: a mixed string/number array is coerced to strings, which
    # made every descriptor column text and broke comparisons such as `< 500`.
    rows = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            rows.append((smi, np.nan, np.nan, np.nan, np.nan))
        else:
            rows.append((
                smi,
                Descriptors.MolWt(mol),
                Descriptors.MolLogP(mol),
                Lipinski.NumHAcceptors(mol),
                Lipinski.NumHDonors(mol),
            ))

    return pd.DataFrame(rows, columns=columns)

# Descriptor table for the de-duplicated ChEMBL structures.
chembl_structures = useful_CheMBL['canonical_smiles']
Lipinski_CheMBL = Lipinski_descriptors(chembl_structures)
Lipinski_CheMBL

Unnamed: 0,canonical_smiles,Molecular Weight,LogP,num_H_acceptors,num_H_Donors
0,C[C@@](CO)(NCc1cc(Cl)c(O[C@H]2CCc3c(-c4cccc5c4...,1122.072,6.1884,18,6
1,Cc1c(Nc2cccc(-c3ccc(CNCCO)cc3)n2)cccc1-c1ccc2c...,467.569,5.32072,6,3
2,Cc1c(-c2ccc3c(c2)OCCO3)cccc1N1Cc2cc(CNCCO)ccc2...,430.504,3.67552,5,2
3,COc1cc(-c2cccc(-c3cccc(-c4ccc(CNCC5CCC(=O)N5)c...,646.832,6.05804,6,4
4,CC(=O)NCCNCc1cc(Cl)c(OCc2cccc(-c3cccc4c3CCN4CC...,736.357,6.7801,8,3
...,...,...,...,...,...
881,COC(=O)c1ccc(-c2cccc(-c3ccccc3)c2C)nc1OC,333.387,4.51922,4,0
882,O=C(O)[C@@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3...,577.996,5.3754,8,3
883,N#Cc1cccc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...,663.13,7.40348,9,1
884,N#Cc1cncc(COc2cc(Oc3ncnc4c(-c5ccc6c(c5)OCCO6)c...,626.069,4.80218,11,3


In [46]:
# Attach the Lipinski descriptors to each compound, joining on 'canonical_smiles'.
useful_CheMBL = useful_CheMBL.merge(Lipinski_CheMBL, on='canonical_smiles')

# Rule-of-Five filter, currently disabled (every compound is kept):
# useful_CheMBL = useful_CheMBL[useful_CheMBL['Molecular Weight'] < 500]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['LogP'] < 5]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['num_H_Donors'] < 5]
# useful_CheMBL = useful_CheMBL[useful_CheMBL['num_H_acceptors'] < 10].reset_index(drop=True)
# useful_CheMBL

In [47]:
# Save the cleaned ChEMBL table. `index` is a boolean option, so pass False
# explicitly instead of relying on None being falsy.
useful_CheMBL.to_csv('./data/CheMBL_clean.csv', index=False)

## 2.2 BindingDB

In [48]:
# Show the column names available in the raw BindingDB table.
BindingDB_raw_Data.columns

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)',
       'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI',
       'BindingDB Entry DOI', 'PMID', 'PubChem AID', 'Patent Number',
       'Authors', 'Institution', 'Link to Ligand in BindingDB',
       'Link to Target in BindingDB',
       'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB',
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID', 'PubChem SID',
       'ChEBI ID of Ligand', 'ChEMBL ID of Ligand', 'DrugBank ID of Ligand',
       'IUPHAR_GRAC ID of Ligand', 'KEGG ID of Ligand', 'ZINC ID of Ligand',
       'Number of Protein Chains in Target (>1 implies a multichain complex)',
       'BindingDB Target Chain Sequence', 'PDB ID(s) of Target

In [49]:
# Keep only the identifier, structure and IC50 columns and rename them to the
# shared schema ('ID', 'Smiles', 'IC50 (nM)').
bindingdb_columns = ['BindingDB Reactant_set_id', 'Ligand SMILES', 'IC50 (nM)']
bindingdb_renames = {'BindingDB Reactant_set_id': 'ID', 'Ligand SMILES': 'Smiles'}
useful_BindingDB = BindingDB_raw_Data[bindingdb_columns].rename(columns=bindingdb_renames)

# Report the number of missing values and of distinct values per column.
print('number of nan \n:', useful_BindingDB.isna().sum(), '\n')
print('unique \n:', useful_BindingDB.nunique())

number of nan 
: ID             0
Smiles         0
IC50 (nM)    424
dtype: int64 

unique 
: ID           5729
Smiles       3792
IC50 (nM)    1378
dtype: int64


In [50]:
# Add the canonical (longest-fragment) SMILES column to the BindingDB table.
useful_BindingDB = useful_BindingDB.pipe(canonical_smiles)
useful_BindingDB

[12:53:12] Explicit valence for atom # 47 N, 4, is greater than permitted
[12:53:14] Explicit valence for atom # 15 N, 4, is greater than permitted


Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,51455869,OC(=O)[C@@H]1CCCCN1Cc1ccc(CCc2cccc(c2Br)-c2ccc...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)ccc2CN2...
1,51240354,OC(=O)[C@@H]1CCCCN1Cc1cc(Cl)c(CCc2cccc(c2Br)-c...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)c(Cl)cc...
2,1021612,OC[C@H](NCc1cc(Cl)c(OCc2cccc(c2Br)-c2ccccc2)cc...,<0.0001,O=C(O)[C@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3)...
3,1212892,COc1cc(ncc1CNCC1(O)CC1)C(=O)Nc1cccc(c1C)-c1ccc...,0.032,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...
4,1212930,COc1cc(ncc1CN[C@H]1C[C@H](C1)C(O)=O)C(=O)Nc1cc...,0.034,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...
...,...,...,...,...
5724,964428,Cc1c(cccc1-c1cccc(Nc2nccc3cc(CN4CC[C@H](O)C4)c...,3000,Cc1c(-c2nc3cc(CN4CC[C@@](C)(C(=O)O)C4)cc(C#N)c...
5725,964542,Cc1c(COc2cc(OCc3cncc(c3)C#N)c(CN3CC[C@@H](O)C3...,3000,Cc1c(COc2cc(OCc3cncc(C#N)c3)c(CN3CC[C@@H](O)C3...
5726,964545,Cc1c(cccc1-c1cccc(NC(=O)c2cc(CN3CC[C@H](C3)C(O...,3000,Cc1c(-c2nc3cc(CN4CC[C@@H](O)C4)cc(Cl)c3o2)cccc...
5727,964546,Cc1c(Nc2nccn3c(CN4CC[C@H](O)C4)cnc23)cccc1-c1c...,3000,Cc1c(Nc2nccn3c(CN4CC[C@H](O)C4)cnc23)cccc1-c1c...


In [51]:
# Count missing values per column. SMILES that failed to parse are stored as ''
# by canonical_smiles(), so they are not counted as missing here.
useful_BindingDB.isna().sum()

ID                    0
Smiles                0
IC50 (nM)           424
canonical_smiles      0
dtype: int64

In [52]:
# Drop measurements that have no IC50 value.
useful_BindingDB = useful_BindingDB.dropna(subset=['IC50 (nM)'])
# canonical_smiles() stores '' for SMILES that RDKit could not parse (RDKit
# reported valence errors for this dataset); such rows have no usable structure,
# so remove them rather than keeping one of them as a "unique" compound.
useful_BindingDB = useful_BindingDB[useful_BindingDB['canonical_smiles'] != '']
# Keep one row per compound (the first measurement encountered wins).
useful_BindingDB = useful_BindingDB.drop_duplicates(subset='canonical_smiles').reset_index(drop=True)
useful_BindingDB

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles
0,51455869,OC(=O)[C@@H]1CCCCN1Cc1ccc(CCc2cccc(c2Br)-c2ccc...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)ccc2CN2...
1,51240354,OC(=O)[C@@H]1CCCCN1Cc1cc(Cl)c(CCc2cccc(c2Br)-c...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)c(Cl)cc...
2,1021612,OC[C@H](NCc1cc(Cl)c(OCc2cccc(c2Br)-c2ccccc2)cc...,<0.0001,O=C(O)[C@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3)...
3,1212892,COc1cc(ncc1CNCC1(O)CC1)C(=O)Nc1cccc(c1C)-c1ccc...,0.032,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...
4,1212930,COc1cc(ncc1CN[C@H]1C[C@H](C1)C(O)=O)C(=O)Nc1cc...,0.034,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...
...,...,...,...,...
3765,996276,COc1nc(ccc1CN1CC(O)C1)-c1cccc(c1F)-c1cccc(-c2c...,2909,COc1nc(-c2cccc(-c3cccc(-c4ccc(CN5CC(O)C5)c(OC)...
3766,1324686,OC[C@@H](NCc1ccc(NCc2cccc(-c3ccccc3)c2C(F)(F)F...,2933,N#Cc1cncc(COc2cc(NCc3cccc(-c4ccccc4)c3C(F)(F)F...
3767,51285840,Cc1c(NC(=O)c2cc(CNCC(N)=O)cn3cnnc23)cccc1-c1cc...,2945,Cc1c(NC(=O)c2cc(CNCC(N)=O)cn3cnnc23)cccc1-c1cc...
3768,995979,COc1nc(ccc1CNC[C@@H]1CCC(=O)N1)-c1ccc(F)c(c1Cl...,2959,COc1nc(-c2ccc(F)c(-c3c(F)ccc(-c4ccc(CNC[C@@H]5...


In [53]:
# Descriptor table for the de-duplicated BindingDB structures.
bindingdb_structures = useful_BindingDB['canonical_smiles']
Lipinski_BindingDB = Lipinski_descriptors(bindingdb_structures)
Lipinski_BindingDB

Unnamed: 0,canonical_smiles,Molecular Weight,LogP,num_H_acceptors,num_H_Donors
0,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)ccc2CN2...,610.552,7.18608,5,1
1,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)c(Cl)cc...,644.997,7.83948,5,1
2,O=C(O)[C@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3)...,597.893,5.8576,6,3
3,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...,680.806,4.51124,10,6
4,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...,736.826,5.18784,10,6
...,...,...,...,...,...
3765,COc1nc(-c2cccc(-c3cccc(-c4ccc(CN5CC(O)C5)c(OC)...,624.635,5.0057,8,2
3766,N#Cc1cncc(COc2cc(NCc3cccc(-c4ccccc4)c3C(F)(F)F...,576.575,5.36528,7,4
3767,Cc1c(NC(=O)c2cc(CNCC(N)=O)cn3cnnc23)cccc1-c1cc...,414.469,2.53192,6,3
3768,COc1nc(-c2ccc(F)c(-c3c(F)ccc(-c4ccc(CNC[C@@H]5...,725.624,5.8162,8,4


In [54]:
# Attach the Lipinski descriptors to each compound, joining on 'canonical_smiles'.
useful_BindingDB = useful_BindingDB.merge(Lipinski_BindingDB, on='canonical_smiles')

# Rule-of-Five filter, currently disabled (every compound is kept):
# useful_BindingDB = useful_BindingDB[useful_BindingDB['Molecular Weight'] < 500]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['LogP'] < 5]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['num_H_Donors'] < 5]
# useful_BindingDB = useful_BindingDB[useful_BindingDB['num_H_acceptors'] < 10].reset_index(drop=True)
useful_BindingDB

Unnamed: 0,ID,Smiles,IC50 (nM),canonical_smiles,Molecular Weight,LogP,num_H_acceptors,num_H_Donors
0,51455869,OC(=O)[C@@H]1CCCCN1Cc1ccc(CCc2cccc(c2Br)-c2ccc...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)ccc2CN2...,610.552,7.18608,5,1
1,51240354,OC(=O)[C@@H]1CCCCN1Cc1cc(Cl)c(CCc2cccc(c2Br)-c...,0.000,N#Cc1cc(COc2cc(CCc3cccc(-c4ccccc4)c3Br)c(Cl)cc...,644.997,7.83948,5,1
2,1021612,OC[C@H](NCc1cc(Cl)c(OCc2cccc(c2Br)-c2ccccc2)cc...,<0.0001,O=C(O)[C@H](CO)NCc1cc(Cl)c(OCc2cccc(-c3ccccc3)...,597.893,5.8576,6,3
3,1212892,COc1cc(ncc1CNCC1(O)CC1)C(=O)Nc1cccc(c1C)-c1ccc...,0.032,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...,680.806,4.51124,10,6
4,1212930,COc1cc(ncc1CN[C@H]1C[C@H](C1)C(O)=O)C(=O)Nc1cc...,0.034,COc1cc(C(=O)Nc2cccc(-c3cccc(NC(=O)c4cc(OC)c(CN...,736.826,5.18784,10,6
...,...,...,...,...,...,...,...,...
3765,996276,COc1nc(ccc1CN1CC(O)C1)-c1cccc(c1F)-c1cccc(-c2c...,2909,COc1nc(-c2cccc(-c3cccc(-c4ccc(CN5CC(O)C5)c(OC)...,624.635,5.0057,8,2
3766,1324686,OC[C@@H](NCc1ccc(NCc2cccc(-c3ccccc3)c2C(F)(F)F...,2933,N#Cc1cncc(COc2cc(NCc3cccc(-c4ccccc4)c3C(F)(F)F...,576.575,5.36528,7,4
3767,51285840,Cc1c(NC(=O)c2cc(CNCC(N)=O)cn3cnnc23)cccc1-c1cc...,2945,Cc1c(NC(=O)c2cc(CNCC(N)=O)cn3cnnc23)cccc1-c1cc...,414.469,2.53192,6,3
3768,995979,COc1nc(ccc1CNC[C@@H]1CCC(=O)N1)-c1ccc(F)c(c1Cl...,2959,COc1nc(-c2ccc(F)c(-c3c(F)ccc(-c4ccc(CNC[C@@H]5...,725.624,5.8162,8,4


In [55]:
# BindingDB stores censored measurements as text such as '>10000' or '<0.0001'.
# Strip surrounding whitespace and the leading '<' / '>' qualifier, then convert
# the whole column to float in one vectorised step. Going through str first
# also makes the cell safe to re-run once the column is already numeric.
# NOTE: the qualifier is discarded, so a censored value is stored as its bound.
ic50_text = useful_BindingDB['IC50 (nM)'].astype(str).str.strip().str.lstrip('<>')
# The result shares useful_BindingDB's index, so the assignment cannot misalign rows.
useful_BindingDB['IC50 (nM)'] = ic50_text.astype(float)

In [56]:
# Save the cleaned BindingDB table. `index` is a boolean option, so pass False
# explicitly instead of relying on None being falsy.
useful_BindingDB.to_csv('./data/BindingDB clean.csv', index=False)

# 3. merge datasets

In [57]:
# Stack both cleaned sources. ignore_index gives the combined table a fresh
# 0..n-1 index instead of repeating each source's row labels.
merged_data = pd.concat([useful_BindingDB, useful_CheMBL], axis=0, ignore_index=True)
# Each source was de-duplicated on its own, but the same compound may be present
# in both; keep a single row per canonical SMILES (the BindingDB row wins
# because that table is listed first).
merged_data = merged_data.drop_duplicates(subset='canonical_smiles').reset_index(drop=True)
merged_data.to_csv('./data/merged_data clean.csv', index=False)