In [None]:
!pip install rdkit-pypi
!pip install matplotlib-venn
!pip install venn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.7 MB)
[K     |████████████████████████████████| 22.7 MB 1.4 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.3.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting venn
  Downloading venn-0.1.3.tar.gz (19 kB)
Building wheels for collected packages: venn
  Building wheel for venn (setup.py) ... [?25l[?25hdone
  Created wheel for venn: filename=venn-0.1.3-py3-none-any.whl size=19716 sha256=2e7ee8cbea0d0b805f60603485e5d28b929880ac15af73591ef406eee3bdf8d2
  Stored in directory: /root/.cache/pip/wheels/85/8c/61/85a84e4c69c79c6475b9689755fc4ee4dae0bdc32b101011cb
Successfully built ve

In [None]:
# https://www.bindingdb.org/bind/chemsearch/marvin/SDFdownload.jsp?download_file=/bind/downloads/BindingDB_All_2022m3.tsv.zip
!wget https://www.bindingdb.org/bind/downloads/BindingDB_All_2022m3.tsv.zip

--2022-07-14 10:12:47--  https://www.bindingdb.org/bind/downloads/BindingDB_All_2022m3.tsv.zip
Resolving www.bindingdb.org (www.bindingdb.org)... 137.110.139.247
Connecting to www.bindingdb.org (www.bindingdb.org)|137.110.139.247|:443... connected.
HTTP request sent, awaiting response... 200 200
Length: 417326199 (398M) [application/zip]
Saving to: ‘BindingDB_All_2022m3.tsv.zip’


2022-07-14 10:15:05 (2.90 MB/s) - ‘BindingDB_All_2022m3.tsv.zip’ saved [417326199/417326199]



In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from tqdm.auto import tqdm


tqdm.pandas()

In [None]:
# Columns to load from the raw BindingDB TSV export.
COLS = [
    "Ligand SMILES",
    "Target Source Organism According to Curator or DataSource",
    "Ki (nM)",
    "IC50 (nM)",
    "Kd (nM)",
    "EC50 (nM)",
    "pH",
    "Temp (C)",
    "Number of Protein Chains in Target (>1 implies a multichain complex)",
    "BindingDB Target Chain  Sequence",
]

# Raw column name -> short name used in the rest of the notebook.
# "pH" has no entry and therefore keeps its name; the kon/koff entries refer to
# columns that are not listed in COLS.
NAMES = {
    "Ligand SMILES": "smiles",
    "Target Source Organism According to Curator or DataSource": "target_name",
    "Ki (nM)": "Ki_nM",
    "IC50 (nM)": "IC50_nM",
    "Kd (nM)": "Kd_nM",
    "EC50 (nM)": "EC50_nM",
    "kon (M-1-s-1)": "kon",
    "koff (s-1)": "koff",
    "Temp (C)": "T",
    "Number of Protein Chains in Target (>1 implies a multichain complex)": "N",
    "BindingDB Target Chain  Sequence": "target",
}

In [None]:
# Read only the columns listed in COLS straight from the zip archive and switch
# to the short column names in the same expression.
data = pd.read_csv(
    "BindingDB_All_2022m3.tsv.zip",
    compression='zip',
    sep='\t',
    usecols=COLS,
).rename(columns=NAMES)

In [None]:
print(data.shape)

# Keep rows whose source organism is "homo sapiens" (case- and whitespace-insensitive);
# rows with a missing organism are dropped.
is_human = data['target_name'].str.lower().str.strip().eq('homo sapiens')
data = data.loc[is_human]

# Disabled alternative: keep records with at least 2 of the four measurements.
# data = data[data[['Ki_nM', 'IC50_nM','Kd_nM', 'EC50_nM']].notna().sum(axis=1)>=2]

# Keep only records that report an EC50 value.
has_ec50 = data['EC50_nM'].notna()
data = data.loc[has_ec50]

print(data.shape)

(2407381, 10)
(112211, 10)


In [None]:
def to_non_isomeric_canonical(s):
    """
    Return the RDKit canonical SMILES of ``s`` with isomeric features removed.

    The input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``isomericSmiles=False, canonical=True``. Any failure along the way
    (unparseable SMILES, non-string input such as NaN) yields ``np.nan``.

    Example:
        pubchem_torin1 = "CCC(=O)N1CCN(CC1)C2=C(C=C(C=C2)N3C(=O)C=CC4=CN=C5C=CC(=CC5=C43)C6=CC7=CC=CC=C7N=C6)C(F)(F)F"
        to_non_isomeric_canonical(pubchem_torin1)
        # -> "CCC(=O)N1CCN(c2ccc(-n3c(=O)ccc4cnc5ccc(-c6cnc7ccccc7c6)cc5c43)cc2C(F)(F)F)CC1"
    """
    try:
        canonical = Chem.MolToSmiles(
            Chem.MolFromSmiles(s),
            isomericSmiles=False,
            canonical=True,
        )
    except Exception:
        # Best effort: an invalid entry is reported as NaN instead of raising.
        return np.nan
    else:
        return canonical

In [None]:
def certain_to_float(S):
    """
    Parse a raw measurement into a float, returning NaN for inexact entries.

    The value is converted with ``str``; a trailing unit letter "C" (as in
    "22.00 C") and surrounding whitespace are removed before parsing.

    NaN is returned when:
      * the text contains ">" or "<" (a bound rather than an exact value);
      * the text contains "+" or "-" anywhere other than as the sign of a
        scientific-notation exponent (so negative numbers are still rejected);
      * the remaining text is not a valid float (the parse error is printed).

    Unlike the earlier version, a signed exponent such as "1e-05" or "2.5E+3"
    is accepted. ``astype('str')`` renders very small or very large floats in
    exactly that form, so they used to be turned into NaN.

    Parameters
    ----------
    S : any
        Raw cell value (string or number).

    Returns
    -------
    float
        The parsed value, or ``np.nan``.
    """
    text = str(S)
    if "<" in text or ">" in text:
        return np.nan

    text = text.replace("E", "e").replace("C", "").strip()

    # Tolerate exactly one sign, directly after the exponent marker and followed
    # only by digits; any other "+"/"-" keeps the original rejection behavior.
    mantissa, _, exponent = text.partition("e")
    if mantissa and exponent[:1] in ("+", "-") and exponent[1:].isdigit():
        exponent = exponent[1:]
    if set(mantissa + exponent) & {"+", "-"}:
        return np.nan

    try:
        return float(text)
    except ValueError as err:
        print(err)
        return np.nan

In [None]:
# Columns holding raw measurement text that must be parsed into floats.
constants = ['Ki_nM', 'IC50_nM','Kd_nM', 'EC50_nM', 'pH']
for col in constants:
    # Go through str first so certain_to_float always receives text.
    as_text = data[col].astype('str')
    data[col] = as_text.progress_apply(certain_to_float)

  0%|          | 0/112211 [00:00<?, ?it/s]

  0%|          | 0/112211 [00:00<?, ?it/s]

  0%|          | 0/112211 [00:00<?, ?it/s]

  0%|          | 0/112211 [00:00<?, ?it/s]

  0%|          | 0/112211 [00:00<?, ?it/s]

In [None]:
# Keep rows where at least one of the four affinity measurements parsed to a number.
has_measurement = data[['Ki_nM', 'IC50_nM','Kd_nM', 'EC50_nM']].notna().any(axis=1)
data = data[has_measurement]
data.shape

(86324, 10)

In [None]:
# Number of distinct ligand SMILES strings before canonicalization.
data['smiles'].nunique()

55774

In [None]:
# Rewrite every SMILES in RDKit canonical, non-isomeric form; entries RDKit
# cannot process become NaN (see to_non_isomeric_canonical).
data['smiles'] = data['smiles'].progress_apply(to_non_isomeric_canonical)

  0%|          | 0/86324 [00:00<?, ?it/s]

In [None]:
# Distinct SMILES after canonicalization (nunique does not count NaN).
data['smiles'].nunique()

53149

In [None]:
# Drop the rows whose SMILES could not be canonicalized.
data = data.dropna(subset=['smiles'])
data.shape

(86302, 10)

In [None]:
# Preview the first two rows of the cleaned table.
data.head(2)

Unnamed: 0,smiles,target_name,Ki_nM,IC50_nM,Kd_nM,EC50_nM,pH,T,N,target
11815,Cc1nc(Nc2[nH]nc3c2CN(C(=O)NC2CC2c2ccccc2)C3(C)...,Homo sapiens,3.5,,,19.0,,,1,MFGKRKKRVEISAPSNFEHRVHTGFDQHEQKFTGLPRQWQSLIEES...
32435,CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O,Homo sapiens,0.3,,,2.8,7.4,22.00 C,1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...


In [None]:
def is_active(Ki_nM, IC50_nM, Kd_nM, EC50_nM, threshold_nM=1000):
    """
    Binary activity label derived from up to four measurements.

    Parameters
    ----------
    Ki_nM, IC50_nM, Kd_nM, EC50_nM : float
        Measurements on the same scale as ``threshold_nM``; NaN means missing.
    threshold_nM : float, default 1000
        Activity cutoff. Must be positive. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    int
        1 if at least one measurement is strictly below the cutoff, else 0.
        Missing (NaN) measurements never count as active, so a row with all
        four values missing is labelled 0.
    """
    measurements = (Ki_nM, IC50_nM, Kd_nM, EC50_nM)
    # The comparison is kept as a ratio (x / threshold < 1) so results match the
    # original expression exactly; NaN / threshold < 1 evaluates to False.
    return int(any((x / threshold_nM) < 1 for x in measurements))

In [None]:
# Row-wise label: 1 when any of the row's four *_nM measurements is below
# is_active's cutoff, otherwise 0.
data["is_active"] = data.progress_apply(lambda x: is_active(x.Ki_nM, x.IC50_nM, x.Kd_nM, x.EC50_nM), axis=1)

  0%|          | 0/86302 [00:00<?, ?it/s]

In [None]:
def to_tdc_log(y):
    """
    Map a scalar measurement onto a negative log10 scale.

    Computes ``-log10(y * 1e-9 + 1e-10)``. A result below zero is replaced by
    ``np.nan``; a NaN input stays NaN.
    NOTE(review): callers pass the *_nM columns, so the 1e-9 factor presumably
    converts nM to M and 1e-10 acts as a small offset that keeps log10 defined
    at y == 0 — confirm.
    """
    neg_log = -np.log10(y * 1e-9 + 1e-10)
    return np.nan if neg_log < 0 else neg_log

In [None]:
# Add one log-scaled column per measurement: 'Ki_nM' -> 'pKi', 'IC50_nM' -> 'pIC50', ...
constants = ['Ki_nM', 'IC50_nM','Kd_nM', 'EC50_nM']
for col in constants:
    measure = col.split('_')[0]
    data['p' + measure] = data[col].apply(to_tdc_log)

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# text columns (smiles, target, ...); older pandas dropped them silently inside
# corr(), while pandas >= 2.0 raises on them, so they are excluded explicitly.
data.select_dtypes(include='number').corr()

Unnamed: 0,Ki_nM,IC50_nM,Kd_nM,EC50_nM,pH,N,is_active,pKi,pIC50,pKd,pEC50
Ki_nM,1.0,0.999875,,0.37298,-0.026554,,-0.360705,-0.35064,-0.537804,,-0.37383
IC50_nM,0.999875,1.0,,0.407621,-0.154993,,-0.40793,-0.496666,-0.426354,,-0.28694
Kd_nM,,,1.0,0.970497,0.612139,,,,,-0.865231,-0.826858
EC50_nM,0.37298,0.407621,0.970497,1.0,-0.006567,-0.000715,-0.009299,-0.243155,-0.375056,-0.749997,-0.095156
pH,-0.026554,-0.154993,0.612139,-0.006567,1.0,0.023101,0.107824,0.183329,0.212743,-0.900407,0.161582
N,,,,-0.000715,0.023101,1.0,-0.020072,,,,-0.033271
is_active,-0.360705,-0.40793,,-0.009299,0.107824,-0.020072,1.0,0.63343,0.673494,,0.780044
pKi,-0.35064,-0.496666,,-0.243155,0.183329,,0.63343,1.0,0.175862,,0.655141
pIC50,-0.537804,-0.426354,,-0.375056,0.212743,,0.673494,0.175862,1.0,,0.608992
pKd,,,-0.865231,-0.749997,-0.900407,,,,,1.0,0.836137


In [None]:
# Rebinds `constants`: from here on it lists the numeric columns that are kept
# in the final table (used in the column selection and the count summaries below).
constants = ['pKi', 'pIC50','pKd', 'pEC50', 'is_active', 'pH']

In [None]:
# Normalize the two key columns: trim whitespace from both and upper-case the
# target sequence. Presumably done so identical (smiles, target) pairs fall into
# the same group in the groupby below — confirm.
data['smiles'] = data['smiles'].str.strip()
data['target'] = data['target'].str.strip().str.upper()
data.head(2)

Unnamed: 0,smiles,target_name,Ki_nM,IC50_nM,Kd_nM,EC50_nM,pH,T,N,target,is_active,pKi,pIC50,pKd,pEC50
11815,Cc1nc(Nc2[nH]nc3c2CN(C(=O)NC2CC2c2ccccc2)C3(C)...,Homo sapiens,3.5,,,19.0,,,1,MFGKRKKRVEISAPSNFEHRVHTGFDQHEQKFTGLPRQWQSLIEES...,1,8.443697,,,7.718967
32435,CC12CCC3C(CCC4CC(=O)CCC43C)C1CCC2O,Homo sapiens,0.3,,,2.8,7.4,22.00 C,1,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,1,9.39794,,,8.537602


In [None]:
import warnings
# Suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the cells below;
# consider narrowing the filter to the specific category that is expected.
warnings.filterwarnings('ignore')

In [None]:
# Collapse repeated (smiles, target) measurements to their per-column median.
# The numeric columns are selected explicitly: the frame also holds text columns
# (target_name, T) that older pandas dropped silently during quantile() but that
# raise a TypeError on pandas >= 2.0. The resulting columns are the same as before.
group_keys = ['smiles', 'target']
value_cols = data.select_dtypes(include='number').columns.tolist()
data = (
    data[group_keys + value_cols]
    .groupby(group_keys, as_index=False, sort=False)
    .quantile(0.5)
)
data.shape

(63589, 13)

In [None]:
# Keep only the pairs that still have a pEC50 value after aggregation.
data = data.dropna(subset=['pEC50'])
data.shape

(63487, 13)

In [None]:
# Non-missing value count for each column listed in `constants`.
data[constants].notna().sum()

pKi            430
pIC50          635
pKd              5
pEC50        63487
is_active    63487
pH            3376
dtype: int64

In [None]:
from rdkit.Chem import QED
from functools import lru_cache

@lru_cache(100000)
def get_qed(s):
    """
    Return RDKit's QED score for the SMILES string ``s``.

    Results (including NaN) are memoized for up to 100000 distinct inputs, so
    repeated ligands are scored only once.

    Returns ``np.nan`` when the SMILES cannot be parsed or scored. Only
    ``Exception`` subclasses are caught: a bare ``except`` would also swallow
    KeyboardInterrupt/SystemExit, so interrupting the long-running apply would
    just record NaN and carry on.
    """
    try:
        return QED.qed(Chem.MolFromSmiles(s))
    except Exception:
        return np.nan

# QED score per ligand; NaN where get_qed could not score the SMILES.
data['qed'] = data.smiles.progress_apply(get_qed)

  0%|          | 0/63487 [00:00<?, ?it/s]

In [None]:
# Final column layout: the two keys, the columns listed in `constants`, then qed.
data = data[['smiles', 'target'] + constants+ ['qed']]

In [None]:
# is_active was aggregated with a median above, so it may no longer be exactly
# 0 or 1; round it back to a binary label.
# NOTE(review): a median of exactly 0.5 (tie between active and inactive records)
# is expected to round to 0.0 under round-half-to-even — confirm that this
# tie-break is intended.
data['is_active'] = data['is_active'].round()

In [None]:
# Keep only target sequences whose first character is 'M', then show the
# fraction of each is_active class.
# NOTE(review): presumably 'M' selects sequences that begin with the initiator
# methionine (i.e. look like complete chains) — confirm the intent.
data = data[data.target.str.startswith('M')]
data['is_active'].value_counts(normalize=True)

1.0    0.639243
0.0    0.360757
Name: is_active, dtype: float64

In [None]:
# Preview the last rows of the final table.
data.tail()

Unnamed: 0,smiles,target,pKi,pIC50,pKd,pEC50,is_active,pH,qed
63584,CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(COc2ccc(C...,MVMQFQGLENPIQISPHCSCTPSGFFMEMMSMKPAKGVLTEQVAGP...,,,,5.698948,0.0,,0.229747
63585,CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(C...,MVMQFQGLENPIQISPHCSCTPSGFFMEMMSMKPAKGVLTEQVAGP...,,,,5.795853,0.0,,0.221969
63586,Cc1cc(NCc2ccc(OCc3c(-c4c(Cl)cccc4Cl)noc3C(C)C)...,MVMQFQGLENPIQISPHCSCTPSGFFMEMMSMKPAKGVLTEQVAGP...,,,,5.920783,0.0,,0.213178
63587,CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(COc2ccc(C...,MVMQFQGLENPIQISPHCSCTPSGFFMEMMSMKPAKGVLTEQVAGP...,,,,6.096856,1.0,,0.210414
63588,CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(C...,MVMQFQGLENPIQISPHCSCTPSGFFMEMMSMKPAKGVLTEQVAGP...,,,,6.026826,1.0,,0.203468


In [None]:
# Shuffle all rows before the positional split below. A fixed seed makes the
# train/valid/test assignment reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42)

In [None]:
# Positional split into three parts: the first two hold floor(n / 3) rows each,
# the last one takes the remainder.
third = data.shape[0] // 3
train = data.iloc[:third]
valid = data.iloc[third:third * 2]
test = data.iloc[third * 2:]

train.shape, valid.shape, test.shape

((21153, 9), (21153, 9), (21155, 9))

In [None]:
# Write the three splits to CSV in the working directory, without the index column.
train.to_csv("data_human_agg05_EC50_train.csv", index = False)
valid.to_csv("data_human_agg05_EC50_valid.csv", index = False)
test.to_csv("data_human_agg05_EC50_test.csv", index = False)

In [None]:
# Non-missing value count per `constants` column within the training split.
train[constants].notna().sum()

pKi            159
pIC50          201
pKd              3
pEC50        21153
is_active    21153
pH            1144
dtype: int64