In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
# from rdkit import Chem
# from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as md
# from rdkit import DataStructs
# from chembl_webresource_client.new_client import new_client

In [2]:
def save_obj(obj, name):
    """Pickle ``obj`` into ``data/<name>.pkl`` with the highest pickle protocol.

    The ``data/`` directory is not created here, so it has to exist already.
    """
    destination = 'data/' + name + '.pkl'
    with open(destination, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``data/<name>.pkl``.

    Unpickling can run arbitrary code, so only point this at files that come
    from a trusted source.
    """
    source = 'data/' + name + '.pkl'
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

def mol2des(chembl_id, attribs):
    """Compute one row of molecular descriptors for a ChEMBL compound.

    The compound's canonical SMILES is taken from the first activity record
    returned by ``new_client`` for ``chembl_id``, parsed with ``Chem`` and fed
    to the descriptor calculator ``md``.

    NOTE: ``Chem``, ``md`` and ``new_client`` must exist in the notebook
    namespace. Their imports are commented out in the import cell of this
    notebook; while they are missing, the resulting NameError is handled like
    any other failure and every call returns ``([], False)``.

    Parameters
    ----------
    chembl_id : ChEMBL molecule id; stored as ``str(chembl_id)`` in the
        ``'Drug'`` column of the result.
    attribs : list of descriptor names passed to ``md``; they also become the
        column names of the result, in the same order.

    Returns
    -------
    (des, status)
        On success ``des`` is a one-row DataFrame with the ``attribs`` columns
        followed by ``'Drug'``, and ``status`` is True. On any failure
        ``des`` is an empty list and ``status`` is False.
    """
    try:
        records = new_client.activity.filter(molecule_chembl_id=chembl_id).only(['canonical_smiles'])
        smiles = records[0]['canonical_smiles']
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            # An unparsable SMILES yields None instead of raising; do not let
            # it reach the calculator, which could return placeholder values
            # that would be reported as a successful row.
            raise ValueError('could not parse SMILES for %s' % chembl_id)
        calc = md(attribs)
        des = pd.DataFrame(calc.CalcDescriptors(m)).T
        des.columns = attribs
        des['Drug'] = str(chembl_id)
        status = True
    except Exception:
        # Best effort per compound: report failure through `status`.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
        # inside long loops over many compounds.
        des = []
        status = False
    return des, status

# Load the cached drug -> activity mapping from data/drug_target.pkl.
# Later cells read the 'target_chembl_id', 'type', 'target_organism',
# 'value' and 'units' entries of each drug.
drug_target = load_obj(name='drug_target')

In [7]:
# Extract Drug-Target pairs for Homo-Sapiens with IC50 values
# Matching rows are collected in a plain list and turned into a DataFrame once
# at the end: growing the frame with DataFrame.append copied it for every row
# (quadratic time) and that method no longer exists in pandas >= 2.0.
DTI_COLUMNS = ['target_organism', 'drug', 'target', 'IC50', 'unit']
dti_rows = []
for drug in tqdm(drug_target):
    activities = drug_target[drug]
    # The per-drug lists are indexed in parallel, one position per measurement
    for i in range(len(activities['target_chembl_id'])):
        if activities['type'][i] == 'IC50' and activities['target_organism'][i] == 'Homo sapiens':
            dti_rows.append({'target_organism': activities['target_organism'][i],
                             'drug': drug,
                             'target': activities['target_chembl_id'][i],
                             'IC50': activities['value'][i],
                             'unit': activities['units'][i]})

# Passing `columns` keeps the expected schema even when no row matched
DTI = pd.DataFrame(dti_rows, columns=DTI_COLUMNS)

# Save to file
DTI.to_csv('data/DTI2.csv')
DTI.tail()

100%|██████████| 52498/52498 [43:19<00:00, 20.20it/s]


Unnamed: 0,target_organism,drug,target,IC50,unit
175885,Homo sapiens,CHEMBL388956,CHEMBL308,214.0,nM
175886,Homo sapiens,CHEMBL388956,CHEMBL331,1420.0,nM
175887,Homo sapiens,CHEMBL388956,CHEMBL2185,2140.0,nM
175888,Homo sapiens,CHEMBL388966,CHEMBL2431,10.0,nM
175889,Homo sapiens,CHEMBL388966,CHEMBL2534,1.0,uM


In [31]:
# Import CHEMBL_ID to uniprot_ID mapping (tab-separated, no header row)
chembl2uniprot = pd.read_csv('data/chembl2uniprot.txt', sep='\t', header=None)

# Units to be selected
units = ['nM', 'uM', 'pM', 'mM']

# Extract datapoints with required units
has_accepted_unit = DTI['unit'].isin(units)
DTI_screened_units = DTI.loc[has_accepted_unit]

# Extract datapoints for which a uniprot_ID is available, i.e. whose target
# appears in the first column of the mapping table
mapped_targets = chembl2uniprot[0].tolist()
has_uniprot_id = DTI_screened_units['target'].isin(mapped_targets)
DTI_screened_mapping = DTI_screened_units[has_uniprot_id]

In [51]:
# Summary statistics: distinct targets | datapoints at each screening stage
def _targets_and_rows(frame):
    """Return (number of distinct targets, number of rows) of a DTI frame."""
    return len(frame['target'].unique()), len(frame)

print('Total targets and datapoints acquired   : %s | %s' % _targets_and_rows(DTI))
print('Targets and datapoints after IC50 screen: %s | %s' % _targets_and_rows(DTI_screened_units))
print('Removing NaNs and unannotated CHEMBL_ID : %s | %s' % _targets_and_rows(DTI_screened_mapping))

Total targets and datapoints acquired   : 2042 | 175890
Targets and datapoints after IC50 screen: 1976 | 128448
Removing NaNs and unannotated CHEMBL_ID : 968 | 77396


In [46]:
# Preview the first rows of the unit- and mapping-screened interaction table
DTI_screened_mapping.head(n=5)

Unnamed: 0,target_organism,drug,target,IC50,unit
0,Homo sapiens,CHEMBL10,CHEMBL260,0.29,uM
9,Homo sapiens,CHEMBL10,CHEMBL221,5000.0,nM
10,Homo sapiens,CHEMBL10,CHEMBL4439,30.0,uM
11,Homo sapiens,CHEMBL10,CHEMBL260,0.038,uM
13,Homo sapiens,CHEMBL10,CHEMBL1906,330.0,nM


In [50]:
# Tally how often each raw unit label occurs in DTI (one entry per distinct value)
from collections import Counter
unit_counts = Counter(DTI['unit'])
unit_counts

Counter({'uM': 93520,
         'nM': 34712,
         'umol/L': 232,
         None: 44707,
         'ug ml-1': 912,
         'uM l-1': 247,
         'uM dm**-3': 32,
         'mM': 201,
         'um': 918,
         'microM': 54,
         "10'-3 ug/ml": 2,
         'pM': 15,
         'uM/L': 6,
         "10'-2microM": 8,
         'mg.min/m3': 1,
         'M': 72,
         'ng/ml': 17,
         'uM/ml': 7,
         "10'-1microM": 3,
         "10'-5M": 20,
         'umol': 29,
         'g/ml': 2,
         "10'-8M": 6,
         "10'-4mM": 1,
         "10'-3mM": 1,
         'nmol/L': 14,
         'mg/ml': 12,
         "10'1 ug/ml": 11,
         "10'-3M": 2,
         'ug/mL': 27,
         "10'4nM": 1,
         "10'-7mg/ml": 1,
         "10'3 uM": 1,
         '%': 10,
         'microM/L': 3,
         "10'-4microM": 3,
         "10'-3microM": 8,
         "10'-9mol/L": 1,
         'mg ml-1': 1,
         "10'-7M": 15,
         "10'-4nM": 1,
         "10'2 uM": 4,
         "10'-6M": 10,
         '

In [None]:
# Summary Statistic for DTI
# Collect, per drug, the list of measurement types and the list of target ids.
# The first list used to be assigned to `units`, which overwrote the unit
# whitelist defined by the unit-screening cell and broke re-running that cell;
# it now has its own name.
measurement_types, targets = [], []
for activities in drug_target.values():
    measurement_types.append(activities['type'])
    targets.append(activities['target_chembl_id'])

# `flatten_units` keeps its original name for compatibility; it holds the
# measurement types (e.g. 'IC50'), not concentration units.
flatten_units = [j for sub in measurement_types for j in sub]
flatten_targets = [j for sub in targets for j in sub]

print('Measurements recorded: %s IC50, %s Kd and %s Ki'%(flatten_units.count('IC50'), flatten_units.count('Kd'), flatten_units.count('Ki')))
# set() counts distinct ids without sorting them, so a missing (None) id
# cannot trigger a comparison error the way sorting an object array can
print('Total targets recorded: %s'%len(set(flatten_targets)))
print('Targets with IC50 values: %s'%(len(DTI['target'].unique())))

In [None]:
# Extract drug descriptors
# Descriptor names handed to mol2des. Their order is the column order of the
# resulting descriptor table, so entries are only regrouped visually here.
attribs = [
    # partial-charge, weight, electron-count and EState-index names
    'MaxAbsPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge', 'HeavyAtomMolWt',
    'MaxAbsEStateIndex', 'NumRadicalElectrons', 'NumValenceElectrons', 'MinAbsEStateIndex',
    'MaxEStateIndex', 'MaxPartialCharge', 'MinEStateIndex', 'ExactMolWt', 'MolWt',
    # BalabanJ / BertzCT / Chi* / Kappa* and related names
    'BalabanJ', 'BertzCT',
    'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v',
    'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
    'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA',
    # PEOE_VSA* family
    'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
    'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
    'PEOE_VSA8', 'PEOE_VSA9',
    # SMR_VSA* family
    'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5',
    'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
    # SlogP_VSA* family, then TPSA
    'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3',
    'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9',
    'TPSA',
    # EState_VSA* family
    'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
    'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
    # VSA_EState* family
    'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4',
    'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9',
    # fraction / count style names
    'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount',
    'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings',
    'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings',
    'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
    'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount',
    # MolLogP / MolMR
    'MolLogP', 'MolMR',
]

# Compute descriptors for every drug listed in data/KIBA_drugs.csv; drugs for
# which mol2des reports failure (status False) are skipped.
KIBA_drugs = pd.read_csv('data/KIBA_drugs.csv')

# Collect the one-row frames and concatenate once: DataFrame.append copied the
# whole table on every iteration and no longer exists in pandas >= 2.0.
descriptor_frames = []
for chembl_id in tqdm(KIBA_drugs['drugs']):
    descriptors, status = mol2des(chembl_id, attribs)
    if status:
        descriptor_frames.append(descriptors)

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the original loop produced when no drug succeeded.
fset_mol = pd.concat(descriptor_frames) if descriptor_frames else pd.DataFrame()
fset_mol.to_csv('fset_KIBA_drugs.csv')