In [2]:
import numpy as np
import pandas as pd
import pickle
# from rdkit import Chem
# from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as md
# from rdkit import DataStructs
from chembl_webresource_client.new_client import new_client

In [3]:
# Save dict 
def save_obj(obj, name):
    """Serialize ``obj`` with pickle into ``data/<name>.pkl``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The highest available pickle protocol is used.
    """
    pickle_path = 'data/' + name + '.pkl'
    with open(pickle_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Load dict
def load_obj(name):
    """Read ``data/<name>.pkl`` and return the unpickled object.

    Unpickling can execute arbitrary code embedded in the file, so this
    should only be pointed at pickle files from a trusted source.
    """
    pickle_path = 'data/' + name + '.pkl'
    with open(pickle_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

# Extract drug descriptors from the SMILES string looked up for a ChEMBL ID
def mol2des(chembl_id, attribs):
    """Compute molecular descriptors for one ChEMBL compound.

    The canonical SMILES is taken from the first activity record that the
    ChEMBL client returns for ``chembl_id``. It is parsed with
    ``Chem.MolFromSmiles`` and handed to the descriptor calculator
    ``md(attribs)``.

    Parameters
    ----------
    chembl_id
        Value used as the ``molecule_chembl_id`` filter; stored as ``str`` in
        the ``Drug`` column of the result.
    attribs : list of str
        Descriptor names passed to ``md``; they become the column names.

    Returns
    -------
    (des, status)
        On success, a one-row DataFrame with the columns ``attribs`` plus
        ``'Drug'``, and ``True``. On any per-compound failure (no record, no
        or unparseable SMILES, client or calculation error), ``([], False)``.

    Raises
    ------
    NameError
        If ``Chem``, ``md`` or ``new_client`` is not defined. This is a
        notebook setup problem (e.g. the rdkit imports are commented out),
        not a per-compound failure, so it is not hidden behind
        ``status=False``.
    """
    try:
        smiles = new_client.activity.filter(
            molecule_chembl_id=chembl_id
        ).only(['canonical_smiles'])[0]['canonical_smiles']
        m = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparseable SMILES by returning None rather
        # than raising; treat that as a failed compound instead of passing
        # None on to the descriptor calculator.
        if m is None:
            return [], False
        calc = md(attribs)
        des = pd.DataFrame(calc.CalcDescriptors(m)).T
        des.columns = attribs
        des['Drug'] = str(chembl_id)
    except NameError:
        # Missing import / undefined name: surface it instead of silently
        # reporting every compound as failed.
        raise
    except Exception:
        # Best-effort per compound. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop a long-running loop.
        return [], False
    return des, True

# Load the drug -> activity mapping from data/drug_target.pkl.
# Later cells read it as drug_target[drug][field] with the fields
# 'target_chembl_id', 'type', 'value' and 'units', all indexed by the same
# position i for one measurement.
drug_target = load_obj('drug_target')

In [92]:
# Extract Drug-Target pairs with IC50 values
# NOTE(review): `tqdm` is used here but no visible cell imports it; make sure
# the import cell provides it (e.g. `from tqdm.auto import tqdm`).
#
# Rows are collected in a plain list and converted to a DataFrame once.
# Growing a frame with DataFrame.append inside the loop copies the whole frame
# for every row (quadratic time), and DataFrame.append was removed in pandas 2.0.
ic50_rows = []
for drug in tqdm(drug_target):
    activities = drug_target[drug]
    for i, target in enumerate(activities['target_chembl_id']):
        # Keep only measurements whose type is exactly 'IC50'.
        if activities['type'][i] == 'IC50':
            ic50_rows.append({'drug': drug, 'target': target,
                              'IC50': activities['value'][i], 'unit': activities['units'][i]})
# `columns=` fixes the column order and keeps the columns even with zero rows.
DTI = pd.DataFrame(ic50_rows, columns=['drug', 'target', 'IC50', 'unit'])

# Save to file (the row index is written as the first, unnamed column)
DTI.to_csv('data/DTI.csv')

# Summary statistics over all measurements in drug_target (not only IC50)
units = [drug_target[drug]['type'] for drug in drug_target]
targets = [drug_target[drug]['target_chembl_id'] for drug in drug_target]
flatten_units = [j for sub in units for j in sub]
flatten_targets = [j for sub in targets for j in sub]

print('Measurements recorded: %s IC50, %s Kd and %s Ki'%(flatten_units.count('IC50'), flatten_units.count('Kd'), flatten_units.count('Ki')))
print('Total targets recorded: %s'%len(np.unique(np.array(flatten_targets))))
print('Targets with IC50 values: %s'%(len(DTI['target'].unique())))

DTI.tail()

100%|██████████| 52498/52498 [55:51<00:00, 15.67it/s]


In [None]:
# Extract drug descriptors
# `attribs` is the list of descriptor names passed to mol2des(), which hands
# it to the descriptor calculator `md(attribs)` and uses the same names as the
# column names of the resulting frame.
# NOTE(review): presumably these are RDKit descriptor names (the rdkit imports
# at the top of the notebook are commented out) — confirm against the
# installed RDKit version.
attribs = [
      'MaxAbsPartialCharge', 'MinPartialCharge', 'MinAbsPartialCharge',
      'HeavyAtomMolWt', 'MaxAbsEStateIndex', 'NumRadicalElectrons',
      'NumValenceElectrons', 'MinAbsEStateIndex', 'MaxEStateIndex',
      'MaxPartialCharge', 'MinEStateIndex', 'ExactMolWt', 'MolWt', 'BalabanJ',
      'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n',
      'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc',
      'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10',
      'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2',
      'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
      'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3',
      'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
      'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
      'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7',
      'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'EState_VSA1', 'EState_VSA10',
      'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
      'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
      'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3',
      'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
      'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount',
      'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
      'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
      'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
      'NumRotatableBonds', 'NumSaturatedCarbocycles',
      'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
      'MolMR'
    ]

# Compute descriptors for every drug listed in data/KIBA_drugs.csv and write
# them to a single CSV.
# NOTE(review): `tqdm` is used here but no visible cell imports it; make sure
# the import cell provides it (e.g. `from tqdm.auto import tqdm`).
KIBA_drugs = pd.read_csv('data/KIBA_drugs.csv')

# Collect the per-drug one-row frames and concatenate once. Appending to a
# DataFrame inside the loop copies the accumulated frame on every iteration,
# and DataFrame.append was removed in pandas 2.0.
descriptor_frames = []
for chembl_id in tqdm(KIBA_drugs['drugs']):
    descriptors, status = mol2des(chembl_id, attribs)
    if status:  # skip drugs for which mol2des reported a failure
        descriptor_frames.append(descriptors)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (which is what the accumulate-by-append version produced in that case).
fset_mol = pd.concat(descriptor_frames) if descriptor_frames else pd.DataFrame()
fset_mol.to_csv('fset_KIBA_drugs.csv')

In [33]:
# Extract target descriptors


Unnamed: 0.1,Unnamed: 0,drug,target,IC50,unit
224179,224179,CHEMBL388956,CHEMBL331,1420.0,nM
224180,224180,CHEMBL388956,CHEMBL2185,2140.0,nM
224181,224181,CHEMBL388956,CHEMBL612545,35335.0,nM
224182,224182,CHEMBL388966,CHEMBL2431,10.0,nM
224183,224183,CHEMBL388966,CHEMBL2534,1.0,uM


Measurements recorded: 224184 IC50, 28763 Kd and 79203 Ki
Total targets recorded: 5724
Targets with IC50 values: 3146
