In [10]:
# Build the sample keys, one per ligand file, formatted as
# "<receptor>_<ligand>" (e.g. "andr_CHEMBL519112").
data_name = "dude"
data_dir = "./dude"

import os

# Every sub-directory of data_dir is treated as one receptor. os.listdir()
# returns entries in arbitrary order, so sort to get reproducible keys.
list_receptors = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)

valid_keys = []
for receptor in list_receptors:
    # Select ligands by their ".sdf" extension instead of dropping the last
    # directory entry: listing order is not guaranteed, so a positional slice
    # could discard a real ligand and keep a non-ligand file (receptor.pdb).
    list_ligands = sorted(
        name for name in os.listdir(os.path.join(data_dir, receptor))
        if name.endswith(".sdf")
    )
    for ligand in list_ligands:
        # Strip the extension so the key carries only the ligand identifier.
        valid_keys.append(receptor + "_" + os.path.splitext(ligand)[0])

print(valid_keys[:5])

['andr_C36276925', 'andr_CHEMBL519112', 'egfr_C04955856', 'egfr_CHEMBL144760']


In [11]:
# Partition the keys into train and test sets by receptor (gene).
test_gene = [
    'egfr', 'parp1', 'fnta', 'aa2ar', 'pygm', 'kith', 'met', 'abl1', 'ptn1',
    'casp3', 'hdac8', 'grik1', 'kpcb', 'ada', 'pyrd', 'ace', 'aces', 'pgh1',
    'aldr', 'kit', 'fa10', 'pa2ga', 'fgfr1', 'cp3a4', 'wee1', 'tgfr1',
]
# Every receptor that is not held out for testing is used for training.
train_gene = [gene for gene in list_receptors if gene not in test_gene]

# The receptor name is the part of the key before the first underscore.
train_keys = [key for key in valid_keys if key.partition('_')[0] in train_gene]
test_keys = [key for key in valid_keys if key.partition('_')[0] in test_gene]

print(train_keys[:5])
print(test_keys[:5])

# Note: a key containing "CHEMBL" is an active; any other key is an inactive.

['andr_C36276925', 'andr_CHEMBL519112']
['egfr_C04955856', 'egfr_CHEMBL144760']


In [16]:
# Save train and test keys as pickled lists under keys/
import os
import pickle

# Create the output directory up front so open() does not fail with
# FileNotFoundError when the notebook runs in a fresh working directory.
os.makedirs("keys", exist_ok=True)

with open("keys/train_%s.pkl" % data_name, 'wb') as f:
    pickle.dump(train_keys, f)

with open("keys/test_%s.pkl" % data_name, 'wb') as f:
    pickle.dump(test_keys, f)

In [17]:
from rdkit.Chem.rdmolfiles import *
from rdkit.Chem.rdchem import *
from rdkit.Chem.rdmolops import *
from rdkit.Chem.Draw import *
from rdkit.Chem.Lipinski import *

# Setting for auto draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
# Show each atom's index in the structures rendered by the notebook.
IPythonConsole.drawOptions.addAtomIndices = True
# Rendered molecule size as a (width, height) pair -- presumably pixels; TODO confirm.
IPythonConsole.molSize = (1000, 1000)

In [18]:
def load_receptor_ligand_data(keys):
    """Load the ligand and receptor molecules for each "<receptor>_<ligand>" key.

    For every key, the ligand is the first record of
    ``<data_dir>/<receptor>/<ligand>.sdf`` and the receptor is read from
    ``<data_dir>/<receptor>/receptor.pdb``. Two status lines are printed per
    key, reporting whether the ligand and the receptor are not None.

    Args:
        keys: Iterable of key strings. Only the first underscore separates
            the receptor name from the ligand name, so a ligand name may
            itself contain underscores.

    Returns:
        dict mapping each key to a ``(ligand, receptor)`` tuple. Keys that
        share a receptor share the same receptor object.

    Raises:
        ValueError: If a key contains no underscore.
    """
    result_list = {}
    # Each receptor file is parsed once per call and reused for all of its
    # ligands, instead of re-reading the same PDB file for every key.
    receptor_cache = {}

    for key in keys:
        # maxsplit=1 keeps underscores inside the ligand name intact.
        receptor_name, ligand_name = key.split("_", 1)

        # Load ligand
        ligands_sdf = SDMolSupplier("%s/%s/%s.sdf" % (data_dir, receptor_name, ligand_name))
        ligand = ligands_sdf[0]
        print("ligand %s" % ligand_name, ligand is not None)

        # Load receptor
        if receptor_name not in receptor_cache:
            receptor_cache[receptor_name] = MolFromPDBFile(
                "%s/%s/receptor.pdb" % (data_dir, receptor_name)
            )
        receptor = receptor_cache[receptor_name]
        print("receptor %s" % receptor_name, receptor is not None)

        result_list[key] = (ligand, receptor)

    return result_list

In [19]:
# Load and save
def load_and_save_data_by_keys(keys):
    """Load the molecules for `keys` and pickle each pair to ``data/<key>``.

    Args:
        keys: Iterable of "<receptor>_<ligand>" key strings, passed straight
            to ``load_receptor_ligand_data``.

    Returns:
        None. One file per key is written under ``data/``, named after the
        key, holding the pickled value that ``load_receptor_ligand_data``
        returned for that key.
    """
    # Create the output directory before the load so a missing directory
    # cannot make open() fail after all the loading work has been done.
    os.makedirs('data', exist_ok=True)

    loaded = load_receptor_ligand_data(keys)
    for key, data in loaded.items():
        with open('data/' + key, 'wb') as f:
            pickle.dump(data, f)

# Write the pickled (ligand, receptor) pairs for the training split first...
load_and_save_data_by_keys(keys=train_keys)

# ...and then for the held-out test split.
load_and_save_data_by_keys(keys=test_keys)

ligand C36276925 True
receptor andr True
ligand CHEMBL519112 True
receptor andr True
ligand C04955856 True
receptor egfr True
ligand CHEMBL144760 True
receptor egfr True
