In [10]:
import numpy as np
import rdkit.Chem as Chem
import sys
import os
sys.path.append("../../")

import h5py
from tqdm import tqdm
from src.data.utils import pdb_to_rdkit_mol, mol2_to_rdkit_mol
from src.data.utils import rdkit_mol_featurizer
from src.data.pocket_utils import combine_and_filter, remove_waters

In [11]:
def extend_protein_data_dict(protein_data, protein_name, vdw_graph, ionic_graph, cov_graph):
    protein_data[protein_name] = {}
    protein_data[protein_name]['vdw'] = {}
    protein_data[protein_name]['vdw']['node_features'] = vdw_graph[0]
    protein_data[protein_name]['vdw']['edge_features'] = vdw_graph[1]
    protein_data[protein_name]['vdw']['edge_list'] = vdw_graph[2]

    protein_data[protein_name]['ionic'] = {}
    protein_data[protein_name]['ionic']['node_features'] = ionic_graph[0]
    protein_data[protein_name]['ionic']['edge_features'] = ionic_graph[1]
    protein_data[protein_name]['ionic']['edge_list'] = ionic_graph[2]

    protein_data[protein_name]['cov'] = {}
    protein_data[protein_name]['cov']['node_features'] = cov_graph[0]
    protein_data[protein_name]['cov']['edge_features'] = cov_graph[1]
    protein_data[protein_name]['cov']['edge_list'] = cov_graph[2]

    return protein_data

# Function to save data to HDF5
def save_protein_data_to_hdf5(file_name, protein_data):
    with h5py.File(file_name, 'w') as f:
        for protein, graphs in protein_data.items():
            protein_group = f.create_group(protein)
            for graph_name, graph_data in graphs.items():
                graph_group = protein_group.create_group(graph_name)
                for data_name, data in graph_data.items():
                    graph_group.create_dataset(data_name, data=data)

def load_protein_data_from_hdf5(file_name):
    with h5py.File(file_name, 'r') as f:
        protein_data = {}
        for protein in f.keys():
            protein_data[protein] = {}
            for graph_name in f[protein].keys():
                protein_data[protein][graph_name] = {
                    'node_features': f[protein][graph_name]['node_features'][:],
                    'edge_features': f[protein][graph_name]['edge_features'][:],
                    'edge_list': f[protein][graph_name]['edge_list'][:]
                }
    return protein_data


In [12]:
# protein_data = {
#     'protein_1': {
#         'graph_1': {
#             'node_features': np.random.rand(5, 10),
#             'edge_features': np.random.rand(4, 5),
#             'edge_list': np.array([(0, 1), (1, 2), (2, 3), (3, 4)])
#         },
#         'graph_2': {
#             'node_features': np.random.rand(5, 10),
#             'edge_features': np.random.rand(4, 5),
#             'edge_list': np.array([(0, 1), (1, 2), (2, 3), (3, 4)])
#         },
#         'graph_3': {
#             'node_features': np.random.rand(5, 10),
#             'edge_features': np.random.rand(4, 5),
#             'edge_list': np.array([(0, 1), (1, 2), (2, 3), (3, 4)])
#         }
#     },
#     # Add more proteins as needed
# }
protein_data = {}

In [13]:
#define a timer 
from time import time
global t0
t0 = time()
def time_block():    
    global t0 
    print(f"Time elapsed: {time()-t0:.2f}")
    t0 = time()

In [5]:
from time import time
import os
from tqdm import tqdm
from rdkit import Chem
from contextlib import contextmanager

@contextmanager
def time_block(label):
    start = time()
    try:
        yield
    finally:
        end = time()
        print(f"{label}: {end - start:.2f} seconds")

# Specify the path to the PDB data
#pdb_data = '/Users/tsachmackey/dfs/affinity_net/PDBbind/v2020-other-PL'
pdb_data = '../../test_data/pdb'

# Initialize the protein data dictionary
protein_data = {}

for filename in tqdm(os.listdir(pdb_data)):
    if os.path.isdir(os.path.join(pdb_data, filename)):
        protein_name = filename

        ligand_mol2_path = os.path.join(pdb_data, f'{protein_name}/{protein_name}_ligand.mol2')
        protein_pdb_path = os.path.join(pdb_data, f'{protein_name}/{protein_name}_protein.pdb')

        with time_block("Time for loading files"):
            ligand_mol = mol2_to_rdkit_mol(ligand_mol2_path, sanitize=False)
            protein_mol = pdb_to_rdkit_mol(protein_pdb_path, sanitize=False)
            
        # Set by_residue to True to include entire residues
        with time_block("Time for combining and filtering"):
            pocket_mol_res = combine_and_filter(ligand_mol, remove_waters(protein_mol), by_residue=True)

        # Save the resulting complex_mol to a PDB file
        output_path = os.path.join(pdb_data, f'{protein_name}/{protein_name}_pocket_res.pdb')
        pdb_block = Chem.MolToPDBBlock(pocket_mol_res)

        with time_block("Time for writing to PDB"):
            with open(output_path, 'w') as file:
                file.write(pdb_block)

        with time_block("Time for featurizing"):
            vdw_graph = rdkit_mol_featurizer(pocket_mol_res, "vdw interactions")
            ionic_graph = rdkit_mol_featurizer(pocket_mol_res, "ionic interactions")
            cov_graph = rdkit_mol_featurizer(pocket_mol_res, "covalent bonds")

        protein_data = extend_protein_data_dict(protein_data, protein_name, vdw_graph, ionic_graph, cov_graph)

# Save the example data
save_protein_data_to_hdf5('protein_data.h5', protein_data)


  0%|          | 0/9 [00:00<?, ?it/s]

Time for loading files: 0.01 seconds


 11%|█         | 1/9 [00:00<00:05,  1.43it/s]

Time for combining and filtering: 0.48 seconds
Time for writing to PDB: 0.00 seconds
Time for featurizing: 0.20 seconds
Time for loading files: 0.06 seconds


 44%|████▍     | 4/9 [00:02<00:02,  1.74it/s]

Time for combining and filtering: 1.47 seconds
Time for writing to PDB: 0.00 seconds
Time for featurizing: 0.10 seconds
Time for loading files: 0.03 seconds


100%|██████████| 9/9 [00:03<00:00,  2.92it/s]

Time for combining and filtering: 0.56 seconds
Time for writing to PDB: 0.00 seconds
Time for featurizing: 0.16 seconds





In [6]:
# Load the example data
loaded_protein_data = load_protein_data_from_hdf5('protein_data.h5')

In [7]:
loaded_protein_data

{'1a0q': {'cov': {'node_features': array([[-1.   ,  7.   , 14.007, ...,  1.   ,  0.   ,  0.   ],
          [-1.   ,  1.   ,  1.008, ...,  0.   ,  0.   ,  0.   ],
          [-1.   ,  6.   , 12.011, ...,  1.   ,  0.   ,  0.   ],
          ...,
          [ 1.   ,  1.   ,  1.008, ...,  0.   ,  0.   ,  0.   ],
          [ 1.   ,  1.   ,  1.008, ...,  0.   ,  0.   ,  0.   ],
          [ 1.   ,  1.   ,  1.008, ...,  0.   ,  0.   ,  0.   ]]),
   'edge_features': array([[1., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0.],
          ...,
          [1., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0.],
          [1., 0., 0., 0., 0.]]),
   'edge_list': array([[378, 375],
          [375, 378],
          [814, 811],
          ...,
          [994, 969],
          [977, 995],
          [995, 977]])},
  'ionic': {'node_features': array([[-1.   ,  7.   , 14.007, ...,  1.   ,  0.   ,  0.   ],
          [-1.   ,  1.   ,  1.008, ...,  0.   ,  0.   ,  0.   ],
          [-1.  

In [8]:
os.remove('protein_data.h5')

In [14]:
pdb_data = '/Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set'
new_pdb_data_dir = '/Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set_featurized'

# Initialize the protein data dictionary
protein_data = {}

for filename in tqdm(os.listdir(pdb_data)):
    if os.path.isdir(os.path.join(pdb_data, filename)):
        #make a new directory in the new_pdb_data_dir
        protein_name = filename
        new_protein_dir = os.path.join(new_pdb_data_dir, protein_name)
        os.makedirs(new_protein_dir, exist_ok=True)

        #copy {protein_name}_ligand.mol2, {protein_name}_protein.pdb, {protein_name}_pocket_res.pdb, and protein_data.h5
        #to the new_protein_dir
        os.system(f"cp {pdb_data}/{protein_name}/{protein_name}_ligand.mol2 {new_protein_dir}")
        os.system(f"cp {pdb_data}/{protein_name}/{protein_name}_protein.pdb {new_protein_dir}")
        os.system(f"cp {pdb_data}/{protein_name}/{protein_name}_pocket_res.pdb {new_protein_dir}")
        os.system(f"cp {pdb_data}/{protein_name}/protein_data.h5 {new_protein_dir}")
        
        try: 
            #load in the protein data
            protein_data_i = load_protein_data_from_hdf5(f"{new_protein_dir}/protein_data.h5")
        except:
            print(f"Error loading protein data for {protein_name}")
            continue

        #extend the protein_data dictionary
        protein_data.update(protein_data_i)

#save the data to new_pdb_data_dir/protein_data.h5
save_protein_data_to_hdf5(f"{new_pdb_data_dir}/protein_data.h5", protein_data)

  3%|▎         | 148/5321 [00:02<01:13, 69.99it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3l4y/protein_data.h5: No such file or directory
  3%|▎         | 164/5321 [00:02<01:12, 71.31it/s]

Error loading protein data for 3l4y


  4%|▎         | 188/5321 [00:02<01:13, 70.27it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4kcx/4kcx_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4kcx/protein_data.h5: No such file or directory
  4%|▍         | 203/5321 [00:02<01:13, 69.29it/s]

Error loading protein data for 4kcx


  6%|▌         | 309/5321 [00:04<01:10, 71.09it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4zbf/protein_data.h5: No such file or directory
  6%|▌         | 325/5321 [00:04<01:12, 69.22it/s]

Error loading protein data for 4zbf


  7%|▋         | 353/5321 [00:05<01:16, 65.15it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3l4x/protein_data.h5: No such file or directory
  7%|▋         | 369/5321 [00:05<01:11, 69.58it/s]

Error loading protein data for 3l4x


  8%|▊         | 415/5321 [00:06<01:14, 65.47it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3l4v/protein_data.h5: No such file or directory
  8%|▊         | 430/5321 [00:06<01:13, 66.57it/s]

Error loading protein data for 3l4v


  9%|▊         | 462/5321 [00:06<01:08, 70.45it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3dx2/protein_data.h5: No such file or directory
  9%|▉         | 478/5321 [00:07<01:08, 71.07it/s]

Error loading protein data for 3dx2


 10%|▉         | 508/5321 [00:07<01:12, 66.39it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4bup/protein_data.h5: No such file or directory
 10%|▉         | 524/5321 [00:07<01:09, 69.42it/s]

Error loading protein data for 4bup


 10%|█         | 556/5321 [00:08<01:05, 73.11it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/mol2files_nocharges/mol2files_nocharges_ligand.mol2: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/mol2files_nocharges/mol2files_nocharges_protein.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/mol2files_nocharges/mol2files_nocharges_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/mol2files_nocharges/protein_data.h5: No such file or directory
 11%|█         | 572/5321 [00:08<01:03, 74.71it/s]

Error loading protein data for mol2files_nocharges


 21%|██        | 1119/5321 [00:16<01:02, 67.46it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/2h21/protein_data.h5: No such file or directory
 21%|██        | 1127/5321 [00:16<01:01, 68.25it/s]

Error loading protein data for 2h21


 22%|██▏       | 1178/5321 [00:17<01:03, 65.49it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3r24/protein_data.h5: No such file or directory
 22%|██▏       | 1194/5321 [00:17<01:13, 56.34it/s]

Error loading protein data for 3r24


 25%|██▌       | 1351/5321 [00:20<01:39, 40.01it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/6std/protein_data.h5: No such file or directory
 26%|██▌       | 1361/5321 [00:20<01:43, 38.41it/s]

Error loading protein data for 6std


 27%|██▋       | 1440/5321 [00:22<01:05, 59.02it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/6ift/protein_data.h5: No such file or directory
 27%|██▋       | 1453/5321 [00:22<01:11, 54.35it/s]

Error loading protein data for 6ift


 28%|██▊       | 1470/5321 [00:22<01:26, 44.38it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1nw5/protein_data.h5: No such file or directory
 28%|██▊       | 1482/5321 [00:23<01:17, 49.24it/s]

Error loading protein data for 1nw5


 46%|████▌     | 2424/5321 [00:38<00:48, 59.25it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1ksn/1ksn_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1ksn/protein_data.h5: No such file or directory
 46%|████▌     | 2431/5321 [00:38<00:47, 60.38it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3lpp/protein_data.h5: No such file or directory
 46%|████▌     | 2438/5321 [00:39<00:46, 61.50it/s]

Error loading protein data for 1ksn
Error loading protein data for 3lpp


 49%|████▉     | 2594/5321 [00:41<00:43, 62.21it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3gcp/protein_data.h5: No such file or directory
 49%|████▉     | 2608/5321 [00:41<00:42, 63.84it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/readme/readme_ligand.mol2: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/readme/readme_protein.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/readme/readme_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/readme/protein_data.h5: No such file or directory


Error loading protein data for 3gcp
Error loading protein data for readme


 49%|████▉     | 2622/5321 [00:42<00:43, 62.62it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4qy3/protein_data.h5: No such file or directory
 49%|████▉     | 2630/5321 [00:42<00:41, 64.90it/s]

Error loading protein data for 4qy3


 55%|█████▌    | 2945/5321 [00:48<01:24, 27.99it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1sl3/1sl3_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1sl3/protein_data.h5: No such file or directory
 55%|█████▌    | 2950/5321 [00:48<01:16, 30.81it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3l4z/protein_data.h5: No such file or directory


Error loading protein data for 1sl3
Error loading protein data for 3l4z


 62%|██████▏   | 3304/5321 [01:32<00:48, 41.61it/s]  cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3l4u/protein_data.h5: No such file or directory
 62%|██████▏   | 3314/5321 [01:32<00:49, 40.51it/s]

Error loading protein data for 3l4u


 66%|██████▋   | 3536/5321 [01:38<00:48, 36.83it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/5h5f/protein_data.h5: No such file or directory
 67%|██████▋   | 3542/5321 [01:38<00:42, 41.85it/s]

Error loading protein data for 5h5f


 73%|███████▎  | 3868/5321 [01:47<00:30, 47.43it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/index/index_ligand.mol2: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/index/index_protein.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/index/index_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/index/protein_data.h5: No such file or directory
 73%|███████▎  | 3880/5321 [01:47<00:28, 49.75it/s]

Error loading protein data for index


 85%|████████▍ | 4500/5321 [02:10<00:18, 43.27it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/4ymg/protein_data.h5: No such file or directory
 85%|████████▍ | 4510/5321 [02:10<00:18, 43.97it/s]

Error loading protein data for 4ymg


 86%|████████▌ | 4567/5321 [02:12<00:22, 33.89it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/5bw4/protein_data.h5: No such file or directory
 86%|████████▌ | 4571/5321 [02:12<00:27, 27.54it/s]

Error loading protein data for 5bw4


 87%|████████▋ | 4617/5321 [03:13<04:09,  2.83it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1d7i/protein_data.h5: No such file or directory
 87%|████████▋ | 4620/5321 [03:13<03:08,  3.72it/s]

Error loading protein data for 1d7i


 88%|████████▊ | 4678/5321 [03:16<00:18, 34.62it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/5twj/protein_data.h5: No such file or directory
 88%|████████▊ | 4688/5321 [03:16<00:15, 40.07it/s]

Error loading protein data for 5twj


 89%|████████▉ | 4745/5321 [03:19<00:21, 27.24it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3acl/protein_data.h5: No such file or directory
 89%|████████▉ | 4758/5321 [03:19<00:15, 37.12it/s]

Error loading protein data for 3acl


 91%|█████████ | 4826/5321 [03:21<00:14, 35.02it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1mue/1mue_pocket_res.pdb: No such file or directory
cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/1mue/protein_data.h5: No such file or directory
 91%|█████████ | 4834/5321 [03:21<00:10, 44.70it/s]

Error loading protein data for 1mue


 91%|█████████ | 4852/5321 [03:21<00:09, 47.15it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/2ewa/protein_data.h5: No such file or directory
 91%|█████████▏| 4859/5321 [03:22<00:08, 51.46it/s]

Error loading protein data for 2ewa


 95%|█████████▍| 5033/5321 [03:24<00:04, 66.90it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/3pgl/protein_data.h5: No such file or directory
 95%|█████████▍| 5040/5321 [03:24<00:04, 63.62it/s]

Error loading protein data for 3pgl


 95%|█████████▍| 5053/5321 [03:25<00:05, 53.50it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/5kva/protein_data.h5: No such file or directory
 95%|█████████▌| 5067/5321 [03:25<00:04, 60.54it/s]

Error loading protein data for 5kva


 98%|█████████▊| 5219/5321 [03:27<00:01, 70.10it/s]cp: /Users/tsachmackey/dfs/affinity_net/PDBbind/refined-set/6bm5/protein_data.h5: No such file or directory
 98%|█████████▊| 5227/5321 [03:27<00:01, 70.75it/s]

Error loading protein data for 6bm5


100%|██████████| 5321/5321 [03:29<00:00, 25.39it/s]
