# overview

We start from the raw PDBbind dataset downloaded from http://www.pdbbind.org.cn/download.php

1. Filter out the entries whose ligand file cannot be processed by RDKit.

2. Process the protein by preserving only the chains that have at least one atom within 10Å of any atom of the ligand.

3. Use p2rank to segment protein into blocks.

4. extract protein and ligand features.

5. construct the training and test dataset.


In [None]:
import sys

# Put the TankBind source folder at the front of the module search path;
# presumably this is where the project modules imported by later cells live.
tankbind_src_folder_path = "/fs/pool/pool-marsot/tankbind_philip/TankBind/tankbind"
sys.path.insert(0, tankbind_src_folder_path)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# process the raw PDBbind dataset.

In [None]:
from utils import read_pdbbind_data

In [None]:
# The raw PDBbind dataset can be downloaded from http://www.pdbbind.org.cn/download.php
pre = "./pdbbind/pdbbind2020/"

# Name index: lines are split on two consecutive spaces and '#' comments are
# ignored. Only the 'pdb' and 'uid' columns are kept; the single-letter names
# merely give the remaining fields of each line a column to land in.
df_pdb_id = pd.read_csv(
    '/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/index/INDEX_general_PL_name.2020',
    sep="  ",
    comment='#',
    header=None,
    names=['pdb', 'year', 'uid'] + list('defghijklmno'),
    engine='python',
)[['pdb', 'uid']]

# Data index, parsed by the project helper and joined with the uid on the PDB id.
data = read_pdbbind_data(
    '/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/index/INDEX_general_PL_data.2020'
).merge(df_pdb_id, on=['pdb'])


# ligand file should be readable by RDKit.

In [None]:
from feature_utils import read_mol

In [None]:
from rdkit import RDLogger

# Silence RDKit's logging while every ligand file is probed.
RDLogger.DisableLog('rdApp.*')

pdb_list = []     # entries for which read_mol reported no problem
probem_list = []  # entries for which read_mol reported a problem
for pdb in tqdm(data.pdb):
    ligand_stem = f"/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/pdbbind_files/{pdb}/{pdb}_ligand"
    sdf_fileName = f"{ligand_stem}.sdf"
    mol2_fileName = f"{ligand_stem}.mol2"
    mol, problem = read_mol(sdf_fileName, mol2_fileName)
    # File each entry under exactly one of the two lists.
    (probem_list if problem else pdb_list).append(pdb)

In [None]:
data = data.query("pdb in @pdb_list").reset_index(drop=True)

In [None]:
data.shape

### for ease of RMSD evaluation later, we renumber the atom index to be consistent with the smiles

In [None]:
from feature_utils import write_renumbered_sdf

In [None]:
import os

In [None]:
# Write one renumbered ligand file per readable entry into `toFolder`.
toFolder = "/fs/pool/pool-marsot/tankbind_philip/TankBind/data/renumber_atom_index_same_as_smiles"
# os.makedirs raises if the folder cannot be created, whereas the exit status
# of a shelled-out `mkdir -p` was silently ignored.
os.makedirs(toFolder, exist_ok=True)
for pdb in tqdm(pdb_list):
    sdf_fileName = f"/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/pdbbind_files/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/pdbbind_files/{pdb}/{pdb}_ligand.mol2"
    toFile = f"{toFolder}/{pdb}.sdf"
    write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)


# process PDBbind proteins, removing extra chains, cutoff 10A

In [None]:
# Output folder for the proteins reduced to the chains near the ligand.
toFolder = "/fs/pool/pool-marsot/tankbind_philip/TankBind/data/protein_remove_extra_chains_10A/"
# os.makedirs raises on failure instead of returning an ignored shell status.
os.makedirs(toFolder, exist_ok=True)

In [None]:
# One (protein PDB, renumbered ligand SDF, cutoff, output PDB) tuple per entry;
# these are the arguments later mapped over select_chain_within_cutoff_to_ligand_v2.
cutoff = 10
input_ = [
    (
        f"/fs/pool/pool-marsot/tankbind_enzo/bind/dataset/pdbbind_files/{pdb}/{pdb}_protein.pdb",
        f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/renumber_atom_index_same_as_smiles/{pdb}.sdf",
        cutoff,
        f"{toFolder}/{pdb}_protein.pdb",
    )
    for pdb in data.pdb.values
]

In [None]:
from feature_utils import select_chain_within_cutoff_to_ligand_v2

In [None]:
# Run select_chain_within_cutoff_to_ligand_v2 over every argument tuple in
# `input_` using an mlcrate SuperPool.
import mlcrate as mlc
import os
# 20 is presumably the number of worker processes -- confirm against mlcrate.
pool = mlc.SuperPool(20)
# NOTE(review): the underlying pool is restarted before mapping; the reason is
# not visible in this notebook.
pool.pool.restart()
# The mapped return values are discarded; the function is presumably run for
# its side effect of writing each output file -- verify in feature_utils.
_ = pool.map(select_chain_within_cutoff_to_ligand_v2,input_)
pool.exit()

In [None]:
# previously, I found that 2r1w has no chain near the ligand.
data = data.query("pdb != '2r1w'").reset_index(drop=True)

# p2rank segmentation

In [None]:
import os

In [None]:
# Folder for the p2rank run and the dataset list (.ds) that names its inputs.
p2rank_prediction_folder = "/fs/pool/pool-marsot/tankbind_philip/TankBind/data/p2rank_protein_remove_extra_chains_10A"
# os.makedirs raises on failure instead of returning an ignored shell status.
os.makedirs(p2rank_prediction_folder, exist_ok=True)
ds = f"{p2rank_prediction_folder}/protein_list.ds"
with open(ds, "w") as out:
    # One protein file per line, written as a path starting with "../";
    # presumably p2rank resolves it relative to the .ds file -- confirm.
    out.writelines(
        f"../protein_remove_extra_chains_10A/{pdb}_protein.pdb\n"
        for pdb in data.pdb.values
    )

In [None]:
import os
# NOTE(review): os.system executes the command in a subshell, so whatever
# `module load` changes in that shell's environment is gone when it exits and
# is not inherited by later os.system calls. Confirm that Java is actually
# available to the p2rank command run in the next cell.
os.system("module load jdk/17.0.6")

In [None]:
# takes about 30 minutes.
import shlex
import subprocess

# Run p2rank on every protein listed in `ds`, using 20 threads, and write the
# predictions below `{p2rank_prediction_folder}/p2rank`.
p2rank = "bash /fs/pool/pool-marsot/tankbind_philip/TankBind/p2rank/p2rank/prank"
cmd = f"{p2rank} predict {ds} -o {p2rank_prediction_folder}/p2rank -threads 20"
# check=True raises CalledProcessError when p2rank exits with a non-zero
# status, so a failed run stops here instead of surfacing later as missing
# prediction files.
subprocess.run(shlex.split(cmd), check=True)

In [None]:
# index=False keeps the positional index out of the file, so that reading it
# back with pd.read_csv does not add a spurious "Unnamed: 0" column.
data.to_csv("/fs/pool/pool-marsot/tankbind_philip/TankBind/data/data.csv", index=False)

In [None]:
# added by Enzo
data = pd.read_csv(f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/data.csv")

In [23]:
pdb_list = data.pdb.values

In [None]:
name_list = pdb_list

# Gather the p2rank prediction table of every protein into one frame, with the
# `name` column set to the protein the rows belong to.
d_list = []
for name in tqdm(name_list):
    p2rankFile = f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/p2rank_protein_remove_extra_chains_10A/p2rank/{name}_protein.pdb_predictions.csv"
    pockets = pd.read_csv(p2rankFile)
    # Strip the whitespace surrounding the header names.
    pockets.columns = pockets.columns.str.strip()
    d_list.append(pockets.assign(name=name))

d = pd.concat(d_list).reset_index(drop=True)
# `d` already has a fresh 0..n-1 index at this point, so it is saved directly.
d.to_feather("/fs/pool/pool-marsot/tankbind_philip/TankBind/data/p2rank_result.feather")

In [None]:
d = pd.read_feather(f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/p2rank_result.feather")

In [None]:
# Map every protein name to its own pocket table (index reset to 0..n-1).
# The table is split once with groupby instead of re-scanning all rows of `d`
# for each name, which grew with (number of names) x (number of rows).
_pockets_by_name = {key: group for key, group in d.groupby("name", sort=False)}
# Names without any predicted pocket still get an empty frame with the same
# columns, exactly as the boolean filter `d[d.name == name]` would produce.
_no_pockets = d.iloc[0:0]
pockets_dict = {}
for name in tqdm(name_list):
    pockets_dict[name] = _pockets_by_name.get(name, _no_pockets).reset_index(drop=True)

# protein feature

In [None]:
from feature_utils import get_protein_feature

In [25]:
tankbind_data_path = f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data"

In [None]:
# Folder that receives one feature file (`{pdb}.pt`) per protein.
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
# os.makedirs raises on failure instead of returning an ignored shell status.
os.makedirs(protein_embedding_folder, exist_ok=True)
# One (pdb, input protein file, output feature file) tuple per entry; these
# are the arguments unpacked by batch_run.
input_ = [
    (
        pdb,
        f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/protein_remove_extra_chains_10A/{pdb}_protein.pdb",
        f"{protein_embedding_folder}/{pdb}.pt",
    )
    for pdb in pdb_list
]

In [None]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list
import torch
torch.set_num_threads(1)

def batch_run(x):
    """Compute the features of one protein and save them to disk.

    `x` is a `(pdb, proteinFile, toFile)` tuple. The structure in
    `proteinFile` is parsed with Bio.PDB, its residues are passed through
    `get_clean_res_list` (with `ensure_ca_exist=True`) and then handed to
    `get_protein_feature`. The result is written to `toFile` with
    `torch.save` as a single-entry dict keyed by `pdb`.

    Returns None.
    """
    pdb, proteinFile, toFile = x
    structure = PDBParser(QUIET=True).get_structure(pdb, proteinFile)
    res_list = get_clean_res_list(
        structure.get_residues(), verbose=False, ensure_ca_exist=True
    )
    torch.save({pdb: get_protein_feature(res_list)}, toFile)

In [None]:
# Run batch_run over every (pdb, proteinFile, toFile) tuple in `input_` using
# an mlcrate SuperPool.
import mlcrate as mlc
import os
# 20 is presumably the number of worker processes -- confirm against mlcrate.
pool = mlc.SuperPool(20)
# NOTE(review): the underlying pool is restarted before mapping; the reason is
# not visible in this notebook.
pool.pool.restart()
# batch_run returns nothing and saves its result to disk, so the mapped
# values are discarded.
_ = pool.map(batch_run,input_)
pool.exit()

In [None]:
len(protein_dict)

In [None]:
pdb_list

In [None]:
# Merge the per-protein feature files written by batch_run into one dict.
protein_dict = {}
for pdb in tqdm(pdb_list):
    try:
        protein_dict.update(torch.load(f"{protein_embedding_folder}/{pdb}.pt"))
    except Exception as err:
        # Best effort: report the entry together with the reason and carry on.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt still stop the loop.
        print(pdb, err)

# Compound Features

In [24]:
from feature_utils import extract_torchdrug_feature_from_mol
from tqdm.notebook import tqdm
import torch

# compound_dict: pdb id -> ligand features extracted from the renumbered SDF.
# skip_pdb_list: ids whose ligand could not be loaded or featurised; a later
# cell removes these ids from `data`.
compound_dict = {}
skip_pdb_list = []

#added by Enzo: try statements
for pdb in tqdm(pdb_list):
    #added by Enzo as correction of future change
    # NOTE(review): ids that are not exactly 4 characters long are skipped
    # without being recorded in skip_pdb_list -- confirm this is intended.
    if len(pdb) != 4:
        continue
    try:
        mol, _ = read_mol(f"/fs/pool/pool-marsot/tankbind_philip/TankBind/data/renumber_atom_index_same_as_smiles/{pdb}.sdf", None)
        # extract features from sdf.
        compound_dict[pdb] = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)  # self-dock set has_LAS_mask to true
    except Exception as e:
        # Both failure modes (ligand not loadable, features not extractable)
        # are recorded. Previously a failure inside read_mol only printed the
        # id, so the entry stayed in `data` without a compound_dict entry.
        print(e)
        print(pdb)
        skip_pdb_list.append(pdb)
# added by Enzo
torch.save(compound_dict, f"{tankbind_data_path}/compound_dict.pt")

  0%|          | 0/19420 [00:00<?, ?it/s]

Invalid SMILES `Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P+](=O)(=O)[O-])[C@H]2OB3(O[C@H]21)c1ccc(F)cc1C[O-]~3`
5agi

3kqs
Invalid SMILES `Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@@H](=O)[O-])[C@H]2OB3(O[C@H]21)c1ccccc1[C@@H](C[NH3+])[O-]~3`
3zjt
Invalid SMILES `Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P+](=O)(=O)[O-])[C@H]2OB3(O[C@H]21)c1ccc(F)cc1C[O-]~3`
5agj
'NoneType' object has no attribute 'GetConformer'
1g7p


NameError: name 'tankbind_data_path' is not defined

In [27]:
f"{tankbind_data_path}/compound_dict.pt"

'/fs/pool/pool-marsot/tankbind_philip/TankBind/data/compound_dict.pt'

In [None]:
print(len(pdb_list))

8520


In [None]:
torch.save(compound_dict, f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [None]:
skip_pdb_list

In [None]:
data = data.query("pdb not in @skip_pdb_list").reset_index(drop=True)

# construct dataset.

In [None]:
# we use the time-split defined in EquiBind paper.
# https://github.com/HannesStark/EquiBind/tree/main/data
valid = np.loadtxt("/fs/pool/pool-marsot/tankbind_philip/TankBind/packages/EquiBind/data/timesplit_no_lig_overlap_val", dtype=str)
test = np.loadtxt("/fs/pool/pool-marsot/tankbind_philip/TankBind/packages/EquiBind/data/timesplit_test", dtype=str)
def assign_group(pdb, valid=valid, test=test):
    """Return the split name ('valid', 'test' or 'train') for one PDB id.

    Membership in `valid` is checked before membership in `test`; an id found
    in neither is assigned to 'train'. The defaults are the `valid` and `test`
    objects that exist when this function is defined.
    """
    for label, members in (('valid', valid), ('test', test)):
        if pdb in members:
            return label
    return 'train'

data['group'] = data.pdb.map(assign_group)

In [None]:
data.value_counts("group")

In [None]:
data['name'] = data['pdb']

In [None]:
def _info_row(name, block_id, affinity, uid, group, pocket_com=None, use_compound_com=False):
    """Build one row of the `info` table.

    The same name is used for protein and compound, the smiles field is left
    empty (the table iterated here is not read for a smiles value) and
    `use_whole_protein` is always False.
    """
    return [name, name, block_id, "", affinity, uid, pocket_com, use_compound_com, False, group]


# For every entry: one native block, one block centred on the protein, and one
# block per p2rank pocket (at most the first 10 rows of its pocket table).
info = []
for _, entry in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = entry['pdb']
    name = entry['name']
    shared = (entry['affinity'], entry['uid'], entry['group'])

    pocket = pockets_dict[pdb].head(10)
    pocket.columns = pocket.columns.str.strip()
    pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values

    # native block.
    info.append(_info_row(name, pdb, *shared, use_compound_com=True))

    # protein center as a block: mean over axis 0 of the first element of the
    # protein's feature entry, reshaped to (1, 3).
    protein_com = protein_dict[name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
    info.append(_info_row(name, pdb + "_c", *shared, pocket_com=protein_com))

    # one block per pocket, centred on the pocket's predicted center.
    for idx in pocket.index:
        info.append(_info_row(name, f"{pdb}_{idx}", *shared, pocket_com=pocket_coms[idx].reshape(1, 3)))

info = pd.DataFrame(
    info,
    columns=[
        'protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid',
        'pocket_com', 'use_compound_com', 'use_whole_protein', 'group',
    ],
)



In [None]:
info.shape

In [None]:
from data import TankBindDataSet

In [None]:
toFilePre

In [None]:
from pathlib import Path
toFilePre_path = Path(toFilePre)
toFilePre_path.resolve()

In [None]:
import os

In [None]:
# Root folder of the full dataset, located below `pre`.
toFilePre = f"{pre}/dataset"
# os.makedirs raises on failure instead of returning an ignored shell status.
os.makedirs(toFilePre, exist_ok=True)

In [None]:
dataset = TankBindDataSet(toFilePre, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

In [None]:
dataset = TankBindDataSet(toFilePre)

In [None]:
from tqdm.notebook import tqdm

# For every row of the dataset table, record the sizes of the corresponding
# sample: first dimension of 'node_xyz', 'coords' and 'y', plus the number of
# entries of y that are greater than zero.
t = []
data = dataset.data
pre_pdb = None  # not used in this cell
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    d = dataset[i]
    t.append([
        i,
        line['compound_name'],
        d['node_xyz'].shape[0],
        d['coords'].shape[0],
        d['y'].shape[0],
        (d.y > 0).sum(),
    ])



In [None]:
# data = data.drop(['p_length', 'c_length', 'y_length', 'num_contact'], axis=1)

In [None]:
import pandas as pd

In [None]:
t = pd.DataFrame(t, columns=['index', 'pdb' ,'p_length', 'c_length', 'y_length', 'num_contact'])
t['num_contact'] = t['num_contact'].apply(lambda x: x.item())

In [None]:
data = pd.concat([data, t[['p_length', 'c_length', 'y_length', 'num_contact']]], axis=1)

In [None]:
native_num_contact = data.query("use_compound_com").set_index("protein_name")['num_contact'].to_dict()
data['native_num_contact'] = data.protein_name.map(native_num_contact)
# data['fract_of_native_contact'] = data['num_contact'] / data['native_num_contact']

In [None]:
import torch

In [None]:
torch.save(data, f"{toFilePre}/processed/data.pt")

In [None]:
info = torch.load(f"{toFilePre}/processed/data.pt")


In [None]:
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = info.query("group == 'test'").protein_name.unique()

In [None]:
# Protein features restricted to the proteins of the test split.
subset_protein_dict = {pdb: protein_dict[pdb] for pdb in tqdm(test_pdb_list)}

In [None]:
# Compound features restricted to the entries of the test split.
subset_compound_dict = {pdb: compound_dict[pdb] for pdb in tqdm(test_pdb_list)}

In [None]:
# Build a separate dataset that contains only the test split.
toFilePre = "/fs/pool/pool-marsot/tankbind_philip/TankBind/dataset/test_dataset"
# os.makedirs raises on failure instead of returning an ignored shell status.
os.makedirs(toFilePre, exist_ok=True)
dataset = TankBindDataSet(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)