# Simple featurization of protein pockets

Let's try loading our pocket structures and adding simple metrics about them. Maybe these metrics will contain enough information for our model.

In [3]:
import os
from pathlib import Path

from biopandas.pdb import PandasPdb
import numpy as np

In [4]:
# Dataset locations, all resolved relative to the shared data directory
DATA_DIR = "../data"
PDBBIND_DIR = Path(DATA_DIR) / "v2015"
INTERIM_DIR = Path(DATA_DIR) / "interim"
DATASET_PATH = INTERIM_DIR / "reg_preprocessed_1.npz"

# Open the preprocessed archive and pull out the IDs of each split
data = np.load(DATASET_PATH)
train_ids = data["train_ids"]
test_ids = data["test_ids"]

### We're going to encode the properties of each amino acid

Amino acid characteristics are what will go into our pocket features, hopefully helping our model predict which ligands will bind best to which pockets.

In [5]:
# Per-residue property lookup, keyed by three-letter residue code.
# Each row of the table lists, in order:
#   charge, hydrophobic, aromatic, hbond_donor, hbond_acceptor
# All entries are 0/1 flags except charge, which is signed (HIS is set to 0.5).
AA_PROPERTIES = {
    residue: dict(
        zip(
            ("charge", "hydrophobic", "aromatic", "hbond_donor", "hbond_acceptor"),
            values,
        )
    )
    for residue, values in {
        "ALA": (0, 1, 0, 0, 0),
        "ARG": (1, 0, 0, 1, 0),
        "ASN": (0, 0, 0, 1, 1),
        "ASP": (-1, 0, 0, 0, 1),
        "CYS": (0, 1, 0, 0, 0),
        "GLN": (0, 0, 0, 1, 1),
        "GLU": (-1, 0, 0, 0, 1),
        "GLY": (0, 0, 0, 0, 0),
        "HIS": (0.5, 0, 1, 1, 1),
        "ILE": (0, 1, 0, 0, 0),
        "LEU": (0, 1, 0, 0, 0),
        "LYS": (1, 0, 0, 1, 0),
        "MET": (0, 1, 0, 0, 0),
        "PHE": (0, 1, 1, 0, 0),
        "PRO": (0, 1, 0, 0, 0),
        "SER": (0, 0, 0, 1, 1),
        "THR": (0, 0, 0, 1, 1),
        "TRP": (0, 1, 1, 1, 0),
        "TYR": (0, 1, 1, 1, 1),
        "VAL": (0, 1, 0, 0, 0),
    }.items()
}


In [9]:
# Try the loading steps on a single complex first: the first test-set ID
pdb_id = test_ids[0]

# The pocket structure sits in the complex's own folder under PDBBIND_DIR,
# in a file named "<id>_pocket.pdb"
pocket_pdb_path = PDBBIND_DIR / pdb_id / (pdb_id + "_pocket.pdb")

# Fail loudly if the pocket file for this ID is missing on disk
if not os.path.exists(pocket_pdb_path):
    raise ValueError(f"Sorry, this complex has no pocket data: {pdb_id}")

# Parse the pocket PDB file with biopandas
ppdb = PandasPdb().read_pdb(pocket_pdb_path)

ppdb

<biopandas.pdb.pandas_pdb.PandasPdb at 0x7f9175ef1550>

In [None]:
def featurize_pockets_for_ids(pdb_ids: list[str]):
    """Load the pocket structure of every complex listed in ``pdb_ids``.

    For each ID, the file ``<PDBBIND_DIR>/<id>/<id>_pocket.pdb`` is looked up
    and handed to the biopandas PDB reader. The parsed structure is not used
    or returned yet, so nothing is produced beyond the existence check and
    the read itself.

    Args:
        pdb_ids: IDs of the complexes whose pocket files should be loaded.

    Raises:
        ValueError: If the pocket file of an ID does not exist on disk.
    """
    for pdb_id in pdb_ids:
        # Pocket files live in a per-complex folder, named after the ID
        pocket_file = PDBBIND_DIR / pdb_id / (pdb_id + "_pocket.pdb")

        # Stop at the first complex whose pocket file is absent
        if not os.path.exists(pocket_file):
            raise ValueError(f"Sorry, this complex has no pocket data: {pdb_id}")

        # TODO: no features are derived from the parsed pocket yet
        pocket_structure = PandasPdb().read_pdb(pocket_file)

        

# Run the loader over both splits. A ValueError is raised at the first ID
# whose pocket file is missing, so both calls completing means every train
# and test ID has a pocket file on disk.
featurize_pockets_for_ids(train_ids)
featurize_pockets_for_ids(test_ids)

In [3]:
# List the contents of the current working directory
!ls

01_data_formatting.ipynb		  04_pca_on_ligands.ipynb
02_linear_regression_on_signatures.ipynb  05_protein_featurization.ipynb
03_xgboost_on_signatures.ipynb
