# Simple featurization of protein pockets

Let's load our pocket structures and compute a few simple descriptive metrics for each of them. Maybe these metrics alone will carry enough information for our model.

In [1]:
import os
from pathlib import Path

from biopandas.pdb import PandasPdb
import numpy as np

In [2]:
# Load the data
# All locations are derived from DATA_DIR so the notebook can be pointed elsewhere in one place.
DATA_DIR = "../data"
PDBBIND_DIR = Path(DATA_DIR) / "v2015"
INTERIM_DIR = Path(DATA_DIR) / "interim"
DATASET_PATH = INTERIM_DIR / "reg_preprocessed_1.npz"

# Only the ID arrays of the two splits are needed from the archive here.
data = np.load(DATASET_PATH)
train_ids, test_ids = data["train_ids"], data["test_ids"]

### We're going to encode the properties of each amino acid

Amino acid characteristics are what will go into our pocket features, hopefully helping our model predict which ligands bind best to which pockets.

In [3]:
# Amino acid properties
# Every residue is described by the same five descriptors, always in this order.
_AA_PROPERTY_NAMES = ('charge', 'hydrophobic', 'aromatic', 'hbond_donor', 'hbond_acceptor')

# Three-letter code -> (charge, hydrophobic, aromatic, hbond_donor, hbond_acceptor).
# NOTE(review): HIS carries a fractional charge of 0.5 -- presumably to reflect
# partial protonation; confirm this is the intended encoding.
_AA_PROPERTY_TABLE = {
    'ALA': (0, 1, 0, 0, 0),
    'ARG': (1, 0, 0, 1, 0),
    'ASN': (0, 0, 0, 1, 1),
    'ASP': (-1, 0, 0, 0, 1),
    'CYS': (0, 1, 0, 0, 0),
    'GLN': (0, 0, 0, 1, 1),
    'GLU': (-1, 0, 0, 0, 1),
    'GLY': (0, 0, 0, 0, 0),
    'HIS': (0.5, 0, 1, 1, 1),
    'ILE': (0, 1, 0, 0, 0),
    'LEU': (0, 1, 0, 0, 0),
    'LYS': (1, 0, 0, 1, 0),
    'MET': (0, 1, 0, 0, 0),
    'PHE': (0, 1, 1, 0, 0),
    'PRO': (0, 1, 0, 0, 0),
    'SER': (0, 0, 0, 1, 1),
    'THR': (0, 0, 0, 1, 1),
    'TRP': (0, 1, 1, 1, 0),
    'TYR': (0, 1, 1, 1, 1),
    'VAL': (0, 1, 0, 0, 0),
}

# Expand the compact table into the {residue: {property: value}} mapping used downstream.
AA_PROPERTIES = {
    residue: dict(zip(_AA_PROPERTY_NAMES, values))
    for residue, values in _AA_PROPERTY_TABLE.items()
}


In [4]:
# Walk through a single complex first: the first ID of the test split
pdb_id = test_ids[0]

# Pocket structures are stored as <PDBBIND_DIR>/<pdb_id>/<pdb_id>_pocket.pdb
pocket_pdb_path = PDBBIND_DIR / pdb_id / (pdb_id + "_pocket.pdb")

# Fail loudly if this complex ships without a pocket file
if not pocket_pdb_path.exists():
    raise ValueError(f"Sorry, this complex has no pocket data: {pdb_id}")

# Parse the pocket and take its ATOM records
ppdb = PandasPdb().read_pdb(pocket_pdb_path)
df = ppdb.df['ATOM']

# Keep only atoms whose residue is one of the amino acids we have properties for
is_known_residue = df['residue_name'].isin(list(AA_PROPERTIES))
protein_df = df[is_known_residue]

protein_df

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,N,,TYR,,,83,,...,2.506,15.909,2.578,1.0,25.31,,,N,,3
1,ATOM,2,,H,,TYR,,,83,,...,2.601,14.995,3.066,1.0,0.00,,,H,,4
2,ATOM,3,,CA,,TYR,,,83,,...,1.333,16.177,1.759,1.0,25.17,,,C,,5
3,ATOM,4,,C,,TYR,,,83,,...,0.481,17.311,2.345,1.0,24.49,,,C,,6
4,ATOM,5,,O,,TYR,,,83,,...,0.276,17.383,3.551,1.0,24.78,,,O,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,ATOM,383,,O,,ASP,,,207,,...,-1.414,2.375,-10.300,1.0,54.07,,,O,,385
383,ATOM,384,,CB,,ASP,,,207,,...,0.590,1.388,-8.192,1.0,54.52,,,C,,386
384,ATOM,385,,CG,,ASP,,,207,,...,1.222,2.764,-8.427,1.0,56.00,,,C,,387
385,ATOM,386,,OD1,,ASP,,,207,,...,1.160,3.288,-9.563,1.0,58.43,,,O,,388


In [7]:
def featurize_pockets_for_ids(pdb_ids: list[str]) -> tuple[np.ndarray, list[str]]:
    """
    Build one fixed-length feature vector per binding pocket.

    For every ID, the file ``<PDBBIND_DIR>/<id>/<id>_pocket.pdb`` is read and reduced
    to its unique residues among the amino acids listed in ``AA_PROPERTIES``.
    Each feature vector is the concatenation of:

    1. the amino-acid composition: the fraction of residues of each type, ordered
       alphabetically by residue name (one entry per key of ``AA_PROPERTIES``);
    2. the sums over the residues of ``charge``, ``hydrophobic``, ``aromatic``,
       ``hbond_donor`` and ``hbond_acceptor`` (not normalised by pocket size);
    3. the number of residues in the pocket.

    IDs whose pocket file is missing, contains no known residue, or cannot be
    parsed are reported on stdout and skipped (best effort, nothing is raised).

    Parameters
    ----------
    pdb_ids
        Iterable of complex IDs (any str-like items, e.g. a NumPy array of strings).

    Returns
    -------
    features : np.ndarray of shape (n_retained, n_features)
        Always 2-D, including when no pocket could be featurized.
    valid_ids : list
        The IDs that were featurized, in input order; ``valid_ids[i]`` belongs
        to ``features[i]``.
    """
    # Everything below is identical for all pockets, so it is built once up front.
    aa_list = sorted(AA_PROPERTIES)
    property_keys = ('charge', 'hydrophobic', 'aromatic', 'hbond_donor', 'hbond_acceptor')
    # Row i holds the properties of aa_list[i]; counts @ matrix gives per-pocket sums.
    property_matrix = np.array(
        [[AA_PROPERTIES[aa][key] for key in property_keys] for aa in aa_list],
        dtype=float,
    )
    n_features = len(aa_list) + len(property_keys) + 1

    pocket_features = []
    valid_ids = []

    for pdb_id in pdb_ids:
        pocket_pdb_path = Path(PDBBIND_DIR, pdb_id, pdb_id + "_pocket.pdb")

        # Skip if pocket data doesn't exist
        if not os.path.exists(pocket_pdb_path):
            print(f"Skipping {pdb_id}: no pocket data")
            continue

        # Deliberately broad: one unreadable or malformed file must not abort the
        # whole run. The reason is printed so that skipped IDs can be investigated.
        try:
            df = PandasPdb().read_pdb(pocket_pdb_path).df['ATOM']

            # Filter only protein atoms belonging to residues we have properties for
            protein_df = df[df['residue_name'].isin(aa_list)]
            if protein_df.empty:
                print(f"Skipping {pdb_id}: no valid residues")
                continue

            feature_vector = _pocket_feature_vector(protein_df, aa_list, property_matrix)
        except Exception as e:
            print(f"Skipping {pdb_id}: error {e}")
            continue

        pocket_features.append(feature_vector)
        valid_ids.append(pdb_id)

    if not pocket_features:
        # np.array([]) would be 1-D; keep the (rows, features) layout for callers.
        return np.empty((0, n_features)), valid_ids
    return np.array(pocket_features), valid_ids


def _pocket_feature_vector(protein_df, aa_list, property_matrix):
    """Turn the known-residue atoms of one pocket into its feature vector."""
    # One row per residue. The insertion code is part of a residue's identity in
    # PDB files (e.g. 100 and 100A are different residues), so it must be in the key.
    residue_names = protein_df.drop_duplicates(
        subset=['chain_id', 'residue_number', 'insertion', 'residue_name']
    )['residue_name']

    # Residues per amino-acid type, in aa_list order (types that are absent count as 0)
    aa_counts = (
        residue_names.value_counts()
        .reindex(aa_list, fill_value=0)
        .to_numpy(dtype=float)
    )
    num_residues = len(residue_names)

    # Guard against a zero division, although callers only pass non-empty frames
    aa_composition = aa_counts / num_residues if num_residues > 0 else aa_counts
    property_sums = aa_counts @ property_matrix

    return np.concatenate([aa_composition, property_sums, np.array([num_residues])])

# Generate features for train and test sets.
# The number of retained pockets and the matrix shape are printed for each split,
# so that complexes dropped during featurization are visible at a glance.
print("Featurizing train pockets...")
X_train_pockets, train_ids_retained = featurize_pockets_for_ids(train_ids)
print(f"Train: {len(train_ids_retained)} pockets, feature shape: {X_train_pockets.shape}")

print("\nFeaturizing test pockets...")
X_test_pockets, test_ids_retained = featurize_pockets_for_ids(test_ids)
print(f"Test: {len(test_ids_retained)} pockets, feature shape: {X_test_pockets.shape}")

Featurizing train pockets...
Train: 3509 pockets, feature shape: (3509, 26)

Featurizing test pockets...
Test: 195 pockets, feature shape: (195, 26)


In [8]:
# Save the pocket features along with the retained IDs
POCKET_FEATURES_PATH = INTERIM_DIR / "pocket_features.npz"

# Keys of this mapping become the array names inside the .npz archive
pocket_arrays = {
    "X_train_pockets": X_train_pockets,
    "X_test_pockets": X_test_pockets,
    "train_ids_retained": train_ids_retained,
    "test_ids_retained": test_ids_retained,
}
np.savez(POCKET_FEATURES_PATH, **pocket_arrays)

print(f"\nSaved pocket features to {POCKET_FEATURES_PATH}")


Saved pocket features to ../data/interim/pocket_features.npz
