# Sample inference pipeline (& runtime evaluation)

In [1]:
# Import required dependencies
import os
import torch
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from argparse import ArgumentParser
from pytorch_lightning import Trainer
import pytorch_lightning.callbacks as plc
from pytorch_lightning.loggers import TensorBoardLogger

## Graph generation dependencies
from create_graph.Ligand_graph import construct_ligand_graph, convert_to_pyg
from create_graph.Protein_graph import parallel_process_proteins

## Model dependecies
from model import MInterface
from data import DInterface
from utils import load_model_path_by_args, plot_rmsd_metrics

from data.dataset import prepare_data_binary, prepare_data_point, prepare_data_pose, prepare_data_rmsd
from sklearn.model_selection import train_test_split, KFold
from scipy import stats

from utils import ndcg_score


Using device: cuda


## Prepare the graph data

In [2]:
path = "dataset/raw_inference"
save_dir_protein = "dataset/protein_g_inference"
save_dir_ligand = "dataset/ligand_g_inference"

# Generate the index for protein/ligand pairs
index = os.listdir(path)
print(len(index)) # Datapoint count

100


In [None]:
# Protein graph
## Configuration for protein graph construction
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.features.nodes.amino_acid import amino_acid_one_hot
from graphein.protein.edges.distance import (
    add_peptide_bonds,
    add_hydrogen_bond_interactions,
    add_disulfide_interactions,
    add_ionic_interactions,
    add_aromatic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions
)

config = ProteinGraphConfig(
    granularity="centroids",
    node_metadata_functions=[amino_acid_one_hot],
    edge_construction_functions=[
        add_peptide_bonds,
        add_aromatic_interactions,
        add_hydrogen_bond_interactions,
        add_disulfide_interactions,
        add_ionic_interactions,
        add_aromatic_sulphur_interactions,
        add_cation_pi_interactions,
    ]
)

## Construct the protein graph
parallel_process_proteins(index, config, save_dir_protein, num_workers=1, chunk_size=100, path=path)

Starting processing of 100 proteins with 1 workers
ESM processing will be serialized to prevent CUDA memory conflicts


Output()

Output()

ESM processing failed for task 1b0p_TPP_B_1236_A_135898603714240: CUDA out of memory. Tried to allocate 3.74 GiB. GPU 0 has a total capacity of 7.62 GiB of which 1.15 GiB is free. Including non-PyTorch memory, this process has 6.45 GiB memory in use. Of the allocated memory 6.26 GiB is allocated by PyTorch, and 78.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Failed to process 1b0p_TPP_B_1236: ESM processing failed for task 1b0p_TPP_B_1236_A_135898603714240


Output()

ESM task 1amr_PMP_A_413_A_135898603714240: GPU memory before: 2.52GB, after: 2.90GB
Successfully processed 1amr_PMP_A_413


Output()

ESM task 1awb_IPD_A_281_A_135898603714240: GPU memory before: 2.90GB, after: 2.69GB
ESM task 1awb_IPD_A_281_B_135898603714240: GPU memory before: 2.69GB, after: 2.69GB
Successfully processed 1awb_IPD_A_281


ESM task 1b3d_S27_B_401_A_135898603714240: GPU memory before: 2.69GB, after: 2.58GB
ESM task 1b3d_S27_B_401_B_135898603714240: GPU memory before: 2.58GB, after: 2.58GB
Successfully processed 1b3d_S27_B_401


Output()

ESM task 1ajs_PLA_A_415_A_135898603714240: GPU memory before: 2.58GB, after: 2.93GB
ESM task 1ajs_PLA_A_415_B_135898603714240: GPU memory before: 2.93GB, after: 2.93GB
Successfully processed 1ajs_PLA_A_415


Output()

ESM task 1a49_ATP_C_1735_A_135898603714240: GPU memory before: 2.93GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_B_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_C_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_D_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_E_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_F_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
ESM task 1a49_ATP_C_1735_G_135898603714240: GPU memory before: 3.21GB, after: 3.21GB


Output()

ESM task 1a49_ATP_C_1735_H_135898603714240: GPU memory before: 3.21GB, after: 3.21GB
Successfully processed 1a49_ATP_C_1735


Output()

ESM processing failed for task 1ami_MIC_A_755_A_135898603714240: CUDA out of memory. Tried to allocate 1.41 GiB. GPU 0 has a total capacity of 7.62 GiB of which 1.36 GiB is free. Including non-PyTorch memory, this process has 6.24 GiB memory in use. Of the allocated memory 6.04 GiB is allocated by PyTorch, and 92.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Failed to process 1ami_MIC_A_755: ESM processing failed for task 1ami_MIC_A_755_A_135898603714240


ESM task 1aex_THP_A_151_A_135898603714240: GPU memory before: 3.21GB, after: 2.56GB
Successfully processed 1aex_THP_A_151


Output()

ESM task 1aqx_GTD_B_2201_A_135898603714240: GPU memory before: 2.56GB, after: 2.61GB
ESM task 1aqx_GTD_B_2201_B_135898603714240: GPU memory before: 2.61GB, after: 2.61GB
ESM task 1aqx_GTD_B_2201_C_135898603714240: GPU memory before: 2.61GB, after: 2.61GB


Output()

ESM task 1aqx_GTD_B_2201_D_135898603714240: GPU memory before: 2.61GB, after: 2.61GB
Successfully processed 1aqx_GTD_B_2201


Output()

ESM task 1a5w_Y3_A_1_A_135898603714240: GPU memory before: 2.61GB, after: 2.56GB
Successfully processed 1a5w_Y3_A_1
Completed 10/100 proteins. GPU memory: 2.56GB allocated, 2.65GB reserved


ESM task 1aqx_GTD_C_2301_A_135898603714240: GPU memory before: 2.56GB, after: 2.61GB
ESM task 1aqx_GTD_C_2301_B_135898603714240: GPU memory before: 2.61GB, after: 2.61GB
ESM task 1aqx_GTD_C_2301_C_135898603714240: GPU memory before: 2.61GB, after: 2.61GB


Output()

ESM task 1aqx_GTD_C_2301_D_135898603714240: GPU memory before: 2.61GB, after: 2.61GB
Successfully processed 1aqx_GTD_C_2301


ESM task 1aog_MAE_A_500_A_135898603714240: GPU memory before: 2.61GB, after: 3.09GB
ESM task 1aog_MAE_A_500_B_135898603714240: GPU memory before: 3.09GB, after: 3.09GB
Successfully processed 1aog_MAE_A_500


Output()

ESM processing failed for task 1b0p_TPP_A_1236_A_135898603714240: CUDA out of memory. Tried to allocate 3.74 GiB. GPU 0 has a total capacity of 7.62 GiB of which 559.19 MiB is free. Including non-PyTorch memory, this process has 7.05 GiB memory in use. Of the allocated memory 6.85 GiB is allocated by PyTorch, and 96.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Failed to process 1b0p_TPP_A_1236: ESM processing failed for task 1b0p_TPP_A_1236_A_135898603714240


Output()

Output()

ESM task 1amq_PMP_A_413_A_135898603714240: GPU memory before: 3.10GB, after: 2.90GB
Successfully processed 1amq_PMP_A_413


ESM task 1aj2_2PH_A_283_A_135898603714240: GPU memory before: 2.90GB, after: 2.70GB
Successfully processed 1aj2_2PH_A_283


Output()

In [None]:
# Ligand graph generation
## Configuration for ligand graph construction
import graphein.molecule as gm
from functools import partial

config = gm.MoleculeGraphConfig(
    node_metadata_functions=[
        gm.atom_type_one_hot,
        gm.atomic_mass,
        gm.degree,
        gm.total_degree,
        gm.total_valence,
        gm.explicit_valence,
        gm.implicit_valence,
        gm.num_explicit_h,
        gm.num_implicit_h,
        gm.total_num_h,
        gm.num_radical_electrons,
        gm.formal_charge,
        gm.is_aromatic,
        gm.is_isotope,
        gm.is_ring,
        partial(gm.is_ring_size, ring_size=5),
        partial(gm.is_ring_size, ring_size=7)
    ]
)

proteins = pd.Series(index)
graphs = proteins.apply(lambda p: construct_ligand_graph(p, path))
# Convert all graphs
pyg_graphs = [convert_to_pyg(g) for g in graphs]
# Assuming pyg_graphs is a list containing your graph data objects
proteins = proteins.to_list()
os.makedirs(save_dir_ligand, exist_ok=True)
for idx, pyg_graph in enumerate(pyg_graphs):
    protein_name = proteins[idx]
    file_name = f"{save_dir_ligand}/pyg_graph_{protein_name}.pt"
    torch.save(pyg_graph, file_name)
    print(f'{protein_name} saved')


100
1b0p_TPP_B_1236 saved
1amr_PMP_A_413 saved
1awb_IPD_A_281 saved
1b3d_S27_B_401 saved
1ajs_PLA_A_415 saved
1a49_ATP_C_1735 saved
1ami_MIC_A_755 saved
1aex_THP_A_151 saved
1aqx_GTD_B_2201 saved
1a5w_Y3_A_1 saved
1aqx_GTD_C_2301 saved
1aog_MAE_A_500 saved
1b0p_TPP_A_1236 saved
1amq_PMP_A_413 saved
1aj2_2PH_A_283 saved
1a49_ATP_D_2335 saved
1a3u_THP_A_151 saved
1a80_NDP_A_300 saved
1aog_FAD_A_492 saved
1aer_TIA_B_700 saved
1a49_OXL_F_4133 saved
1afq_0FG_B_304 saved
1aej_NVI_A_296 saved
1akd_CAM_A_420 saved
1axd_GGL_C_1 saved
1aiq_CB3_B_267 saved
1ac4_TMT_A_296 saved
1a0g_PMP_A_285 saved
1aiq_UMP_B_266 saved
1a96_PCP_B_301 saved
1afe_ASP_I_55 saved
1a49_OXL_H_5333 saved
1aog_FAD_B_492 saved
1a59_COA_A_380 saved
1anc_BEN_A_290 saved
1abn_NDP_A_351 saved
1ah4_NAP_A_318 saved
1a59_CIT_A_379 saved
1b7y_FYA_A_1002 saved
1axg_NAD_C_403 saved
1af7_SAH_A_287 saved
1a96_PCP_C_302 saved
1a0g_PMP_B_285 saved
1aer_TAD_A_700 saved
1aia_PMP_A_411 saved
1a49_OXL_E_3533 saved
1aj8_CIT_A_1000 saved
1a4i

## Run inference

In [None]:
# TODO: Change the data modules and add a `inference` function in main.py to handle this feature!