In [1]:
from google.colab import drive
import sys

# Mount Google Drive at /content/drive (a remount is forced on every run).
drive.mount('/content/drive', force_remount=True)

# Folder on Drive that holds packages installed in an earlier session, so they
# can be imported without re-installing (only biopython is needed here).
installation_path = "/content/drive/MyDrive/Colab_Installations_V2"
# Folder later used to locate the model weights; presumably also where
# protein_mpnn_utils lives -- verify against the Drive layout.
protein_mpnn_path = "/content/drive/MyDrive/Protein_MPNN_Digging/ProteinMPNN/vanilla_proteinmpnn"

# Prepend both folders to the import search path. protein_mpnn_path is inserted
# last, so it ends up ahead of installation_path on sys.path.
sys.path.insert(0, installation_path)
sys.path.insert(0, protein_mpnn_path)

Mounted at /content/drive


In [2]:
# Make the project folder the working directory for the cells that follow.
%cd /content/drive/MyDrive/Protein_MPNN_Digging

/content/drive/MyDrive/Protein_MPNN_Digging


In [17]:
import re
import matplotlib.pyplot as plt
import shutil
import warnings
import numpy as np
import torch
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split, Subset
import copy
import torch.nn as nn
import torch.nn.functional as F
import random
import os
from protein_mpnn_utils import loss_nll, loss_smoothed, gather_edges, gather_nodes, gather_nodes_t, cat_neighbors_nodes, _scores, _S_to_seq, tied_featurize, parse_PDB
from protein_mpnn_utils import StructureDataset, StructureDatasetPDB, ProteinMPNN
from Bio.PDB import *

# Use the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# Build the path to the checkpoint file of the selected weight set:
# <protein_mpnn_path>/vanilla_model_weights/<model_name>.pt
model_name = "v_48_020"
weights_path = os.path.join(protein_mpnn_path, "vanilla_model_weights")
checkpoint_path = os.path.join(weights_path, f"{model_name}.pt")

In [5]:
# Load the checkpoint object, mapping its tensors onto `device`, so it can be
# inspected; later cells read its 'num_edges' and 'model_state_dict' entries.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=device) 

In [None]:
# Hyperparameters used to rebuild the network before its weights are loaded.
hidden_dim = 128
num_layers = 3
# backbone_noise is passed as augment_eps; it appears to be 0 on the inference
# path, which seems logical since nothing is being trained here.
backbone_noise = 0.00

model = ProteinMPNN(
    num_letters=21,
    node_features=hidden_dim,
    edge_features=hidden_dim,
    hidden_dim=hidden_dim,
    num_encoder_layers=num_layers,
    num_decoder_layers=num_layers,
    augment_eps=backbone_noise,
    # The neighbour count is read from the checkpoint rather than hard-coded.
    k_neighbors=checkpoint['num_edges'],
)
model.to(device)
# Restore the stored parameters, then switch the network to evaluation mode.
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

In [13]:
# Show the names of the entries stored in the checkpoint's state dict.
print(checkpoint['model_state_dict'].keys())

odict_keys(['features.embeddings.linear.weight', 'features.embeddings.linear.bias', 'features.edge_embedding.weight', 'features.norm_edges.weight', 'features.norm_edges.bias', 'W_e.weight', 'W_e.bias', 'W_s.weight', 'encoder_layers.0.norm1.weight', 'encoder_layers.0.norm1.bias', 'encoder_layers.0.norm2.weight', 'encoder_layers.0.norm2.bias', 'encoder_layers.0.norm3.weight', 'encoder_layers.0.norm3.bias', 'encoder_layers.0.W1.weight', 'encoder_layers.0.W1.bias', 'encoder_layers.0.W2.weight', 'encoder_layers.0.W2.bias', 'encoder_layers.0.W3.weight', 'encoder_layers.0.W3.bias', 'encoder_layers.0.W11.weight', 'encoder_layers.0.W11.bias', 'encoder_layers.0.W12.weight', 'encoder_layers.0.W12.bias', 'encoder_layers.0.W13.weight', 'encoder_layers.0.W13.bias', 'encoder_layers.0.dense.W_in.weight', 'encoder_layers.0.dense.W_in.bias', 'encoder_layers.0.dense.W_out.weight', 'encoder_layers.0.dense.W_out.bias', 'encoder_layers.1.norm1.weight', 'encoder_layers.1.norm1.bias', 'encoder_layers.1.norm2.

In [30]:
# Read the PDB files from the directory where the S_2648 PDB files are stored and
# set them up one by one for featurization and a later pass through the model.
pdbDirectory = "/content/drive/MyDrive/ACCRE_PyRun_Setup/S_2648_PDB_Files"
parser = PDBParser(QUIET=True)

# Inputs the featurizer expects but that are not relevant to this use case.
# They are identical for every structure, so they are set once, outside the loop.
# NOTE(review): None is assumed to mean "not used" -- confirm against tied_featurize.
tied_positions_dict = None  # positions are not tied together for now
pssm_dict = None
omit_AA_dict = None
bias_AA_dict = None
bias_by_res_dict = None
alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
bias_AAs_np = np.zeros(len(alphabet))

# Temperature is probably unnecessary here since no sequences are being
# generated; a value is kept for consistency with the original code.
temperatures = [0.1]

# os.listdir returns entries in arbitrary order; sorting makes the set of files
# picked up below the same on every run.
for i, filename in enumerate(sorted(os.listdir(pdbDirectory))):
    # Only the first three files are processed for now.
    if i > 2:
        break
    filepath = os.path.join(pdbDirectory, filename)
    structure = parser.get_structure(id=filename.split(".")[0], file=filepath)
    # Deliberately NOT named `model`: that name is bound to the ProteinMPNN
    # network in an earlier cell and must not be overwritten by a Bio.PDB object.
    pdb_model = structure[0]

    # Assumes each file holds a single chain that is both fixed and designable
    # (for different residues); the first chain id found is used in all three lists.
    chain_name = list(pdb_model.child_dict.keys())[0]
    fixed_chain_list = [chain_name]
    designed_chain_list = [chain_name]
    chain_list = [chain_name]

    # Use ProteinMPNN's own custom PDB parser to process the PDB file.
    pdb_dict_list = parse_PDB(filepath, input_chain_list=chain_list)
    # max_length is taken from the original Colab notebook so that all residues
    # are processed at the same time. All PDB files could technically be
    # processed together and placed in one dataset_valid list-like object, but
    # for now everything is kept consistent with the original flow.
    # Each element of dataset_valid is a dictionary.
    dataset_valid = StructureDatasetPDB(pdb_dict_list, truncate=None, max_length=20000)

In [46]:
# Next step: trace the function protein_mpnn_utils.tied_featurize.
# Once I have traced it and understand it well enough to use it, I can run the model's forward pass, get probabilities, play around, and look for simple correlations,
# then submit batch jobs, move on to the other ideas listed in my workbook, and add those slides to the presentation to be shown to Jens in a week.
# The issue here seems to be full-chain design vs. specific-position design:
# where is the key to going from full-chain design to specific-position design?