In [1]:
import sys
from google.colab import drive

# Mount Google Drive under /content/drive, requesting a forced remount.
drive.mount('/content/drive', force_remount=True)

# Everything installed into this Drive folder becomes importable without re-installing
# once the folder is on sys.path (in this case, only biopython is needed).
installation_path = "/content/drive/MyDrive/Colab_Installations_V2"
# Folder holding the vanilla ProteinMPNN code that later cells import from.
protein_mpnn_path = "/content/drive/MyDrive/Protein_MPNN_Digging/ProteinMPNN/vanilla_proteinmpnn"

# Each path is pushed to the front of sys.path, so the one inserted last is searched first.
for search_path in (installation_path, protein_mpnn_path):
    sys.path.insert(0, search_path)

Mounted at /content/drive


In [2]:
# IPython line magic: switch the kernel's working directory to the Protein_MPNN_Digging folder on Drive.
%cd /content/drive/MyDrive/Protein_MPNN_Digging

/content/drive/MyDrive/Protein_MPNN_Digging


In [3]:
import re
import matplotlib.pyplot as plt
import shutil
import warnings
import numpy as np
import torch
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split, Subset
import copy
import torch.nn as nn
import torch.nn.functional as F
import random
import os
from protein_mpnn_utils import loss_nll, loss_smoothed, gather_edges, gather_nodes, gather_nodes_t, cat_neighbors_nodes, _scores, _S_to_seq, tied_featurize, parse_PDB
from protein_mpnn_utils import StructureDataset, StructureDatasetPDB, ProteinMPNN
from Bio.PDB import *

# Use the first CUDA device when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [31]:
import pandas as pd
from tqdm.notebook import tqdm
from Bio.PDB.Polypeptide import *

In [4]:
# Name of the checkpoint file to use (without the ".pt" extension).
model_name = "v_48_020"
# Weights live in a sub-folder of the ProteinMPNN source directory.
weights_path = os.path.join(protein_mpnn_path, "vanilla_model_weights")
checkpoint_path = os.path.join(weights_path, f"{model_name}.pt")

In [5]:
# Load the checkpoint object from disk so its contents can be inspected and used below;
# map_location places the stored tensors on `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=device) 

In [None]:
# Architecture settings for the ProteinMPNN instance that will receive the checkpoint weights.
hidden_dim = 128
num_layers = 3
# backbone_noise is passed as augment_eps; it is set to 0 here because this is the inference path.
backbone_noise = 0.00

mpnn_kwargs = dict(
    num_letters=21,
    node_features=hidden_dim,
    edge_features=hidden_dim,
    hidden_dim=hidden_dim,
    num_encoder_layers=num_layers,
    num_decoder_layers=num_layers,
    augment_eps=backbone_noise,
    # The neighbour count is read from the checkpoint itself.
    k_neighbors=checkpoint['num_edges'],
)
model = ProteinMPNN(**mpnn_kwargs)
model.to(device)
model.load_state_dict(checkpoint['model_state_dict'])
# Last expression of the cell: put the network in eval mode (the returned module is displayed).
model.eval()

In [7]:
# Show the parameter names stored under 'model_state_dict' in the checkpoint.
print(checkpoint['model_state_dict'].keys())

odict_keys(['features.embeddings.linear.weight', 'features.embeddings.linear.bias', 'features.edge_embedding.weight', 'features.norm_edges.weight', 'features.norm_edges.bias', 'W_e.weight', 'W_e.bias', 'W_s.weight', 'encoder_layers.0.norm1.weight', 'encoder_layers.0.norm1.bias', 'encoder_layers.0.norm2.weight', 'encoder_layers.0.norm2.bias', 'encoder_layers.0.norm3.weight', 'encoder_layers.0.norm3.bias', 'encoder_layers.0.W1.weight', 'encoder_layers.0.W1.bias', 'encoder_layers.0.W2.weight', 'encoder_layers.0.W2.bias', 'encoder_layers.0.W3.weight', 'encoder_layers.0.W3.bias', 'encoder_layers.0.W11.weight', 'encoder_layers.0.W11.bias', 'encoder_layers.0.W12.weight', 'encoder_layers.0.W12.bias', 'encoder_layers.0.W13.weight', 'encoder_layers.0.W13.bias', 'encoder_layers.0.dense.W_in.weight', 'encoder_layers.0.dense.W_in.bias', 'encoder_layers.0.dense.W_out.weight', 'encoder_layers.0.dense.W_out.bias', 'encoder_layers.1.norm1.weight', 'encoder_layers.1.norm1.bias', 'encoder_layers.1.norm2.

In [27]:
# Parse every mutation in the PremPS S2648 dataset into a per-protein lookup.
# `two_level_dict` maps "<lowercase pdb id><mutated chain>" to a LIST of records; each record
# is a dict with two keys:
#   "mut": "<wild residue><position><mutant residue>" rebuilt from the Mutation_PDB column
#   "ddg": the DDGexp value as a float
# Insertion codes are not handled yet: only the first integer found in Mutation_PDB is kept
# as the position, so any trailing insertion-code letter is dropped.
git_url = "https://raw.githubusercontent.com/SajidAhmeduiu/PremPS/main/Datasets/S2648/S2648.txt"
dataset = pd.read_csv(git_url, delimiter="\t")

pdbIds = list(dataset["PDB Id"])
mutChains = list(dataset["Mutated Chain"])
mutations = list(dataset["Mutation_PDB"])
ddgs = list(dataset["DDGexp"])

# Raw string so that "\d" is the regex digit class instead of an invalid string escape.
position_pattern = re.compile(r'-?\d+')

two_level_dict = {}

# zip() has no length, so the total is passed explicitly to give tqdm a real progress bar.
for pdbId, mutChain, mutation, ddg in tqdm(zip(pdbIds, mutChains, mutations, ddgs), total=len(pdbIds)):
    # First (optionally negative) integer in the mutation string is the residue position.
    pos = int(position_pattern.findall(mutation)[0])
    wild = mutation[0]
    mut = mutation[-1]

    pdbId = pdbId.lower()

    inner_dict = {"mut": f"{wild}{pos}{mut}", "ddg": float(ddg)}
    outer_key = f"{pdbId}{mutChain}"
    # Create the per-protein list on first sight, then append this mutation record.
    two_level_dict.setdefault(outer_key, []).append(inner_dict)

0it [00:00, ?it/s]

In [39]:
# Build a residue -> sequence-index lookup for every PDB file in the S2648 directory.
# `mapping_dict` maps "<file name without extension>" (pdbid + chain, e.g. "1lveA") to a dict
# whose keys are "<one-letter residue><PDB residue number>" and whose values are the 0-based
# index of that residue while iterating its chain.
# Insertion codes are ignored for now, so residues that share a residue number and residue
# type collide on the same key; proteins where that happens are collected in
# `proteins_to_skip` and left out of `mapping_dict`.
mapping_dict = {}
pdbDirectory = "/content/drive/MyDrive/ACCRE_PyRun_Setup/S_2648_PDB_Files"
parser = PDBParser(QUIET=True)
# Some proteins need to be skipped for now due to insertion-code (ICODE) related discrepancies
proteins_to_skip = []

for filename in tqdm(os.listdir(pdbDirectory)):
    outer_key = filename.split(".")[0]
    filepath = os.path.join(pdbDirectory, filename)
    structure = parser.get_structure(id=outer_key, file=filepath)
    # Deliberately NOT named `model`: that global holds the loaded ProteinMPNN network and
    # must not be overwritten by a Bio.PDB model object.
    pdb_model = structure[0]
    inner_dict = {}
    skip_flag = False
    # Single-chain assumption in action again: the index restarts at 0 for every chain.
    for chain in pdb_model:
        for i, residue in enumerate(chain):
            inner_key = f"{three_to_one(residue.get_resname())}{residue.get_id()[1]}"
            if inner_key not in inner_dict:
                inner_dict[inner_key] = i
            else:
                # Key collision caused by insertion codes (seen for "2immA:N31" and "1lveA:S27").
                # It is still open whether this affects other positions of the same protein,
                # so the whole protein is excluded for now.
                print("YOU HAVE JUST BEEN FUCKED BY ICODE")
                print(f"{outer_key}:{inner_key}")
                skip_flag = True
    # The ICODE-related problematic proteins will not be considered for now
    if not skip_flag:
        mapping_dict[outer_key] = inner_dict
    else:
        proteins_to_skip.append(outer_key)

  0%|          | 0/131 [00:00<?, ?it/s]

YOU HAVE JUST BEEN FUCKED BY ICODE
1lveA:S27
YOU HAVE JUST BEEN FUCKED BY ICODE
1lveA:S27
YOU HAVE JUST BEEN FUCKED BY ICODE
2immA:N31
YOU HAVE JUST BEEN FUCKED BY ICODE
2immA:N31


In [41]:
# Read the S2648 PDB files one by one, set each up for featurization, and sanity-check that
# the wild-type residue of every listed mutation matches the parsed sequence at the mapped
# 0-based index (each check prints "<expected wild residue>,<residue found in the sequence>").
pdbDirectory = "/content/drive/MyDrive/ACCRE_PyRun_Setup/S_2648_PDB_Files"
parser = PDBParser(QUIET=True)
# Debug cap: only the first MAX_FILES directory entries are visited (skipped proteins count too).
MAX_FILES = 3
for i, filename in enumerate(os.listdir(pdbDirectory)):
    if i >= MAX_FILES:
        break
    # Key shared by two_level_dict / mapping_dict / proteins_to_skip: file name without extension.
    protein_key = filename.split(".")[0]
    # ICODE-related problematic proteins are skipped from the analysis for now
    if protein_key in proteins_to_skip:
        continue

    filepath = os.path.join(pdbDirectory, filename)
    structure = parser.get_structure(id=protein_key, file=filepath)
    # Deliberately NOT named `model`: that global holds the loaded ProteinMPNN network and
    # must not be overwritten by a Bio.PDB model object.
    pdb_model = structure[0]

    # Since there is only one chain, and that same chain is both fixed and designable for
    # different residues, extract its name and put it in the pertinent lists
    chain_name = list(pdb_model.child_dict.keys())[0]
    fixed_chain_list = []
    # The trick is to declare the single chain as designable, and then create the
    # "fixed_positions_dict" dictionary
    designed_chain_list = [chain_name]
    chain_list = list(set(designed_chain_list + fixed_chain_list))

    # Use the program's custom PDB parser for processing the PDB files
    pdb_dict_list = parse_PDB(filepath, input_chain_list=chain_list)
    # The max_length value is taken from the original colab notebook, since all residues need
    # to be processed at the same time. All the PDB files could technically be processed
    # together and put inside one dataset_valid list-like object, but for now everything is
    # kept consistent with the original. Each element of dataset_valid is a dictionary.
    dataset_valid = StructureDatasetPDB(pdb_dict_list, truncate=None, max_length=20000)

    # Parameters that are not relevant to this use case but still have to be sent to the
    # featurizer before running the model forward are set to None.
    # For now, positions are not tied together.
    tied_positions_dict = None
    pssm_dict = None
    omit_AA_dict = None
    bias_AA_dict = None
    bias_by_res_dict = None
    alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
    bias_AAs_np = np.zeros(len(alphabet))

    # Temperature is probably unnecessary here since no sequences are being generated;
    # a value is still kept for code consistency
    temperatures = [0.1]

    # Maps the parsed structure's name to (designed chains, fixed chains).
    # NOTE(review): presumably consumed by tied_featurize — confirm how it affects masking.
    chain_id_dict = {}
    chain_id_dict[pdb_dict_list[0]['name']] = (designed_chain_list, fixed_chain_list)

    BATCH_COPIES = 1

    # Simplifying the sequence generation loop: only the first (and only) dataset entry is used
    protein = dataset_valid[0]
    wildtype_seq = protein[f"seq_chain_{designed_chain_list[0]}"]
    # Only the mutated position in "wildtype_seq" needs to be masked for now

    # "muts_for_prot" lists all the mutations recorded for "protein", whose sequence-only
    # version is "wildtype_seq"
    muts_for_prot = two_level_dict[protein_key]
    # "cur_map_dict" gives the 0-based sequence index for each mutation, which will be used
    # almost directly for masking and then running the model.
    # TODO: confirm whether the fixed positions need to be 0-indexed or 1-indexed
    cur_map_dict = mapping_dict[protein_key]
    for mut in muts_for_prot:
        wild_aa = mut["mut"][0]
        # Drop the trailing mutant residue to get the "<wild><pos>" key used by mapping_dict.
        seq_pos = cur_map_dict[mut["mut"][0:-1]]
        print(f"{wild_aa},{wildtype_seq[seq_pos]}")

Y,Y
Y,Y
V,V
V,V
V,V
F,F
Y,Y
A,A
V,V
W,W
V,V
T,T
G,G
V,V
V,V
W,W
I,I
T,T
E,E
I,I
L,L
V,V
Q,Q
T,T
T,T
N,N
I,I
H,H
H,H
F,F
F,F
R,R
R,R
M,M
D,D
D,D
S,S
S,S
W,W
I,I
I,I
E,E
E,E
K,K
V,V
T,T
T,T
V,V
N,N
N,N
L,L
K,K
K,K
H,H
R,R
R,R
L,L
A,A
L,L
A,A
H,H
I,I
I,I
Q,Q
Q,Q
V,V
D,D
D,D
K,K
K,K
L,L
I,I
I,I
Q,Q
Q,Q
L,L
A,A
F,F
F,F
D,D
D,D
W,W
K,K
K,K
L,L
Q,Q
Q,Q
A,A
A,A
R,R


In [9]:
# Now, have to trace the function protein_mpnn_utils.tied_featurize
# after tracing and understanding this function in a workable way, I can run through model forward, get probabilities, play around, find simple correlation
# submit batch jobs, and then move onto other ideas listed in my workbook, add those slides to the presentation to be shown to Jens in a week
# seems like full-chain design vs. specific position design is the issue here
# Where is the key to going from full-chain-design to specific-position-design?
# Dig into the "mask" and "chain_M" variables of the forward method of "ProteinMPNN" for figuring out how to control "designable" vs. "fixed" residues
# How to populate the "fixed_positions_dict" dictionary? putting the only chain as designable and populating this dictionary 
# seems like "fixed_pos_list" can be a 1-indexed integer list corresponding to positions in "chain_seq"
# I think, "fixed_position_mask" and "chain_mask" interact with each other at some point
# very possibly, there could be an element-wise product going on at some point during decoding, which prevents fixed_positions in designable chains from being masked out