In [1]:
# get library

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
from jax import random, vmap
import numpyro
from numpyro.diagnostics import hpdi
import numpyro.distributions as dist
from numpyro import handlers
from numpyro.infer import MCMC, NUTS
import jax.numpy, jax.scipy
import scipy.stats
from Bio.Seq import Seq
from jax.lib import xla_bridge
print(xla_bridge.get_backend().platform)

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)
    
from sklearn.preprocessing import OneHotEncoder
import pickle



gpu
Sun Dec  4 09:18:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3090    On   | 00000000:01:00.0 Off |                  N/A |
|  0%   56C    P2    36W / 420W |    283MiB / 24265MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Pr

In [2]:
###############################
#
#  Import data for scramble sequences
#
###############################

df = pd.read_csv('./K50_scrambles_for_STEP3.csv')
df

Unnamed: 0,name,dna_seq,log10_K50_t,log10_K50_c,aa_seq_for_unfolded_model,group,aa_seq_experimental
0,2MLB.pdb_scramble_50%_1,ACCAAAGAAGAATTCGCGGGTTCTCGTGAATCTCTGCTCACTGATA...,-0.283107,-1.415875,XXXXGGGTKEEFAGSRESLLTDTPTGTTTSEDDITIVLTAHTDTLG...,experimental_data,TKEEFAGSRESLLTDTPTGTTTSEDDITIVLTAHTDTLGTTPDKYT...
1,2MLB.pdb_scramble_50%_2,ACCGACCTGACTGACGAAACCGCGAAAACCCACATCGAAAAATCTG...,-0.915977,-1.815945,XXXXGGGTDLTDETAKTHIEKSGPTKVDEELGDLTKTTGTPGSGRL...,experimental_data,TDLTDETAKTHIEKSGPTKVDEELGDLTKTTGTPGSGRLGTHKNKK...
2,2MLB.pdb_scramble_50%_3,ACCGAAGGTCCGTCTACTCCGCGTACCTTCACCAACGAAGAACCGA...,-1.416164,-1.749838,XXXXGGGTEGPSTPRTFTNEEPILRRANLEEVTIKEGETHLGKEKT...,experimental_data,TEGPSTPRTFTNEEPILRRANLEEVTIKEGETHLGKEKTDTSDTTI...
3,2MLB.pdb_scramble_50%_4,ATTACCACCTCCGAACACTACACCAAAGAAGGTAAAGTTCTGCGTC...,-0.768396,-2.253292,XXXXGGGITTSEHYTKEGKVLRLTTTTKNLGTAHDTKTKTSTTDGE...,experimental_data,ITTSEHYTKEGKVLRLTTTTKNLGTAHDTKTKTSTTDGETFEEGGE...
4,2MLB.pdb_scramble_50%_5,ACCGAAGAACTGATCACCCGTGAAACCGACCTCACCACTCACACCA...,-1.260843,-2.451885,XXXXGGGTEELITRETDLTTHTNDDILTLKGTKTKESTHLTTRNGT...,experimental_data,TEELITRETDLTTHTNDDILTLKGTKTKESTHLTTRNGTGILLTVK...
...,...,...,...,...,...,...,...
127712,2ICT.pdb_scramble_100%_11_72aas,CAACTGGCGCATGAAGCGGCGTTCGGTCTGGTTGAAATCCTGCCGG...,-1.297789,-0.615096,GGGGAGSQLAHEAAFGLVEILPEQRAPNAILLPFSEPASDSEDDSE...,duplicated_data,QLAHEAAFGLVEILPEQRAPNAILLPFSEPASDSEDDSERRGREQQ...
127713,2ICT.pdb_scramble_100%_12_72aas,GCGCACATCCTGGAAGACGTTATCGACGGTCCAGAGGAGCCAGAAA...,-0.634577,-1.477620,GGGGAGSAHILEDVIDGPEEPENAVILQLRLEDFSESESFLDAAEE...,duplicated_data,AHILEDVIDGPEEPENAVILQLRLEDFSESESFLDAAEEERPRRFP...
127714,2ICT.pdb_scramble_100%_13_72aas,AACCGTCGTTTCCAGTCTGTTGCGCGTGCGGAGCCGGACATCCACC...,-1.407286,-1.433920,GGGGAGSNRRFQSVARAEPDIHPEFSAAHLLEIAEDFSPRRGDVAE...,duplicated_data,NRRFQSVARAEPDIHPEFSAAHLLEIAEDFSPRRGDVAEDGEIIND...
127715,2ICT.pdb_scramble_100%_14_72aas,CTGTTTGGTCTGATCATCAACGAAGAACGTCCGGAAGAGCGTTCTC...,-1.613882,-2.624576,GGGGAGSLFGLIINEERPEERSPRARLSASQHRLLFSQERPESRPN...,duplicated_data,LFGLIINEERPEERSPRARLSASQHRLLFSQERPESRPNALLIRLI...


In [3]:
# make onehot vector data from amino acid sequences
# seq_input: onehot vector representing amino acid sequences [# of scrambles, # of 20 amino acids +' X' (21), length of amino acids +6x Gly (86)]

seqs = np.array([[x for x in s] for s in df['aa_seq_for_unfolded_model']])


aas='ACDEFGHIKLMNPQRSTVWXY'
enc=OneHotEncoder()
enc.fit(np.array([aa for aa in aas]).reshape(-1, 1))
seq_inputs = np.array([enc.transform(s.reshape([-1,1])).toarray().T for s in seqs])
print (np.shape(seq_inputs))



(127717, 21, 86)


In [4]:
###############################
#
#  Model to calculate unfolded K from K50_t,c and amino acid sequences
#
###############################

def kunfold_from_sequence_twoprotease(kT_kC, seq_inputs, filter_center, filter_sigma,filter_lowlim,filter_highlim):
    # kT_kC: log10 K50 values in trypsin and chymotrypsin challenge [[＃ of scrambles (for trypsin)],[＃ of scrambles (for chymotrypsin)]]
    # seq_inputs: onehot vector representing amino acid sequences [# of scrambles, # of 20 amino acids +' X' (21), length of amino acids +6x Gly (86)]
    # filter_center: prior of centers of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
    # filter_sigma: prior of sigma of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
    # filter_lowlim: prior of lower limit of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
    # filter_highlim: prior of upper limit of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
    
###############################
#  Set up the parameters for protease filter (PSSM)
###############################
    
    # protease_filter:  convolutional filter were sampled in truncated normal distribution [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
    protease_filter = numpyro.sample("protease_filter", dist.TruncatedDistribution(dist.Normal(filter_center, filter_sigma),filter_lowlim,filter_highlim))   
    
    # filtered_seqs: filtered onehot vectors [# of scrambles, trypsin/chymotrypsin(2) , length of amino acids +6x Gly (86) - length of filter (9) + 1 (78)]
    filtered_seqs=jax.lax.conv_general_dilated(seq_inputs, protease_filter, [1], "VALID")

###############################
#  Calculate local protease sensitivity (Site saturation)
###############################
    # saturated_sites: local protease sensitivity with satulation [# of scrambles, trypsin/chymotrypsin(2) , length of amino acids +6x Gly - length of window + 1 (78)]
    # saturated_sites = logistic(PSSM(aa_site,site))
    saturated_sites=jax.scipy.special.expit(filtered_seqs)
    
###############################
#  Calculate K50,U based on the sum of saturated sites
###############################
    # sum_saturated_sites: sum of local protease sensitivity [# of scrambles, trypsin/chymotrypsin(2)]
    # sum_saturated_sites = sum(saturated_sites)
    sum_saturated_sites = jax.numpy.sum(saturated_sites,axis=2)
    numpyro.deterministic('sum_saturated_sites', sum_saturated_sites)

    #bin_num: the number of logistic functions for the final activation
    bin_num = 10
    # change the shape of sum_saturated_sites
    # sum_saturated_sites: sum of local protease sensitivity [# of scrambles, trypsin/chymotrypsin(2), # of bins (10)]        
    sum_saturated_sites = jax.numpy.transpose(jax.numpy.resize(sum_saturated_sites,(bin_num,len(saturated_sites),2)),(1,2,0))
       
    #logstic_center_TC: 10 (bin_num) thresholds for the final activation [trypsin/chymotrypsin(2), # of bins (10)]
    logstic_center_TC = numpyro.sample("logstic_center_TC", dist.Normal( np.resize(np.linspace(0, 20, num=bin_num),(2,bin_num)), 2)) 
    
    #min/max_K50unfolded_TC: min/max of K50,U for final activation [-, trypsin/chymotrypsin(2)]
    min_K50unfolded_TC = numpyro.sample("min_K50unfolded_TC", dist.Normal(np.resize(-4, (1,2)) , np.resize(1, (1,2))))
    max_K50unfolded_TC = numpyro.sample("max_K50unfolded_TC", dist.Normal(np.resize(2, (1,2)) , np.resize(1, (1,2))))
    
    # K50unfolded_TC: log10 K50 unfolded values for trypsin/chymotrypsin [# of scrambles, trypsin/chymotrypsin(2)]
    # K50unfolded = max_K50,U - Scale * sum_saturated_sites
    K50unfolded_TC = max_K50unfolded_TC - (max_K50unfolded_TC - min_K50unfolded_TC)*jax.numpy.sum(jax.scipy.special.expit(sum_saturated_sites-logstic_center_TC),axis=2)/bin_num
    numpyro.deterministic('K50unfolded_TC', K50unfolded_TC)
    
###############################
#  Calculate theoretical K50_t/c using K50_unfolded, K50_folded 
###############################
    # K50folded_TC: log10 K50 folded values for trypsin/chymotrypsin [-, trypsin/chymotrypsin(2)]
    K50folded_TC = np.resize([3.0, 2.0], (1,2))
    
    # deltaG: folding stability for each scrambles [# of scrambles], sampled in wide normal distribution, shared between trypsin and chymotrypsin challenge
    deltaG = numpyro.sample("deltaG", dist.Normal(np.resize(-1, (len(kT_kC), 1)), 4))
    
    # fraction_unfolded: fraction of unfolded state calculated by deltaG [# of scrambles]
    # fraction_unfolded = 1/(1+ΔG/RT) 
    fraction_unfolded = 1.0 / (1.0 + jax.numpy.exp(deltaG / 0.58))
    
    # TC_theo: theoretical log10 K50 values for trypin/chymotrypsin computed fraction_unfolded (i.e. deltaG) [# of scrambles, trypsin/chymotrypsin(2)]
    # 1/K50 = fraction_unfolded/K50,U + (1-fraction_unfolded)/K50,F 
    TC_theo = -jax.numpy.log10( ((10.0 ** -K50unfolded_TC   ) * fraction_unfolded) +  (10.0 ** -K50folded_TC)*(1-fraction_unfolded))
    numpyro.deterministic('TC_theo', TC_theo)

###############################
#  Fitting paramters by assuming theoretical K50 values match observed K50 values
###############################
    # TC_sigma: uncertainty of K50 values [# of scrambles, trypsin/chymotrypsin(2)]
    sigma = numpyro.sample("sigma", dist.Exponential(1))    
    TC_sigma = jax.numpy.resize(jax.numpy.array([sigma, sigma]), np.shape(TC_theo))
    numpyro.sample("obs_kT_kC", dist.TransformedDistribution(dist.Normal(0, 1), dist.transforms.AffineTransform(TC_theo, TC_sigma)),obs = kT_kC)

In [5]:
###############################
#
#  Set up parameters for priors for protease filter center
#
###############################

# filter_center: prior of centers of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
# filter_sigma: prior of sigma of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
# filter_lowlim: prior of lower limit of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]
# filter_highlim: prior of upper limit of convolutional filter [trypsin/chymotrypsin(2), # of 20 amino acids +' X' (21), length of filter (9)]

# priors for protease filter center
filter_center=np.ones((2,21,9)) * -0.3 



# Wider sigma for center
filter_sigma=np.ones((2,21,9))* 0.1 
filter_sigma[:,:,4] = 2

# Wider sigma for known inhibitory amino acids
filter_sigma[:,aas.index('D'),:] = 2 
filter_sigma[:,aas.index('E'),:] = 2
filter_sigma[:,aas.index('P'),:] = 2


# Wider sigma for known cleavable sites
# basic amino acids (KR) and aromatic amino aicds (FYW) are cleavage sites for trypsin/chymotrypsin
filter_sigma[0,aas.index('K'),4] = 4 
filter_sigma[0,aas.index('R'),4] = 4
filter_sigma[1,aas.index('F'),4] = 4
filter_sigma[1,aas.index('W'),4] = 4
filter_sigma[1,aas.index('Y'),4] = 4

# Upper limit for inhibitory amino acids
filter_highlim=np.ones((2,21,9)) * 10 
filter_highlim[:,aas.index('D'),:] = 0
filter_highlim[:,aas.index('E'),:] = 0
filter_highlim[:,aas.index('P'),:] = 0
filter_highlim[:,aas.index('D'),4] = -1
filter_highlim[:,aas.index('E'),4] = -1
filter_highlim[:,aas.index('P'),4] = -1

# Lower limit for known cleavable sites
filter_lowlim=np.ones((2,21,9)) * -10
filter_lowlim[0,aas.index('R'),:] = -1
filter_lowlim[1,aas.index('F'),:] = -1
filter_lowlim[1,aas.index('Y'),:] = -1
filter_lowlim[1,aas.index('W'),:] = -1
filter_lowlim[0,aas.index('R'),4] = 1
filter_lowlim[1,aas.index('F'),4] = 1
filter_lowlim[1,aas.index('Y'),4] = 1
filter_lowlim[1,aas.index('W'),4] = 1



In [None]:
# stack K50 trypsin and chymotrypsin
kT_kC = np.stack((df['log10_K50_t'].values,df['log10_K50_c'].values)).T

# run the model
rng_key = random.PRNGKey(0)
rng_key, rng_key_ = random.split(rng_key)

kernel = NUTS(kunfold_from_sequence_twoprotease)

mcmc = MCMC(kernel, num_warmup=500, num_samples=100,num_chains=2)
mcmc.run(rng_key_, kT_kC=kT_kC, seq_inputs=seq_inputs, filter_center = filter_center, filter_sigma=filter_sigma,filter_lowlim=filter_lowlim,filter_highlim=filter_highlim,extra_fields=("potential_energy",))
samples_1 = mcmc.get_samples(group_by_chain=True)

warmup:   1%| | 8/600 [00:39<1:09:21,  7.03s/it, 1023 steps of size 1.07e-03. acc. p

In [None]:
# remove unneeded parameters
conv_params = samples_1.copy()
conv_params.pop('TC_theo')
conv_params.pop('deltaG')
conv_params.pop('K50unfolded_TC')
conv_params.pop('sigma')
conv_params.pop('sum_saturated_sites')
conv_params

In [14]:
# save the parameters needed for unfolded K50 prediciton as pickle file
with open('STEP3_out_unfolded_model_params', 'wb') as p:
    pickle.dump(conv_params, p)