# protein scaffolding


In [1]:
# Print the interpreter's module search path to check which environment the
# kernel is running in and which directories imports will be resolved from
# (e.g. that the RFdiffusion checkout is on the path).
import sys
print(sys.path)

['/home/yuan/anaconda3/envs/SE3nv/lib/python39.zip', '/home/yuan/anaconda3/envs/SE3nv/lib/python3.9', '/home/yuan/anaconda3/envs/SE3nv/lib/python3.9/lib-dynload', '', '/home/yuan/anaconda3/envs/SE3nv/lib/python3.9/site-packages', '/home/yuan/anaconda3/envs/SE3nv/lib/python3.9/site-packages/se3_transformer-1.0.0-py3.9.egg', '/home/yuan/data/RFdiffusion']


In [4]:
# Report which device is available before starting any inference.
import torch

if not torch.cuda.is_available():
    # No CUDA device visible to torch: only warn, nothing else is changed here.
    print("///// NO GPU DETECTED! Falling back to CPU /////")
else:
    # Look up the human-readable name of the currently selected CUDA device.
    device_name = torch.cuda.get_device_name(torch.cuda.current_device())
    print(f"Found GPU with device_name {device_name}. Will run RFdiffusion on {device_name}")

Found GPU with device_name NVIDIA GeForce RTX 3060. Will run RFdiffusion on NVIDIA GeForce RTX 3060


In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import py3Dmol

%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. prepare inputs

### pdb

In [6]:
# Load the target structure from ./pdb, list its chains, and write its
# sequence out as FASTA.
from process_pdb import ProcessPdb

# ProcessPdb is a project-local helper; the argument is the working directory
# that the file names below are given relative to (the saved output shows the
# FASTA being created under ./pdb/).
p = ProcessPdb('./pdb')
p.load_structure('insulin_target.pdb')
# Prints one line per chain with its amino-acid residue count
# (saved output: chain A, 150 residues).
p.get_chains()
p.export_fasta('insulin_target.fasta')

Chain=A, AA residues=150
Try to create ./pdb/insulin_target.fasta...


In [7]:
# Render the target structure inline as a cartoon coloured with the
# 'spectrum' scheme.
# Read the PDB text inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('pdb/insulin_target.pdb') as pdb_file:
    target_pdb_text = pdb_file.read()

# Create a view
view = py3Dmol.view(width=400, height=300)
view.addModel(target_pdb_text, 'pdb')
view.setStyle({'cartoon': {'color': 'spectrum'}})
view.zoomTo()
view.show()

### guided inputs

In [62]:
# Sanity-check the secondary-structure (SS) guide file by loading and
# displaying it. In the saved output it is a 1-D integer tensor with 150
# entries (matching the 150 residues of chain A) taking the values 0, 1 and 2.
# NOTE(review): the meaning of each label (helix / strand / loop) is not
# visible in this notebook — confirm against the script that generated the file.
# NOTE(review): torch.load unpickles the file; only load .pt files you trust.
ss = torch.load('pdb/insulin_target_ss.pt')
ss

tensor([2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

In [63]:
# Sanity-check the block-adjacency guide file by loading and displaying it.
# The saved output is a 2-D float tensor (only zeros are visible in the
# truncated repr).
# NOTE(review): presumably residues x residues — confirm the shape, e.g. with adj.shape.
# NOTE(review): torch.load unpickles the file; only load .pt files you trust.
adj = torch.load('pdb/insulin_target_adj.pt')
adj

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## 2. configurations

In [45]:
import yaml
from pathlib import Path
from omegaconf import DictConfig, OmegaConf

# Load the default parameters from base.yaml and wrap them in an OmegaConf
# config so individual fields can be overridden by attribute access below.
args = yaml.safe_load(Path('./base.yaml').read_text())
conf = OmegaConf.create(args)

# Number of designs to generate and the path prefix used for the output files.
# NOTE(review): the saved log of the run cell shows files named
# 'outputs/scaffold_N', which does not match this prefix — the outputs were
# probably produced before this value was edited; re-run to confirm.
conf.inference.num_designs=10
conf.inference.output_prefix='outputs/demo/scaffold'

# Specify the target protein.
conf.scaffoldguided.target_pdb=True
conf.scaffoldguided.target_path='pdb/insulin_target.pdb'
# Secondary-structure tensor for the target.
conf.scaffoldguided.target_ss='pdb/insulin_target_ss.pt'
# Block-adjacency tensor for the target.
conf.scaffoldguided.target_adj='pdb/insulin_target_adj.pt'
# Enable scaffold-guided design, i.e. specify the fold of the designed protein.
conf.scaffoldguided.scaffoldguided=True
# Directory containing the scaffolds to choose from (scaffold_list itself is
# left unset).
conf.scaffoldguided.scaffold_dir = './ppi_scaffolds'
# The binding (hotspot) residues on the target.
# Presumably chain letter + residue number, e.g. 'A59' — verify the numbering
# against the target PDB.
conf.ppi.hotspot_res = ['A59', 'A83', 'A91']

# Reduce the noise added during inference to 0.
conf.denoiser.noise_scale_ca = 0
conf.denoiser.noise_scale_frame = 0

In [46]:
# Display the scaffoldguided section of the config to verify the overrides
# applied above alongside the defaults that were left untouched.
conf.scaffoldguided

{'scaffoldguided': True, 'target_pdb': True, 'target_path': 'pdb/insulin_target.pdb', 'scaffold_list': None, 'scaffold_dir': './ppi_scaffolds', 'sampled_insertion': 0, 'sampled_N': 0, 'sampled_C': 0, 'ss_mask': 0, 'systematic': False, 'target_ss': 'pdb/insulin_target_ss.pt', 'target_adj': 'pdb/insulin_target_adj.pt', 'mask_loops': True, 'contig_crop': None}

In [47]:
# Initialize sampler and target/contig.
from rfdiffusion.inference import utils as iu

sampler = iu.sampler_selector(conf)

Reading models from /home/yuan/data/RFdiffusion/rfdiffusion/inference/../../models
This is inf_conf.ckpt_path
/home/yuan/data/RFdiffusion/rfdiffusion/inference/../../models/Complex_Fold_base_ckpt.pt
Assembling -model, -diffuser and -preprocess configs from checkpoint
USING MODEL CONFIG: self._conf[model][n_extra_block] = 4
USING MODEL CONFIG: self._conf[model][n_main_block] = 32
USING MODEL CONFIG: self._conf[model][n_ref_block] = 4
USING MODEL CONFIG: self._conf[model][d_msa] = 256
USING MODEL CONFIG: self._conf[model][d_msa_full] = 64
USING MODEL CONFIG: self._conf[model][d_pair] = 128
USING MODEL CONFIG: self._conf[model][d_templ] = 64
USING MODEL CONFIG: self._conf[model][n_head_msa] = 8
USING MODEL CONFIG: self._conf[model][n_head_pair] = 4
USING MODEL CONFIG: self._conf[model][n_head_templ] = 4
USING MODEL CONFIG: self._conf[model][d_hidden] = 32
USING MODEL CONFIG: self._conf[model][d_hidden_templ] = 32
USING MODEL CONFIG: self._conf[model][p_drop] = 0.15
USING MODEL CONFIG: sel

In [48]:
# Check which sampler class was selected for this config
# (the saved output shows ScaffoldedSampler).
type(sampler)

rfdiffusion.inference.model_runners.ScaffoldedSampler

In [49]:
# Determine the index of the first design to generate and store it in the
# config.
# glob and re are imported here because no other cell in the notebook brings
# them into scope; without them the -1 branch below raises NameError.
import glob
import re

# default is zero
design_startnum = sampler.inf_conf.design_startnum
if design_startnum == -1:
    # -1 means "continue after the existing outputs": scan the files matching
    # '<output_prefix>*.pdb', collect their trailing '_<number>' indices and
    # start one past the largest. With no matching files this yields 0.
    index_pattern = re.compile(r".*_(\d+)\.pdb$")  # raw string: \d is a regex escape
    indices = [-1]
    for existing_path in glob.glob(sampler.inf_conf.output_prefix + "*.pdb"):
        found = index_pattern.match(existing_path)
        if found:
            indices.append(int(found.group(1)))
    design_startnum = max(indices) + 1
conf.inference.design_startnum = design_startnum

## 3. design

In [50]:
# Run the model: generate the designs with the sampler configured above.
# RunRfd is a project-local wrapper; per the saved log it writes each design
# plus a .trb metadata file, and each design took roughly 1.6 minutes.
from run_rfd import RunRfd

c = RunRfd(sampler)
# NOTE(review): overwrite=True presumably regenerates designs whose output
# files already exist — confirm in run_rfd.
c.run(overwrite=True)

Making design outputs/scaffold_0
Scaffold constrained based on file:  HHH_b2_03617
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNHIVLNKDDNEEC
meta data: outputs/scaffold_0.trb
Finished design in 1.64 minutes
Making design outputs/scaffold_1
Scaffold constrained based on file:  HHH_b1_04678
With this beta schedule (linear schedule, beta_0 = 0.04, beta_T = 0.28), alpha_bar_T = 0.00013696048699785024
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNHIVLNKDDNEEC
meta data: outputs/scaffold_1.trb
Finished design in 1.59 minutes
Making design outputs/scaffold_2
Scaffold constrained based on file:  

In [51]:
# View the structure of one design. The argument is the design index; 9 is
# the last of the 10 designs requested when numbering starts at 0.
# The call prints and returns the path of the PDB file it displays.
c.display_pdb(9)

outputs/scaffold_9.pdb


'outputs/scaffold_9.pdb'

In [52]:
# Load the metadata saved with design 9 and show its 'plddt' entry.
# In the saved output this is a 2-D float32 array with values between 0 and 1.
# NOTE(review): presumably one row per denoising step and one column per
# residue — confirm the shape with trb['plddt'].shape.
trb = c.from_trb(9)
trb['plddt']

array([[0.10233958, 0.10923111, 0.09795915, ..., 0.9931515 , 0.994971  ,
        0.99746275],
       [0.13933015, 0.14439552, 0.13467807, ..., 0.99305123, 0.995118  ,
        0.99780923],
       [0.1960969 , 0.19049552, 0.191713  , ..., 0.99562174, 0.996843  ,
        0.9983656 ],
       ...,
       [0.9701367 , 0.9764459 , 0.9791571 , ..., 0.9996361 , 0.9997456 ,
        0.9997075 ],
       [0.97505516, 0.9810769 , 0.98291683, ..., 0.99968505, 0.9997794 ,
        0.9997334 ],
       [0.98005533, 0.9852895 , 0.9868253 , ..., 0.9997398 , 0.9998027 ,
        0.9997579 ]], dtype=float32)

In [53]:
# List the chains of design 9. In the saved output chain A has 53 residues
# and chain B has 150 (the same length as the target).
# NOTE(review): './outputs' is hard-coded and does not follow
# conf.inference.output_prefix ('outputs/demo/scaffold'); after a fresh run
# this may read a stale file — confirm the path.
# NOTE(review): this rebinds `p`, which earlier referred to the ProcessPdb for
# the target in ./pdb.
p = ProcessPdb('./outputs')
p.load_structure('scaffold_9.pdb')
p.get_chains()

Chain=A, AA residues=53
Chain=B, AA residues=150


## 4. analysis