In [2]:
import os
import subprocess


## Run binder diffusion to hotspots!
We used to thread the diffused binders with their sequence and run FastRelax before ProteinMPNN, but that did not yield more candidates.

In [64]:
# --- Run identity and inputs -------------------------------------------------
prefix = "mCD20_dim_sh"  # used for the output folder and the Slurm job name
scaffolds = "/home/tsatler/RFdif/ClusterProteinDesign/scaffolds/rfdiff_scaffolds"
target = "input/mcd20_dim_sh.pdb"
# Passed verbatim to RFdiffusion as a command-line override
hotspots = "ppi.hotspot_res=[A69,A70,A39,A146,A156,A157,A160]"
# Other residue numbers that were considered: 69,70,39,156,157,160,164,168

# --- Diffusion / sequence-design sizes ---------------------------------------
num_of_diffusions = 20   # RF diffusions per job
num_seqs = 30            # MPNN sequences generated per RF diffusion
num_filtered_seqs = 30   # MPNN sequences per RF diffusion that are kept for AF2
diff_steps = 50          # forwarded to RFdiffusion as diffuser.T

# --- AF2 / ProteinMPNN parameters --------------------------------------------
num_recycles = 3         # AF2 recycles
sampling_temp = 0.0001   # ProteinMPNN sampling temperature

# --- Slurm parameters --------------------------------------------------------
num_jobs = 400           # number of array tasks to submit
array_limit = 5          # value after '%' in the --array specification

# Report the total workload implied by the settings above
total_binders = num_jobs * num_of_diffusions
total_sequences = total_binders * num_seqs
print(f"This will diffuse {total_binders} binders and design {total_sequences} sequences in total")

This will diffuse 8000 binders and design 240000 sequences in total


## Run the Slurm array script

In [6]:
# Sanity check: show which run prefix is currently set before building the job script
print(prefix)

mCD20_dim_sh


In [5]:

# Build and write the Slurm array script that, per array task, runs RFdiffusion
# (scaffold-guided docking to the hotspots) followed by ProteinMPNN + AF2.

# The secondary-structure / adjacency tensors are looked up next to the target PDB
# as <target_base>_ss.pt and <target_base>_adj.pt.
target_dir = os.path.dirname(target)
target_base = os.path.basename(os.path.splitext(target)[0])
ss = os.path.join(target_dir, f"{target_base}_ss.pt")
adj = os.path.join(target_dir, f"{target_base}_adj.pt")

output_folder = f"output/{prefix}"
os.makedirs(output_folder, exist_ok=True)

# Each array task appends "_<SLURM_ARRAY_TASK_ID>" to this path to get its own folder
rf_out_folder = f"{output_folder}/rf_dock/{prefix}"

# Notes on the template below:
# * It is a non-raw f-string: a backslash at the end of a line joins that line with the
#   next one in the generated script, and doubled braces emit literal braces for bash.
# * The hotspot override is double-quoted so bash does not treat its [...] as a glob.
# * The .trb cleanup keeps the glob OUTSIDE the quotes. The previous form
#   ("$rf_out*.trb" fully quoted) never expanded, so no .trb file was ever removed.
#   NOTE(review): confirm helper_scripts/colabinder.py does not read the .trb files,
#   since they are now really deleted before it runs.
script = f"""#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:A40:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --job-name={prefix}_binder_docking
#SBATCH --array=0-{num_jobs-1}%{array_limit}

set -e
# Load Anaconda environment
source /home/tsatler/anaconda3/etc/profile.d/conda.sh


########################
# RF diffusion
########################

# Generate ss, adj, and out_dir variables based on target_pdb
# Check if ss and adj files exist
if [ ! -f "{ss}" ] || [ ! -f "{adj}" ]; then
  echo "One or both of the SS and ADJ files do not exist. Running Python script to generate them..."

  conda activate pyro
  # Run Python script to generate ss and adj files
  python helper_scripts/make_secstruc_adj.py --input_pdb "{target}" --out_dir "{target_dir}"
else
  echo "SS and ADJ files already exist."
fi


echo Running RFdiffusion for docking scaffolds to the target...
conda activate SE3nv

rf_out={rf_out_folder}_$SLURM_ARRAY_TASK_ID/{prefix}_$SLURM_ARRAY_TASK_ID
echo rf_out: $rf_out

python /home/tsatler/RFdif/RFdiffusion/scripts/run_inference.py \
diffuser.T={diff_steps} \
scaffoldguided.target_path={target} \
inference.output_prefix=$rf_out \
scaffoldguided.scaffoldguided=True \
"{hotspots}" \
scaffoldguided.target_pdb=True \
scaffoldguided.target_ss={ss} \
scaffoldguided.target_adj={adj} \
scaffoldguided.scaffold_dir={scaffolds} \
'potentials.guiding_potentials=["type:binder_ROG,weight:3","type:interface_ncontacts","type:binder_distance_ReLU"]' \
potentials.guide_scale=2 \
potentials.guide_decay="quadratic" \
inference.num_designs={num_of_diffusions} \
denoiser.noise_scale_ca=0 \
denoiser.noise_scale_frame=0

# Remove some files
rm -f "$rf_out"*.trb # delete trb files
if [ -d "{rf_out_folder}_$SLURM_ARRAY_TASK_ID/traj" ]; then
    rm -r "{rf_out_folder}_$SLURM_ARRAY_TASK_ID/traj" # delete trajectories
fi


########################
# ProteinMPNN and AF2
########################
echo "Running AFDesign with MPNN sampling"

conda activate colabthread

input_files=($rf_out*.pdb)
script="helper_scripts/colabinder.py"

for ((i=0; i<${{#input_files[@]}}; i++)); do
  pdb_file=${{input_files[$i]}}
  echo $pdb_file - ProteinMPNN and AF2
  af_out={output_folder}/mpnn_af2/{prefix}_$SLURM_ARRAY_TASK_ID
  python $script $pdb_file $af_out B A --sampling_temp {sampling_temp} \
  --num_recycles {num_recycles} --num_seqs {num_seqs} --num_filt_seq {num_filtered_seqs} \
  --results_dataframe {output_folder} --save_best_only
done
"""

# Write the script to a file
script_file = f"{output_folder}/run_docking.sh"
with open(script_file, "w") as f:
    f.write(script)

In [5]:
# Submit the script. check=True raises subprocess.CalledProcessError when sbatch
# exits non-zero, so a rejected submission cannot go unnoticed.
subprocess.run(["sbatch", script_file], check=True)

Submitted batch job 547023


CompletedProcess(args=['sbatch', 'output/mCD20_dim_sh/run_docking.sh'], returncode=0)

In [9]:
# Shell escape: list the current user's Slurm jobs to check that the array was queued
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
   547014_[8-56%8]       gpu predict_  tsatler PD       0:00      1 (JobArrayTaskLimit)
  547023_[5-399%5]       gpu mCD20_di  tsatler PD       0:00      1 (JobArrayTaskLimit)
            547009       gpu test_cd2  tsatler  R      46:23      1 compute-0-11
            547010       gpu test_cd2  tsatler  R      46:20      1 compute-0-11
          547014_0       gpu predict_  tsatler  R       3:49      1 compute-0-11
          547014_1       gpu predict_  tsatler  R       3:49      1 compute-0-12
          547014_2       gpu predict_  tsatler  R       3:49      1 compute-0-12
          547014_3       gpu predict_  tsatler  R       3:49      1 compute-0-12
          547014_4       gpu predict_  tsatler  R       3:49      1 compute-0-12
          547014_5       gpu predict_  tsatler  R       3:49      1 compute-0-12
          547014_6       gpu predict_  tsatler  R       3:49      1 compute-0-12
          

## Get best diffused (and predicted) backbones

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f7a5035cbe0>>


In [156]:
import glob
import pandas as pd

# Alternative kept for reference: discover the RF outputs on disk instead of via the CSV
# rf_pdbs = glob.glob(f"{rf_out_folder}_*/*.pdb")
# print(f"Found {len(rf_pdbs)} pdb files in {rf_out_folder}")

# Path of the table that the analysis jobs are told to write to (--output_csv)
out_csv = f"{output_folder}/rf_pdbs_all_analyzed.csv"
# all_predictions.to_csv(out_csv, index=False)

# Every AF2 prediction; the distinct "input_pdb" values are the RF-diffused backbones.
# The predicted structures themselves are listed in the "model_path" column.
all_predictions = pd.read_csv(f"{output_folder}/af2_results_all.csv")
rf_pdbs = pd.unique(all_predictions["input_pdb"]).tolist()
print(f"RF diffusions: {len(rf_pdbs)}")

# AF2 results from af2_best.csv, displayed as the cell output
af2_results = pd.read_csv(f"{output_folder}/af2_best.csv")
af2_results

RF diffusions: 2354


Unnamed: 0,mpnn,plddt,i_ptm,i_pae,rmsd,seq,model_path,binder-rmsd,input_pdb
0,1.178795,0.758304,0.333302,20.555202,2.406745,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_0/af...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_0/mCD...
1,1.175050,0.759496,0.310303,20.968319,2.996210,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_0/af...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_0/mCD...
2,1.635968,0.768005,0.544756,15.297468,2.489532,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_1/af...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_1/mCD...
3,1.445020,0.741985,0.395455,17.782951,2.889918,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_1/af...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_1/mCD...
4,1.152487,0.706791,0.328622,21.062182,2.166544,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_1/af...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_1/mCD...
...,...,...,...,...,...,...,...,...,...
209,1.248275,0.708607,0.193649,22.333597,2.586961,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_117/...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_117/m...
210,1.202625,0.726757,0.280654,20.442824,2.899456,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_117/...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_117/m...
211,1.175465,0.705679,0.183866,22.810647,2.429413,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_117/...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_117/m...
212,1.173140,0.722445,0.263783,20.938118,2.636884,TLGGLLMIPTGVFAPICLSVWYPLWGIMDILNMTLSHFLKMRRLEL...,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_117/...,,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_117/m...


### Prepare input for analysis

In [157]:
# Maximum number of PDB paths listed in one TXT file (one file per analysis array task)
batch_size = 1000


def _write_path_batches(paths, directory, file_stem):
    """Write `paths` to numbered TXT files in `directory` and return the batches.

    The directory is created if needed. `paths` is cut into consecutive slices of at
    most `batch_size` entries; slice i is written newline-separated to
    `<directory>/<file_stem><i>.txt`.
    """
    os.makedirs(directory, exist_ok=True)
    chunks = [paths[start:start + batch_size] for start in range(0, len(paths), batch_size)]
    for index, chunk in enumerate(chunks):
        list_file = os.path.join(directory, f"{file_stem}{index}.txt")
        with open(list_file, "w") as handle:
            handle.write("\n".join(chunk))
    return chunks


# RF-diffused backbones
save_directory_rf = f"output/{prefix}/cd_opt/analysis/rf_pdbs"
batches_rf = _write_path_batches(rf_pdbs, save_directory_rf, "model_paths_rf_pdbs")

# AF2-predicted models (paths taken from the "model_path" column of the predictions)
save_directory_af = f"output/{prefix}/cd_opt/analysis/af_pdbs"
model_paths = af2_results["model_path"].tolist()
batches_af = _write_path_batches(model_paths, save_directory_af, "model_paths_af_pdbs")

### Run analysis scripts

In [158]:
def _analysis_script(list_dir, n_batches, binder_chain, target_chain):
    """Return the text of a Slurm array script that analyses one path list per task.

    Args:
        list_dir: directory holding the *.txt path lists; array task i picks the
            i-th entry of the shell glob `<list_dir>/*.txt`.
        n_batches: number of path lists; the array range is 0..n_batches-1.
        binder_chain: value passed as --binder_chain to analyze_diffusions.py.
        target_chain: value passed as --target_chain to analyze_diffusions.py.

    The notebook-level `hotspots`, `target` and `out_csv` are read when the function
    is called. "$file" and the hotspot string are quoted so bash passes them through
    unchanged (the hotspot string contains [...], which is otherwise a glob pattern).
    """
    return f"""#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --job-name=analyze_pdbs
#SBATCH --array=0-{n_batches-1}

set -e

# Load Anaconda environment
source /home/tsatler/anaconda3/etc/profile.d/conda.sh

# Activate the environment
conda activate colabthread

input_files=({list_dir}/*.txt)
file=${{input_files[$SLURM_ARRAY_TASK_ID]}}
echo "Running $file"

# Run the script
python helper_scripts/analyze_diffusions.py --input_txt "$file" --hotspots "{hotspots}" --target {target} --binder_chain {binder_chain} --target_chain {target_chain} --output_csv {out_csv}
"""


# RF-diffused backbones are analysed with binder chain A / target chain B
bash_script_rf = _analysis_script(save_directory_rf, len(batches_rf), binder_chain="A", target_chain="B")
with open(f"{save_directory_rf}/run_analysis.sh", "w") as f:
    f.write(bash_script_rf)

# AF2 models are analysed with the chain roles swapped (binder B / target A).
# NOTE: both scripts are given the same --output_csv path.
bash_script_af = _analysis_script(save_directory_af, len(batches_af), binder_chain="B", target_chain="A")
with open(f"{save_directory_af}/run_analysis.sh", "w") as f:
    f.write(bash_script_af)

In [159]:
# Submit both analysis array jobs. check=True raises subprocess.CalledProcessError
# if sbatch exits non-zero instead of silently continuing.
subprocess.run(["sbatch", f"{save_directory_rf}/run_analysis.sh"], check=True)
subprocess.run(["sbatch", f"{save_directory_af}/run_analysis.sh"], check=True)

Submitted batch job 548158
Submitted batch job 548159


CompletedProcess(args=['sbatch', 'output/mCD20_dim_sh/cd_opt/analysis/af_pdbs/run_analysis.sh'], returncode=0)

In [141]:
# Shell escape: list the current user's Slurm jobs to follow the analysis arrays
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
      548137_[0-2]       amd analyze_  tsatler PD       0:00      1 (Priority)


In [138]:
# rf_distance_dict = {}
# for rf_pdb in rf_pdbs:
#     distance = analyze_binder_diffusions(rf_pdb, extract_hotspot_numbers(hotspots), hotspot_offset=int(get_target_offset(target)), binder_chain="A", target_chain="B")
#     rf_distance_dict[rf_pdb] = distance

# rf_dist = []
# af_dist = []
# for i, row in all_predictions.iterrows():
#     rf_pdb = row["input_pdb"]
#     af_pdb = row["model_path"]
#     distance = analyze_binder_diffusions(af_pdb, extract_hotspot_numbers(hotspots), hotspot_offset=int(get_target_offset(target)), binder_chain="B", target_chain="A")
#     rf_dist.append(rf_distance_dict[rf_pdb])
#     af_dist.append(distance)

# all_predictions["rf_distance"] = rf_dist
# all_predictions["af_distance"] = af_dist
# all_predictions.to_csv(f"{output_folder}/af2_results_all_binderanalyzed.csv", index=False)

In [160]:
# Load the table produced by the analysis jobs. Both the RF and the AF analysis scripts
# are given this same path as --output_csv, so despite the variable name the rows are
# not necessarily limited to RF-diffused backbones.
rf_pdbs_all_analyzed = pd.read_csv(out_csv)
# Summary statistics of the numeric columns (displayed as the cell output)
rf_pdbs_all_analyzed.describe()

Unnamed: 0,mean_hotspot_distance,binder_max_distance,rg
count,4922.0,4922.0,4922.0
mean,25.047369,35.706177,11.406644
std,2.426665,12.641827,3.45201
min,11.269671,25.955254,9.205392
25%,24.213395,29.822496,9.833025
50%,25.941336,30.611752,10.052774
75%,26.607645,32.212566,10.452503
max,28.995451,96.24165,28.008752


In [161]:
# Upper bounds (exclusive) a structure must satisfy on each metric to be kept
max_hotspot_distance = 25
max_binder_distance = 30
max_rg = 11

within_cutoffs = (
    rf_pdbs_all_analyzed["mean_hotspot_distance"].lt(max_hotspot_distance)
    & rf_pdbs_all_analyzed["binder_max_distance"].lt(max_binder_distance)
    & rf_pdbs_all_analyzed["rg"].lt(max_rg)
)

# Rank by hotspot distance (smallest first). Dropping duplicates after the sort keeps,
# for each model_path, the row with the smallest mean_hotspot_distance.
filtered = (
    rf_pdbs_all_analyzed.loc[within_cutoffs]
    .sort_values("mean_hotspot_distance")
    .drop_duplicates("model_path")
)
filtered

Unnamed: 0,model_path,mean_hotspot_distance,binder_max_distance,rg
269,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_111/m...,21.223999,27.393782,9.517595
1685,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_18/mC...,22.007719,29.385197,9.608468
2798,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_68/a...,22.180655,29.720964,10.081790
3468,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_26/mC...,22.231071,29.627607,9.769263
2797,output/mCD20_dim_sh/mpnn_af2/mCD20_dim_sh_68/a...,22.317965,29.760708,10.064989
...,...,...,...,...
4695,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_91/mC...,24.894670,29.653261,9.749941
1393,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_4/mCD...,24.933157,27.614367,9.880040
1068,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_86/mC...,24.945750,29.813753,9.984917
942,output/mCD20_dim_sh/rf_dock/mCD20_dim_sh_80/mC...,24.949911,29.134443,9.487424


In [162]:
# Let's manually look at some of the best and worst ones...
# import shutil

# best_dir = f"{save_directory_rf}/best"
# os.makedirs(best_dir, exist_ok=True)
# worst_dir = f"{save_directory_rf}/worst_hotspot"
# os.makedirs(worst_dir, exist_ok=True)

# best = filtered.head(10)
# for i, row in best.iterrows():
#     pdb = row["model_path"]
#     shutil.copy(pdb, best_dir)

# worst_hotspot_idst = rf_pdbs_all_analyzed.sort_values("mean_hotspot_distance", ascending=False).head(10)
# for i, row in worst_hotspot_idst.iterrows():
#     pdb = row["model_path"]
#     shutil.copy(pdb, worst_dir)

# worst_binder_idst = rf_pdbs_all_analyzed.sort_values("binder_max_distance", ascending=False).head(10)
# worst_dir = f"{save_directory_rf}/worst_binder"
# os.makedirs(worst_dir, exist_ok=True)
# for i, row in worst_binder_idst.iterrows():
#     pdb = row["model_path"]
#     shutil.copy(pdb, worst_dir)

In [163]:
# Stage the best-ranked structures as inputs for the next step (cd_opt)
import shutil

top = 10  # how many of the top-ranked rows of `filtered` to copy
cd_opt_dir = f"output/{prefix}/cd_opt/inputs"
os.makedirs(cd_opt_dir, exist_ok=True)

top_pdbs = filtered.head(top)
# shutil.copy into a directory keeps each file's base name
for pdb in top_pdbs["model_path"]:
    shutil.copy(pdb, cd_opt_dir)

In [164]:
# Record the hotspot specification used for this run alongside the results
print(hotspots)

ppi.hotspot_res=[A69,A70,A39,A146,A156,A157,A160]
