In [1]:
import os
import random
import subprocess

In [2]:
input_prefix = "1vmg"
input_pdb = "input/1vmg.pdb"
output_dir = f"output/{input_prefix}"
script_dir = f"{output_dir}/scripts"

### Slurm params
num_jobs = 100
array_limit = 5

### RFdiffusion params
diffuser_T = 100
diffusions_per_job = 10

fill_contigs = f"30-100/A35-57/30-100"
inpaint_seq = "A36-53"
total_len = "80-200"

input_pdb_as = "A35,A54-57"

### MPNN params
num_seqs = 24
mpnn_batch = 8
sampling_temp = 0.1
rm_aa = "C"
soluble = True

### AF2 params
recycles = 6
multimer = False
initial_guess = False

print(f"🔄 Submitting {num_jobs} jobs to the cluster, generating {diffusions_per_job} diffusions")
print(f"🔄 In total, {num_jobs*diffusions_per_job} RF diffusions will be generated.")
print(f"🔬 For each diffusion, {num_seqs} MPNN sequences will be evaluated, resulting in {num_jobs*diffusions_per_job*num_seqs} sequences validated with AF2.")

🔄 Submitting 100 jobs to the cluster, generating 10 diffusions
🔄 In total, 1000 RF diffusions will be generated.
🔬 For each diffusion, 24 MPNN sequences will be evaluated, resulting in 24000 sequences validated with AF2.


## Run motif scaffolding

In [55]:
def generate_contig_strings(num_jobs: int, fill_contigs: str, total_len_range: str) -> list:
    """Generates a list of contig strings based on input ranges and length constraints."""

    total_len_min, total_len_max = map(int, total_len_range.split("-"))
    contig_parts = fill_contigs.split("/")
    contig_strings = []

    while len(contig_strings) < num_jobs:
        contig_list = []
        total_sum = 0

        for part in contig_parts:
            if "-" in part and part.replace("-", "").isdigit():
                start, end = map(int, part.split("-"))
                value = random.randint(start, end)
                total_sum += value
                contig_list.append(f"{value}-{value}")
            else:
                int_part = part[1:]
                start, end = map(int, int_part.split("-"))
                value = end - start
                total_sum += value
                contig_list.append(part)

        if total_len_min <= total_sum <= total_len_max:
            contig_strings.append("/".join(contig_list))

    return contig_strings

contig_strings = generate_contig_strings(num_jobs, fill_contigs, str(total_len))
print(f"🔗 Generated {len(contig_strings)} contigs.. example: {contig_strings[0]}")

🔗 Generated 100 contigs.. example: 46-46/A35-57/57-57


In [56]:
os.makedirs(output_dir, exist_ok=True)
for i, contig in enumerate(contig_strings):
    contigs_list = contig.split(" ")
    contigs_str = ":".join(contigs_list)
    initial_guess_flag = "--initial_guess" if initial_guess else ""
    multimer_flag = "--multimer" if multimer else ""
    soluble_flag = "--use_soluble" if soluble else ""
    rf_output_dir = f"{output_dir}/rf/rf_{i}"
    af_output_dir = f"{output_dir}/af/af_{i}"
    script = f"""#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:A40:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --array=0-{num_jobs-1}%{array_limit}
#SBATCH --job-name=motif_{input_prefix}

set -e
# Load Anaconda environment
source /home/tsatler/anaconda3/etc/profile.d/conda.sh

# Activate Rosetta environment
echo Metal binding scaffold guided RFdiffusion
conda activate SE3nv

python /home/tsatler/RFdif/RFdiffusion/scripts/run_inference.py \
        diffuser.T={diffuser_T} \
        'contigmap.contigs=[{contig}]' \
        'contigmap.inpaint_seq=[A36-53]' \
        "inference.output_prefix={rf_output_dir}" \
        inference.input_pdb={input_pdb} \
        inference.num_designs={diffusions_per_job} \
        inference.ckpt_override_path=/home/tsatler/RFdif/RFdiffusion/models/ActiveSite_ckpt.pt

# Validate with AF2
echo Validate with AF2
conda activate colabdesign

input_pdbs=("{rf_output_dir}"/*.pdb)
script="/home/tsatler/RFdif/ClusterProteinDesign/scripts/metalbinding/scripts/motif_design.py"
mkdir -p {af_output_dir}

for ((i=0; i<${{#input_pdbs[@]}}; i++)); do
    pdb=${{input_pdbs[i]}}
    echo "Processing $pdb..."
    
    python $script $pdb {af_output_dir} {contigs_str} \
            --input_pdb {input_pdb} \
            --input_pdb_as {input_pdb_as} \
            --num_seqs {num_seqs} \
            --sampling_temp {sampling_temp} \
            --num_recycles {recycles} \
            --rm_aa={rm_aa} \
            --mpnn_batch {mpnn_batch} \
            --results_dataframe {output_dir}/df \
            --save_best_only \
            {initial_guess_flag} {multimer_flag} {soluble_flag}
done

"""
    os.makedirs(script_dir, exist_ok=True)
    with open(f"{script_dir}/job_{i}.sh", "w") as f:
        f.write(script)

In [57]:
## Create run script
run_script = f"""#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:A40:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --array=0-{num_jobs-1}%{array_limit}
#SBATCH --job-name=motif_{input_prefix}

set -e

input_commands=({script_dir}/*.sh)
command_file=${{input_commands[$SLURM_ARRAY_TASK_ID]}}
# mapfile -t commands < "$command_file"

echo "Running command file: $command_file"                   
bash $command_file

"""
with open(f"{output_dir}/run.sh", "w") as f:
    f.write(run_script)


In [58]:
print(f"🚀 Submitting jobs...")
subprocess.run(f"sbatch {output_dir}/run.sh", shell=True)

🚀 Submitting jobs...


sbatch: error: Batch job submission failed: Invalid account or account/partition combination specified


CompletedProcess(args='sbatch output/1vmg/run.sh', returncode=1)

In [59]:
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


## Analysis