In [17]:
#conda activate getcontact
import itertools
import os
import shlex
import shutil
import subprocess
from pathlib import Path

import pandas as pd
import torch

In [18]:
# SSH login (user@host) used by the scp/ssh commands in the upload step.
HPC_USER = "wd304@login-icelake.hpc.cam.ac.uk"
# Working directory on the HPC; every remote path written into the task
# commands and the SBATCH script is built from it.
REMOTE_DIR = "/rds/user/wd304/hpc-work/20250101_RFdiffusion_test"
# Local working directory; the input/, output/ and tasks/ sub-folders are
# created below it and it is the directory copied to the HPC.
LOCAL_FILE_PATH = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion"
# Local PDB file; it is copied into LOCAL_FILE_PATH/input and its file name is
# reused to build the remote input path shared by all jobs.
INPUT_PDB_SOURCE = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned_VE.pdb"
# Number of residues added to each terminal extension when sizing the
# N-/C-terminal blocks of the adjacency (contact) map.
LEEWAY = 15 # How many residues are allowed to interact with the N/C cap (in the contact map), and in inpainting
# Number of consecutive positions marked as helix (value 0) in the
# secondary-structure pattern next to each block's inner edge.
HELIX_LENGTH = 7
# Number of positions left between a block's inner edge and its helix segment.
GAP_LENGTH = 0

In [19]:
# ----- Step 1: Setup search space -----
# NB: range() excludes its end value.
cycle_list = range(1, 6)  # Cycle values 1..5
extend_len_list = range(25, 40)  # Extension lengths 25..39 (15 values)
# Numbers expressed in terms of PyMol residue numbering (i.e. starts at 1, not 0).
# Everything before crop_NT and everything after crop_CT is cropped away.
crop_NT_list = [1]  # No crop at the N-terminus; use e.g. range(1, 3) to scan residues 1..2
crop_CT_list = [114]  # No crop at the C-terminus; use e.g. range(113, 115) to scan residues 113..114

# Cartesian product of all parameter values: one tuple per job.
search_space = list(itertools.product(cycle_list, extend_len_list, crop_CT_list, crop_NT_list))

# One row per (cycle, extend_len, crop_CT, crop_NT) combination.
df = pd.DataFrame(search_space, columns=['cycle', 'extend_len', 'crop_CT', 'crop_NT'])

# Total chain length: the kept residues (crop_NT..crop_CT inclusive) plus one
# extension of extend_len residues on each side. Computed column-wise instead
# of with a per-row apply, which also keeps it valid for an empty frame.
df['chain_A_len'] = df['crop_CT'] - (df['crop_NT'] - 1) + 2 * df['extend_len']

# Zero-padded identifier, unique per row, reused for file and folder names.
df['index_str'] = (
    "cropNT_" + df['crop_NT'].astype(int).astype(str).str.zfill(3)
    + "_cropCT_" + df['crop_CT'].astype(int).astype(str).str.zfill(3)
    + "_extendlen_" + df['extend_len'].astype(int).astype(str).str.zfill(3)
    + "_cycle_" + df['cycle'].astype(int).astype(str).str.zfill(2)
)

print(df['index_str'])

0     cropNT_001_cropCT_114_extendlen_025_cycle_01
1     cropNT_001_cropCT_114_extendlen_026_cycle_01
2     cropNT_001_cropCT_114_extendlen_027_cycle_01
3     cropNT_001_cropCT_114_extendlen_028_cycle_01
4     cropNT_001_cropCT_114_extendlen_029_cycle_01
                          ...                     
70    cropNT_001_cropCT_114_extendlen_035_cycle_05
71    cropNT_001_cropCT_114_extendlen_036_cycle_05
72    cropNT_001_cropCT_114_extendlen_037_cycle_05
73    cropNT_001_cropCT_114_extendlen_038_cycle_05
74    cropNT_001_cropCT_114_extendlen_039_cycle_05
Name: index_str, Length: 75, dtype: object


In [20]:
# Create the local working tree (root plus input/, output/ and tasks/).
# parents=True also creates any missing parent directories;
# exist_ok=True makes the call a no-op when the directory is already there.
local_root = Path(LOCAL_FILE_PATH)
local_root.mkdir(parents=True, exist_ok=True)
for subfolder in ("input", "output", "tasks"):
    (local_root / subfolder).mkdir(parents=True, exist_ok=True)

In [21]:
# Derive the remote (HPC-side) paths of every job from its index_str.
run_root = f"{REMOTE_DIR}/output/production_run_2"
# Per-job folder that holds the scaffold tensors.
df['scaffold_dir'] = f"{run_root}/scaffold_dir/" + df['index_str']
# Output prefix passed to RFdiffusion: remote run folder plus the "3ult_" tag.
df['rfd_prefix'] = f"{run_root}/3ult_" + df['index_str']
# All jobs share one input structure; only its file name is taken from the local source path.
input_pdb = INPUT_PDB_SOURCE.rsplit("/", 1)[-1]
df['input_pdb'] = f"{REMOTE_DIR}/input/{input_pdb}"
print(df)

    cycle  extend_len  crop_CT  crop_NT  chain_A_len  \
0       1          25      114        1          164   
1       1          26      114        1          166   
2       1          27      114        1          168   
3       1          28      114        1          170   
4       1          29      114        1          172   
..    ...         ...      ...      ...          ...   
70      5          35      114        1          184   
71      5          36      114        1          186   
72      5          37      114        1          188   
73      5          38      114        1          190   
74      5          39      114        1          192   

                                       index_str  \
0   cropNT_001_cropCT_114_extendlen_025_cycle_01   
1   cropNT_001_cropCT_114_extendlen_026_cycle_01   
2   cropNT_001_cropCT_114_extendlen_027_cycle_01   
3   cropNT_001_cropCT_114_extendlen_028_cycle_01   
4   cropNT_001_cropCT_114_extendlen_029_cycle_01   
..             

In [22]:
# ----- Step 1.1: Copy the PDB file to the generated input folder -----
# Fail fast: without the input structure every submitted job would fail on the
# HPC, so a missing file must stop the notebook rather than just print a note.
if not os.path.exists(INPUT_PDB_SOURCE):
    raise FileNotFoundError(f"PDB file not found at {INPUT_PDB_SOURCE}. Please check the path.")

print(f"Copying PDB file from {INPUT_PDB_SOURCE} to {LOCAL_FILE_PATH}/input/{input_pdb}.")
shutil.copy(INPUT_PDB_SOURCE, f"{LOCAL_FILE_PATH}/input/{input_pdb}")
print("PDB file copied successfully.")

# Randomly select the requested number of rows from df.
num_scaffolds = int(input("Enter the number of scaffold files to generate: "))
# The value is reused as the upper bound of the SLURM job array (1-N), so it
# must be at least 1, and sampling without replacement caps it at len(df).
if not 1 <= num_scaffolds <= len(df):
    raise ValueError(
        f"Number of scaffold files must be between 1 and {len(df)}, got {num_scaffolds}."
    )
# NOTE(review): the sample is unseeded, so each run selects different rows;
# pass random_state=... here if reproducible selections are wanted.
df_limited = df.sample(n=num_scaffolds)

Copying PDB file from /home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned_VE.pdb to /home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/input/3ult_cleaned_VE.pdb.
PDB file copied successfully.


In [27]:
# ----- Step 2: Generate intermediate scaffold files locally and write SBATCH task files -----
def get_scaffoldguided(row, verbose=True):
    """Build and save the scaffold-guidance tensors for one search-space row.

    Two files are written into the local mirror of ``row['scaffold_dir']``
    (``REMOTE_DIR`` replaced by ``LOCAL_FILE_PATH``):

    * ``<index_str>_adj.pt``: a (chain_len, chain_len) float matrix that is 1
      inside the square N-terminal and C-terminal blocks and 0 elsewhere.
    * ``<index_str>_ss.pt``: a float vector of length chain_len that is 3
      (mask) everywhere except for one run of HELIX_LENGTH zeros (helix) at
      the inner edge of each block, GAP_LENGTH positions away from that edge.

    Args:
        row: mapping (e.g. a DataFrame row) providing 'chain_A_len',
            'extend_len', 'scaffold_dir' and 'index_str'.
        verbose: when True (default) print the adjacency matrix, the pattern
            list and the pattern tensor; pass False to keep the output short.

    Raises:
        ValueError: if a terminal block does not fit inside the chain.
    """
    chain_len = int(row['chain_A_len'])
    extend_len = int(row['extend_len'])

    # Half-open [start, end) index ranges of the N-terminal and C-terminal blocks.
    # NOTE(review): the N-terminal block is one residue longer than the
    # C-terminal one (the "+ 1"); kept as in the original -- confirm it is intended.
    blocks = {
        "N-terminal": (0, extend_len + 1 + LEEWAY),
        "C-terminal": (chain_len - (extend_len + LEEWAY), chain_len)
    }
    n_term_end = blocks["N-terminal"][1]
    c_term_start = blocks["C-terminal"][0]
    # A negative start would otherwise wrap around silently through negative
    # indexing, and an end past the chain cannot be represented at all.
    if n_term_end > chain_len or c_term_start < 0:
        raise ValueError(
            f"Terminal blocks {blocks} do not fit in a chain of length {chain_len}."
        )

    # Fill each square block with ones in a single slice assignment.
    block_adj = torch.zeros((chain_len, chain_len)).float()
    for (start, end) in blocks.values():
        block_adj[start:end, start:end] = 1

    if verbose:
        for line in block_adj.numpy():
            print(" ".join(map(lambda x: str(int(x)), line)))

    # Secondary-structure pattern: 3 (mask) by default, 0 (helix) for the
    # HELIX_LENGTH positions next to each block's inner edge.
    pattern = [3] * chain_len
    for i in range(n_term_end - (GAP_LENGTH + HELIX_LENGTH), n_term_end - GAP_LENGTH):
        pattern[i] = 0
    for i in range(c_term_start + GAP_LENGTH, c_term_start + GAP_LENGTH + HELIX_LENGTH):
        pattern[i] = 0
    tensor_pattern = torch.tensor(pattern).float()
    if verbose:
        print("Pattern: ", pattern)
        print(tensor_pattern)

    # Save both tensors under the local copy of the remote scaffold folder.
    scaffold_local_dir = row['scaffold_dir'].replace(REMOTE_DIR, LOCAL_FILE_PATH)
    Path(scaffold_local_dir).mkdir(parents=True, exist_ok=True)
    torch.save(tensor_pattern, f'{scaffold_local_dir}/{row["index_str"]}_ss.pt')
    torch.save(block_adj, f'{scaffold_local_dir}/{row["index_str"]}_adj.pt')


In [24]:
def write_task(row):
    """Append the RFdiffusion inference command for one row to tasks/tasks.txt.

    The command is one line: the Python interpreter, the run_inference.py
    script and the Hydra-style ``key=value`` overrides built from the row.
    Each argument is shell-quoted because the line is later executed through
    ``eval`` in the SBATCH script; in practice this wraps the contig override
    (it contains ``[`` and ``]``, which the shell would otherwise treat as a
    glob pattern) in single quotes and leaves the other arguments unchanged.

    Args:
        row: mapping (e.g. a DataFrame row) providing 'rfd_prefix',
            'input_pdb', 'extend_len', 'crop_NT', 'crop_CT' and 'scaffold_dir'.
    """
    arguments = [
        "/home/wd304/.conda/envs/SE3nv-cuda116/bin/python",
        "/rds/user/wd304/hpc-work/RFdiffusion/scripts/run_inference.py",
        f"inference.output_prefix={row['rfd_prefix']}",
        f"inference.input_pdb={row['input_pdb']}",
        # Contig: N-terminal extension / kept residues of chain A / C-terminal extension.
        f"contigmap.contigs=[{row['extend_len']}/A{row['crop_NT']}-{row['crop_CT']}/{row['extend_len']}]",
        "inference.write_trajectory=False",
        "inference.num_designs=10",
        "scaffoldguided.scaffoldguided=True",
        "scaffoldguided.target_pdb=False",
        "scaffoldguided.systematic=True",
        f"scaffoldguided.scaffold_dir={row['scaffold_dir']}",
    ]
    command = " ".join(shlex.quote(argument) for argument in arguments)

    # Append mode: the caller writes one line per selected row.
    with open(f'{LOCAL_FILE_PATH}/tasks/tasks.txt', "a") as file:
        file.write(f"{command}\n")

In [25]:
# write_task() appends, so start from an empty task list: the SBATCH array runs
# lines 1..num_scaffolds of tasks.txt, and lines left over from an earlier run
# of this cell would be executed instead of the ones generated now.
Path(f"{LOCAL_FILE_PATH}/tasks/tasks.txt").write_text("")

# Write the scaffold tensors and the matching task line for every selected row.
for _, df_row in df_limited.iterrows():
    get_scaffoldguided(df_row)
    write_task(df_row)

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [26]:
# ----- Step 4: Generate SBATCH script locally -----
# SLURM array job: array task i runs line i of {REMOTE_DIR}/tasks/tasks.txt.
# Doubled braces ({{ }}) are literal braces in the generated bash script;
# single braces are filled in from the notebook's variables.
sbatch_header = f"""#!/bin/bash
#SBATCH -J step1_rfdiffusion_matrix
#SBATCH --gres=gpu:1
#SBATCH -p ampere
#SBATCH -A GKAMINSKI-SL2-GPU
#SBATCH --cpus-per-task=1
#SBATCH -t 02:00:00
#SBATCH -c 1
#SBATCH -N 1
#SBATCH --mem=16g
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=wd304@cam.ac.uk
#SBATCH --error={REMOTE_DIR}/tasks/%A_%a.err
#SBATCH --output={REMOTE_DIR}/tasks/%A_%a.out
#SBATCH --array=1-{num_scaffolds}  # Corrected array specification

module load anaconda
source /usr/local/software/anaconda/3.2019-10/etc/profile.d/conda.sh
conda activate SE3nv-cuda116

if [ -z "$SLURM_ARRAY_TASK_ID" ]; then
    echo "Running outside of SLURM. Setting TASK_ID manually."
    TASK_ID=1
else
    echo "Using SLURM_ARRAY_TASK_ID."
    TASK_ID=$(($SLURM_ARRAY_TASK_ID))
fi

export HYDRA_FULL_ERROR=1

# Get the task command from the tasks file
task=$(sed -n "${{TASK_ID}}p" {REMOTE_DIR}/tasks/tasks.txt)

# Fail loudly when the tasks file has no such line instead of exiting 0
if [ -z "$task" ]; then
    echo "No task found on line $TASK_ID of {REMOTE_DIR}/tasks/tasks.txt" >&2
    exit 1
fi

# Execute the task command (quoted so the line reaches eval unmodified)
echo "Running task: $task"
eval "$task"
"""

# Write the SBATCH script next to tasks.txt so both are uploaded together.
sbatch_file = f"{LOCAL_FILE_PATH}/tasks/digs_array_job.sh"
with open(sbatch_file, "w") as file:
    file.write(sbatch_header)

print(f"SBATCH script '{sbatch_file}' has been created.")

SBATCH script '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/tasks/digs_array_job.sh' has been created.


In [None]:
# ----- Step 5: Upload to the HPC -----
# check=True raises CalledProcessError when scp/ssh exit non-zero, so the
# success messages below are only printed when the command really succeeded.
# Argument lists (no shell=True) avoid a local shell re-parsing the paths.
# NOTE(review): `scp -r SRC host:DEST` copies SRC *into* DEST when DEST already
# exists on the remote (giving DEST/RFdiffusion/...), but creates DEST as a copy
# of SRC when it does not -- confirm the remote layout matches the
# {REMOTE_DIR}/tasks/ path used below.
subprocess.run(["scp", "-r", LOCAL_FILE_PATH, f"{HPC_USER}:{REMOTE_DIR}"], check=True)
print(f"SLURM script copied to {REMOTE_DIR}.")

# The remote command string is interpreted by the login shell on the HPC.
subprocess.run(
    ["ssh", HPC_USER, f"cd {REMOTE_DIR}/tasks/ && sbatch digs_array_job.sh"],
    check=True,
)
print("Submitted digs_array_job.sh to HPC.")