In [1]:
# Standard libraries
import csv
import glob
import json
import math
import os
import random
import re
import subprocess
import sys
import time
from shutil import copy2

# Third-party libraries
import getpass
import importlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

# BioPython
from Bio import SeqIO
from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.vectors import calc_dihedral

# Excel writer
from openpyxl.workbook import Workbook

# Local utilities
### Path to this cloned GitHub repo:
SCRIPT_DIR = os.path.dirname("/rds/general/user/wh621/home/heme_binder_diffusion/PCBpipeline.ipynb") # edit this to the GitHub repo path. Throws an error by default.
assert os.path.exists(SCRIPT_DIR)
sys.path.append(SCRIPT_DIR + "/scripts/utils")
import utils

# Debugging
print(SCRIPT_DIR)
print(sys.path)

/rds/general/user/wh621/home/heme_binder_diffusion
['/rds/general/user/wh621/home/miniforge3/envs/pipeline/lib/python311.zip', '/rds/general/user/wh621/home/miniforge3/envs/pipeline/lib/python3.11', '/rds/general/user/wh621/home/miniforge3/envs/pipeline/lib/python3.11/lib-dynload', '', '/rds/general/user/wh621/home/miniforge3/envs/pipeline/lib/python3.11/site-packages', '/rds/general/user/wh621/home/heme_binder_diffusion/scripts/utils']


In [2]:
diffusion_script = "/rds/general/user/wh621/home/rf_diffusion_all_atom/run_inference.py"  # edit this
inpaint_script = "PATH/TO/RFDesign/inpainting/inpaint.py"  # edit this if needed
proteinMPNN_script = f"{SCRIPT_DIR}/lib/LigandMPNN/run.py"  # from submodule
AF2_script = f"{SCRIPT_DIR}/scripts/af2/af2.py"  # from submodule

print({SCRIPT_DIR})

### Python and/or Apptainer executables needed for running the jobs
### Please provide paths to executables that are able to run the different tasks.
### They can all be the same if you have an environment with all of the ncessary Python modules in one

# If your added Apptainer does not execute scripts directly,
# try adding 'apptainer run' or 'apptainer run --nv' (for GPU) in front of the command

CONDAPATH = "/rds/general/user/wh621/home/miniforge3"   # edit this depending on where your Conda environments live
PYTHON = {"diffusion": f"{CONDAPATH}/envs/diffusion/bin/python",
          "af2": f"{CONDAPATH}/envs/mlfold/bin/python",
          "proteinMPNN": f"{CONDAPATH}/envs/diffusion/bin/python",
          "general": f"{CONDAPATH}/envs/diffusion/bin/python"}

{'/rds/general/user/wh621/home/heme_binder_diffusion'}


In [3]:
username = getpass.getuser()  # your username on the running system
print(username)
EMAIL = f"{username}@ic.ac.uk"  # edit based on your organization. For Slurm job notifications.

PROJECT = "example_Heme_diffusion"

### Path where the jobs will be run and outputs dumped
WDIR = f"{SCRIPT_DIR}/output"

if not os.path.exists(WDIR):
    os.makedirs(WDIR, exist_ok=True)

print(f"Working directory: {WDIR}")

USE_GPU_for_AF2 = True

wh621
Working directory: /rds/general/user/wh621/home/heme_binder_diffusion/output


In [4]:
# Ligand information
params = [f"{SCRIPT_DIR}/theozyme/PCB/PCB.params"]  # Rosetta params file(s)
LIGAND = "PCB"

In [7]:
SEQUENCE_DIR = f"{WDIR}/chainbreaker/sequence"
YAML_OUTPUT_DIR = f"{WDIR}/chainbreaker/input_yaml"
os.makedirs(YAML_OUTPUT_DIR, exist_ok=True)

Ligand_SMILES = 'C/C=C1C(=C/c2[nH]c(/C=c3\\[nH]/c(=C\\C4=NC(=O)C(CC)=C4C)c(C)c3CCC(=O)O)c(CCC(=O)O)c2C)/NC(=O)[C@@H]/1C'

fasta_files = glob.glob(os.path.join(SEQUENCE_DIR, "*.fa"))
print(f"Found {len(fasta_files)} FASTA files in {SEQUENCE_DIR}.")

for fasta_file in fasta_files:
    base_name = os.path.basename(fasta_file).replace(".fa", "")
    record = next(SeqIO.parse(fasta_file, "fasta"))
    sequence = str(record.seq)

    yaml_dict = {
        "version": 1,
        "sequences": [
            {
                "protein": {
                    "id": "A",
                    "sequence": sequence,
                    "msa":"empty"
                }
            },
            {
                "ligand": {
                    "id": "B",
                    "smiles": Ligand_SMILES
                }
            }
        ],
        "properties": [
            {
                "affinity": {
                    "binder": "B"
                }
            }
        ]
    }

    yaml_file = os.path.join(YAML_OUTPUT_DIR, f"{base_name}.yaml")
    with open(yaml_file, "w") as f:
        yaml.dump(yaml_dict, f, sort_keys=False)

    print(f"Generated Boltz-2 YAML for {base_name} -> {yaml_file}")

Found 339 FASTA files in /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/sequence.
Generated Boltz-2 YAML for 5tou_Chain_I -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/5tou_Chain_I.yaml
Generated Boltz-2 YAML for 7sut_Chain_B -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/7sut_Chain_B.yaml
Generated Boltz-2 YAML for 3o18_Chain_A -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/3o18_Chain_A.yaml
Generated Boltz-2 YAML for 8xf1_Chain_A -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/8xf1_Chain_A.yaml
Generated Boltz-2 YAML for 7eh7_Chain_I -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/7eh7_Chain_I.yaml
Generated Boltz-2 YAML for 1gh0_Chain_B -> /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml/1gh0_Chain_B.yaml
Generated Boltz-2 YAML for 4o4s_Chai

In [8]:
DUMP_DIR = f"{WDIR}/chainbreaker/result"

yaml_files = glob.glob(os.path.join(YAML_OUTPUT_DIR, "*.yaml"))
print(f"Found {len(yaml_files)} YAML files to process in {YAML_OUTPUT_DIR}.")

# === Generate prediction commands ===
commands_boltz = []
cmds_filename_boltz = os.path.join(DUMP_DIR, "commands_boltz")

with open(cmds_filename_boltz, "w") as file:
    for yaml_file in yaml_files:
        cmd = (
            f"boltz predict {yaml_file} "
            f"--out_dir {DUMP_DIR} "
            f"--use_potentials "
            f"--diffusion_samples 5 "
            f"--recycling_steps 10 "
            f"--sampling_steps 200 "
            f"--accelerator gpu"
        )
        commands_boltz.append(cmd)
        file.write(cmd + "\n")

print("Example Boltz-2 command:")
print(commands_boltz[-1])
print(f"Total {len(commands_boltz)} commands written to {cmds_filename_boltz}")

Found 339 YAML files to process in /rds/general/user/wh621/home/heme_binder_diffusion/output/chainbreaker/input_yaml.


NameError: name 'DUMP_DIR' is not defined

In [None]:
submit_script = os.path.join(DUMP_DIR, "submit_boltz.sh")

utils.create_slurm_submit_script(
    filename=submit_script,
    name="boltz_job",
    gpu=True,
    gres="gpu:a2000:1",
    mem="64g",
    N_cores=4,
    time="24:00:00",
    email=EMAIL,
    group=10,
    array=len(commands_boltz),
    array_commandfile=cmds_filename_boltz
)

print(f"SLURM submission script '{submit_script}' created.")