<a href="https://colab.research.google.com/github/RishikeshMagar/ColabBoltz/blob/main/ColabBoltz_Monomer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Colab Notebook to run Boltz-2.

In [4]:
#@title Install dependencies

!pip install -q --no-deps boltz -U
!pip install -q py3Dmol rdkit biopython matplotlib pandas


In [7]:
# !boltz --help

Following the [ColabFold Protocols](https://www.nature.com/articles/s41596-024-01060-5), we explore monomer prediction in this example notebook. I will be using the [PIGU](https://github.com/steineggerlab/colabfold-protocol/blob/main/query/PIGU.fasta) FASTA sequence as an example.

In [2]:
#@title Input protein and ligand(s)
from google.colab import files
import os
import re
import hashlib
import random
import requests

# Helper to generate unique job name
def add_hash(x, y):
    """Append a short content hash to a name.

    Args:
        x: Base name (string) to be suffixed.
        y: String whose SHA-1 digest supplies the suffix.

    Returns:
        ``x``, an underscore, and the first five hexadecimal characters of
        the SHA-1 digest of ``y`` (encoded with the default ``str.encode``).
    """
    digest = hashlib.sha1(y.encode()).hexdigest()
    short_digest = digest[:5]
    return "_".join((x, short_digest))

# Colab form fields. Each assignment below is a raw string taken from the form;
# within a field, `:` separates multiple entries (see the markdown hints next to
# each field). An empty string means "no entry of this kind".
#@markdown ### Protein(s)
query_sequence = 'MAAPLVLVLVVAVTVRAALFRSSLAEFISERVEVVSPLSSWKRVVEGLSLLDLGVSPYSGAVFHETPLIIYLFHFLIDYAELVFMITDALTAIALYFAIQDFNKVVFKKQKLLLELDQYAPDVAELIRTPMEMRYIPLKVALFYLLNPYTILSCVAKSTCAINNTLIAFFILTTIKGSAFLSAIFLALATYQSLYPLTLFVPGLLYLLQRQYIPVKMKSKAFWIFSWEYAMMYVGSLVVIICLSFFLLSSWDFIPAVYGFILSVPDLTPNIGLFWYFFAEMFEHFSLFFVCVFQINVFFYTIPLAIKLKEHPIFFMFIQIAVIAIFKSYPTVGDVALYMAFFPVWNHLYRFLRNIFVLTCIIIVCSLLFPVLWHLWIYAGSANSNFFYAITLTFNVGQILLISDYFYAFLRREYYLTHGL'  #@param {type:"string"}
#@markdown Use `:` to separate chains (e.g., "SEQ1:SEQ2")

# Three alternative ways of specifying ligands: SMILES strings, CCD codes,
# or common names.
#@markdown ### Ligands
ligand_input = ''  #@param {type:"string"}
#@markdown - Colon-separated SMILES strings

ligand_input_ccd = ''  #@param {type:"string"}
#@markdown - Colon-separated CCD codes

ligand_input_common_name = ''  #@param {type:"string"}
#@markdown - Colon-separated common names (e.g., "aspirin")

# Nucleic-acid inputs (the RNA field is grouped under the DNA heading in the form).
#@markdown ### DNA
dna_input = ''  #@param {type:"string"}
#@markdown - Colon-separated DNA sequences

rna_input = ''  #@param {type:"string"}
#@markdown - Colon-separated RNA sequences

# Human-readable job label; it is sanitised and suffixed with a short hash of
# the protein sequence further down in this cell before being used as a
# directory name.
#@markdown ### Jobname
jobname = 'Monomer Prediction'  #@param {type:"string"}

# Clean up: drop every whitespace character (spaces, tabs, newlines) from the
# sequence-like inputs so multi-line pastes collapse into one token per field.
query_sequence = "".join(query_sequence.split())
ligand_input = "".join(ligand_input.split())
ligand_input_ccd = "".join(ligand_input_ccd.split())
dna_input = "".join(dna_input.split())
rna_input = "".join(rna_input.split())

# Common names may legitimately contain spaces (e.g. "acetic acid"), so only
# trim each colon-separated name and collapse internal runs of whitespace
# instead of deleting the spaces outright.
ligand_input_common_name = ":".join(
    " ".join(part.split()) for part in ligand_input_common_name.split(":")
)

# Sanitise the job label: remove whitespace, then anything that is not a word
# character, so the result is safe to use as a directory name.
basejobname = "".join(jobname.split())
basejobname = re.sub(r'\W+', '', basejobname)
# The hash suffix is derived from the protein sequence only.
jobname = add_hash(basejobname, query_sequence)

# Ensure uniqueness: if the directory already exists, append the first free
# numeric suffix (_0, _1, ...).
if os.path.exists(jobname):
    n = 0
    while os.path.exists(f"{jobname}_{n}"):
        n += 1
    jobname = f"{jobname}_{n}"

# Create job folder
os.makedirs(jobname, exist_ok=True)
print(f"✅ Job directory created: {jobname}")


✅ Job directory created: MonomerPrediction_36762_0


In [3]:
#@title Process inputs and write YAML
import os
import yaml
from string import ascii_uppercase

# Step 1: Split sequences on chain breaks
def _split_on_colon(raw_value):
    """Split a form value on ``:``; empty or whitespace-only input yields ``[]``."""
    trimmed = raw_value.strip()
    if not trimmed:
        return []
    return trimmed.split(':')


protein_sequences = _split_on_colon(query_sequence)
ligand_sequences = _split_on_colon(ligand_input)
ligand_sequences_ccd = _split_on_colon(ligand_input_ccd)
ligand_sequences_common_name = _split_on_colon(ligand_input_common_name)
dna_sequences = _split_on_colon(dna_input)

# Step 2: Resolve common names to SMILES
def get_smiles(compound_name):
    """Resolve a compound's common name to a SMILES string via PubChem.

    The name is first passed through PubChem's autocomplete endpoint to obtain
    the top suggested compound name, which is then looked up through PUG REST
    to fetch its SMILES property.

    Args:
        compound_name: Common name of the compound (e.g. "aspirin").

    Returns:
        The SMILES string, or ``None`` when the name is empty, a request fails
        (network error, timeout, non-200 status, malformed JSON), or PubChem
        returns no match.
    """
    from urllib.parse import quote

    if not compound_name or not compound_name.strip():
        return None

    request_timeout_s = 30  # avoid hanging the notebook on a stalled connection

    try:
        # Percent-encode the name so spaces, slashes, etc. cannot break the URL path.
        autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete/compound/"
            f"{quote(compound_name.strip(), safe='')}/json?limit=1"
        )
        autocomplete_response = requests.get(autocomplete_url, timeout=request_timeout_s)
        if autocomplete_response.status_code != 200:
            return None
        autocomplete_data = autocomplete_response.json()
        if autocomplete_data.get("status", {}).get("code") != 0 or autocomplete_data.get("total", 0) == 0:
            return None
        suggested = autocomplete_data.get("dictionary_terms", {}).get("compound", [])
        if not suggested:
            return None

        smiles_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
            f"{quote(str(suggested[0]), safe='')}/property/CanonicalSMILES/JSON"
        )
        smiles_response = requests.get(smiles_url, timeout=request_timeout_s)
        if smiles_response.status_code != 200:
            return None
        props = smiles_response.json().get("PropertyTable", {}).get("Properties", [])
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON: treat it
        # like every other "could not resolve" outcome.
        return None

    if not props:
        return None
    # NOTE(review): newer PubChem responses reportedly return this property
    # under the key "ConnectivitySMILES" even when "CanonicalSMILES" is
    # requested — confirm against the current PUG REST documentation.
    return props[0].get("CanonicalSMILES") or props[0].get("ConnectivitySMILES")

# Look up each common name once (cached per name) and append every resolved
# SMILES to the ligand list; repeated names therefore add repeated ligands.
smiles_cache = {}
for name in ligand_sequences_common_name:
    name = name.strip()
    if not name:
        # Empty entry (e.g. from "a::b"); nothing to look up.
        continue
    if name not in smiles_cache:
        smiles_cache[name] = get_smiles(name)
        if smiles_cache[name]:
            print(f"Mapped compound {name} to {smiles_cache[name]}")
        else:
            # Make the omission visible instead of silently dropping the ligand.
            print(f"⚠️ Could not resolve compound name '{name}' to a SMILES string; skipping it.")
    if smiles_cache[name]:
        ligand_sequences.append(smiles_cache[name])

# Step 3: Assign chain labels and collect entries
# Chain IDs are handed out in input order: proteins, SMILES ligands, CCD
# ligands, DNA, then RNA.
chain_labels = iter(ascii_uppercase)
sequences_yaml = []   # entity dicts written under the YAML "sequences" key
csv_entries = []      # (seq_id, sequence) rows, one per distinct protein sequence
seq_id_counter = 0
seq_to_seq_id = {}    # distinct protein sequence -> assigned seq_id

# RNA is split here (all whitespace removed first) so this step does not depend
# on any earlier normalisation of `rna_input`.
rna_sequences = [part for part in "".join(rna_input.split()).split(':') if part]


def _next_chain_id():
    """Return the next unused single-letter chain ID, or raise a clear error."""
    try:
        return next(chain_labels)
    except StopIteration:
        raise ValueError(
            f"Too many chains: only {len(ascii_uppercase)} single-letter chain IDs (A-Z) are available."
        ) from None


def _add_entity(kind, field, value):
    """Append one entity of type `kind` to the YAML list under a fresh chain ID."""
    sequences_yaml.append({
        kind: {
            "id": _next_chain_id(),
            field: value,
        }
    })


for seq in protein_sequences:
    seq = seq.strip()
    if not seq:
        continue
    # Identical protein sequences share a single CSV row / seq_id.
    if seq not in seq_to_seq_id:
        seq_id = f"{jobname}_{seq_id_counter}"
        seq_id_counter += 1
        seq_to_seq_id[seq] = seq_id
        csv_entries.append((seq_id, seq))
    _add_entity("protein", "sequence", seq)

for smiles in ligand_sequences:
    smiles = smiles.strip()
    if smiles:
        _add_entity("ligand", "smiles", smiles)

for ccd in ligand_sequences_ccd:
    # CCD codes are normalised to upper case.
    ccd = ccd.strip().upper()
    if ccd:
        _add_entity("ligand", "ccd", ccd)

for dna in dna_sequences:
    dna = dna.strip()
    if dna:
        _add_entity("dna", "sequence", dna)

for rna in rna_sequences:
    _add_entity("rna", "sequence", rna)

# Fail early rather than writing an input file with no entities in it.
if not sequences_yaml:
    raise ValueError("No input provided: enter at least one protein, ligand, DNA or RNA entry.")

# Step 4: Write CSV for MSA (ColabFold)
# One header line followed by one "<id>,<sequence>" row per collected entry.
csv_path = os.path.join(jobname, f"{jobname}.csv")
csv_lines = ["id,sequence\n"]
csv_lines.extend(f"{entry_id},{entry_seq}\n" for entry_id, entry_seq in csv_entries)
with open(csv_path, 'w') as csv_file:
    csv_file.writelines(csv_lines)

# Step 5: Write YAML for Boltz
# sort_keys=False keeps the keys in insertion order within each entry.
yaml_path = os.path.join(jobname, "input.yaml")
boltz_input = {"sequences": sequences_yaml}
with open(yaml_path, 'w') as yaml_file:
    yaml.dump(boltz_input, yaml_file, sort_keys=False)

print(f"✅ YAML written to: {yaml_path}")
print(f"✅ CSV written to: {csv_path}")


✅ YAML written to: MonomerPrediction_36762_0/input.yaml
✅ CSV written to: MonomerPrediction_36762_0/MonomerPrediction_36762_0.csv


In [4]:
#@title Boltz Runtime Settings
#@markdown These will be passed as CLI flags to `boltz predict`

# Every assignment below is a Colab form field stored as a module-level
# variable. The command-building cell reads these names and only forwards a
# value when it differs from that cell's `defaults` table and is not empty.

# General
# NOTE: `out_dir` is captured here, but the command-building cell in this
# notebook passes the job directory name as --out_dir instead of this value.
out_dir = "results"  #@param {type:"string"}
cache_dir = "~/.boltz"  #@param {type:"string"}
checkpoint = ""  #@param {type:"string"}
devices = 1  #@param {type:"integer"}
accelerator = "gpu"  #@param ["gpu", "cpu", "tpu"]
num_workers = 2  #@param {type:"integer"}
output_format = "mmcif"  #@param ["pdb", "mmcif"]

# Sampling
sampling_steps = 200  #@param {type:"integer"}
diffusion_samples = 1  #@param {type:"integer"}
max_parallel_samples = 5  #@param {type:"integer"}
step_scale = 1.638  #@param {type:"number"}

# MSA
use_msa_server = True  #@param {type:"boolean"}
msa_server_url = "https://api.colabfold.com"  #@param {type:"string"}
msa_pairing_strategy = "greedy"  #@param ["greedy", "complete"]
max_msa_seqs = 8192  #@param {type:"integer"}
subsample_msa = False  #@param {type:"boolean"}
num_subsampled_msa = 1024  #@param {type:"integer"}

# Affinity settings (optional, for protein-ligand tasks later)
affinity_checkpoint = ""  #@param {type:"string"}
sampling_steps_affinity = 200  #@param {type:"integer"}
diffusion_samples_affinity = 5  #@param {type:"integer"}
affinity_mw_correction = False  #@param {type:"boolean"}

# Preprocessing and advanced flags
method = ""  #@param {type:"string"}
preprocessing_threads = 4  #@param {type:"integer"}
no_trifast = False  #@param {type:"boolean"}
override = False  #@param {type:"boolean"}
use_potentials = False  #@param {type:"boolean"}
write_full_pae = False  #@param {type:"boolean"}
write_full_pde = False  #@param {type:"boolean"}

# The seed field is currently disabled, so no seed variable is defined by this cell.
# # For reproducibility
# random_seed = 42  #@param {type:"integer"}

print("✅ All runtime flags captured.")




✅ All runtime flags captured.


In [8]:
import shlex

# Required args
yaml_path = f"{jobname}/input.yaml"
output_dir = jobname

# CLI defaults based on Boltz documentation
defaults = {
    "sampling_steps": 200,
    "diffusion_samples": 1,
    "seed": None,
    "device": "",  # empty means auto
    "use_msa_server": False,
    "msa_server_url": "https://api.colabfold.com",
    "msa_pairing_strategy": "greedy",
    "max_parallel_samples": 5,
    "step_scale": 1.638,
    "output_format": "mmcif",
    "num_workers": 2,
    "max_msa_seqs": 8192,
    "subsample_msa": False,
    "num_subsampled_msa": 1024,
    "no_trifast": False,
    "override": False,
    "use_potentials": False,
    "write_full_pae": False,
    "write_full_pde": False,
    "checkpoint": "",
    "cache_dir": "~/.boltz",
    "devices": 1,
    "accelerator": "gpu",
    "preprocessing_threads": None,
    "method": "",
    "affinity_checkpoint": "",
    "sampling_steps_affinity": 200,
    "diffusion_samples_affinity": 5,
    "affinity_mw_correction": False,
}
# User-specified values (names match your param block exactly)
# User-specified values (names match your @param block exactly)
flags = {
    "sampling_steps": sampling_steps,
    "diffusion_samples": diffusion_samples,
    "use_msa_server": use_msa_server,
    "msa_server_url": msa_server_url,
    "msa_pairing_strategy": msa_pairing_strategy,
    "max_parallel_samples": max_parallel_samples,
    "step_scale": step_scale,
    "output_format": output_format,
    "num_workers": num_workers,
    "max_msa_seqs": max_msa_seqs,
    "subsample_msa": subsample_msa,
    "num_subsampled_msa": num_subsampled_msa,
    "no_trifast": no_trifast,
    "override": override,
    "use_potentials": use_potentials,
    "write_full_pae": write_full_pae,
    "write_full_pde": write_full_pde,
    "checkpoint": checkpoint,
    "cache_dir": cache_dir,
    "devices": devices,
    "accelerator": accelerator,
    "preprocessing_threads": preprocessing_threads,
    "method": method,
    "affinity_checkpoint": affinity_checkpoint,
    "sampling_steps_affinity": sampling_steps_affinity,
    "diffusion_samples_affinity": diffusion_samples_affinity,
    "affinity_mw_correction": affinity_mw_correction,
}


# Build command
cmd = ["boltz", "predict", "--out_dir", output_dir]

for k, v in flags.items():
    default = defaults[k]
    if v != default and v is not None and v != "":
        flag = f"--{k}".replace("_", "-")
        if isinstance(v, bool):
            if v:
                cmd.append(flag)
        else:
            cmd += [flag, str(v)]

# Positional YAML file argument
cmd.append(yaml_path)

# Print full command
print("✅ Running command:\n", " ".join(shlex.quote(arg) for arg in cmd))

# Execute
!{" ".join(shlex.quote(arg) for arg in cmd)}

✅ Running command:
 boltz predict --out_dir MonomerPrediction_36762_0 --use-msa-server --preprocessing-threads 4 MonomerPrediction_36762_0/input.yaml
Traceback (most recent call last):
  File "/usr/local/bin/boltz", line 5, in <module>
    from boltz.main import cli
  File "/usr/local/lib/python3.11/dist-packages/boltz/main.py", line 15, in <module>
    from pytorch_lightning import Trainer, seed_everything
ModuleNotFoundError: No module named 'pytorch_lightning'
