In [4]:
import os
import json
from Bio.PDB import PDBParser
from Bio.Data import IUPACData
from rdkit import Chem
import datamol as dm
from pathlib import Path

# Switch off RDKit log output (through datamol) before any molecule is parsed,
# so the per-molecule messages do not drown the progress prints of the loop below.
dm.disable_rdkit_log()

def extract_all_chains_from_pdb(pdb_path: str):
    """Read one-letter sequences for every chain of the first model in a PDB file.

    Only residues whose hetero flag (first element of the residue id) is a
    single space are used; all other residues are ignored. Three-letter
    residue names are translated with ``IUPACData.protein_letters_3to1``;
    names missing from that table become ``"X"``.

    Args:
        pdb_path: Path of the PDB file to parse.

    Returns:
        dict mapping chain id -> sequence string. A chain with no matching
        residues maps to ``""``. An empty dict is returned when the parsed
        structure contains no model.
    """
    structure = PDBParser(QUIET=True).get_structure("struct", pdb_path)
    code_table = IUPACData.protein_letters_3to1

    sequences = {}
    # Only the first model is read; later models are not inspected.
    first_model = next(iter(structure), None)
    if first_model is None:
        return sequences

    for chain in first_model:
        sequences[chain.id] = "".join(
            # Table keys are capitalised ("Ala"), hence upper() -> capitalize().
            code_table.get(res.get_resname().strip().upper().capitalize(), "X")
            for res in chain.get_residues()
            if res.get_id()[0] == " "
        )
    return sequences

def get_ligand_smiles(sdf_path: str) -> str:
    """Return a standardized SMILES for the first usable record of an SDF file.

    Records are read unsanitized and tried in file order. For each record the
    molecule is sanitized on a best-effort basis, round-tripped through an
    isomeric SMILES string, then passed through datamol's ``fix_mol``,
    ``sanitize_mol`` and ``standardize_mol``. The first record that survives
    every step determines the result.

    Args:
        sdf_path: Path of the SDF file holding the ligand.

    Returns:
        The SMILES string of the first record that could be processed, or
        ``""`` when no record could.
    """
    supplier = Chem.SDMolSupplier(str(sdf_path), sanitize=False)
    for raw_mol in supplier:
        if raw_mol is None:
            continue
        try:
            Chem.SanitizeMol(raw_mol)
        except Exception:
            # Best effort only: a record that fails sanitization is still
            # handed to datamol below. `Exception` (not a bare except) keeps
            # KeyboardInterrupt/SystemExit able to stop a long batch run.
            pass
        raw_smiles = Chem.MolToSmiles(raw_mol, isomericSmiles=True)
        mol = dm.to_mol(raw_smiles, ordered=True)
        if mol is None:
            # The SMILES could not be parsed back; try the next record.
            continue
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        if mol is None:
            continue
        try:
            mol = dm.standardize_mol(
                mol,
                disconnect_metals=False,
                normalize=True,
                reionize=True,
                uncharge=False,
                stereo=True,
            )
        except Exception:
            # Standardization failed for this record; fall through to the next.
            continue
        return dm.to_smiles(mol)
    return ""

# Inputs: a text file with one PDB id per line and the processed PDBBind tree
# (one sub-directory per id). Output: one AlphaFold 3 input JSON per id.
pdb_list_file = Path("/mnt/ligandpro/data/docking_hack/nikolenko_calc/timesplit_test_calc/timesplit_test")
base_dir = Path("/mnt/ligandpro/data/talgat/flowdock_data/PDBBind_processed/")
json_dir = Path("json_inputs")
json_dir.mkdir(parents=True, exist_ok=True)

with open(pdb_list_file, "r") as f:
    pdb_ids = [line.strip() for line in f if line.strip()]

ligand_count = 0       # JSON files written that contain a ligand entry
no_ligand_count = 0    # targets with a missing SDF or no derivable SMILES
missing_pdb_count = 0  # targets skipped because the protein PDB is absent

for pdb_id in pdb_ids:
    pdb_dir = base_dir / pdb_id
    pdb_file = pdb_dir / f"{pdb_id}_protein_processed.pdb"
    sdf_file = pdb_dir / f"{pdb_id}_ligand.sdf"
    print(f"Processing {pdb_id}, PDB file: {pdb_file}, SDF file: {sdf_file}")
    if not pdb_file.exists():
        print(f"PDB file not found: {pdb_file}")
        missing_pdb_count += 1
        continue
    if not sdf_file.exists():
        print(f"SDF file not found: {sdf_file}")
        no_ligand_count += 1
        continue

    chain_sequences = extract_all_chains_from_pdb(str(pdb_file))
    print(f"Found {len(chain_sequences)} chains: {chain_sequences}")
    ligand_smiles = get_ligand_smiles(str(sdf_file))
    print(f"Ligand SMILES: {ligand_smiles or 'no ligand'}")

    # Chains whose sequence came out empty carry nothing to model, so they
    # are not emitted as protein entries.
    seqs = [
        {"protein": {"id": chain_id, "sequence": seq}}
        for chain_id, seq in chain_sequences.items()
        if seq
    ]

    if ligand_smiles:
        # Entity ids have to be unique within one input. A protein chain may
        # itself be called "L", so extend the ligand id until it is unused.
        ligand_id = "L"
        while ligand_id in chain_sequences:
            ligand_id += "L"
        seqs.append({"ligand": {"id": ligand_id, "smiles": ligand_smiles}})
        ligand_count += 1
    else:
        # get_ligand_smiles() returned "": do not write an empty-SMILES ligand
        # entry, and do not count this target as having a ligand.
        no_ligand_count += 1

    data = {
        "name": pdb_id,
        "modelSeeds": [0],
        "sequences": seqs,
        "dialect": "alphafold3",
        "version": 2
    }

    out_json = json_dir / f"{pdb_id}.json"
    with open(out_json, "w") as o:
        json.dump(data, o, indent=2)
    print(f"JSON saved: {out_json}")

print(f"\nTotal processed: {len(pdb_ids)}")
print(f"With ligands: {ligand_count}")
print(f"Without ligands: {no_ligand_count}")
print(f"Skipped (missing PDB): {missing_pdb_count}")


Processing 6qqw, PDB file: /mnt/ligandpro/data/talgat/flowdock_data/PDBBind_processed/6qqw/6qqw_protein_processed.pdb, SDF file: /mnt/ligandpro/data/talgat/flowdock_data/PDBBind_processed/6qqw/6qqw_ligand.sdf
Found 2 chains: {'A': 'SMKIDVVTIFPEYLQPVRQSLPGKAIDAGLVDVAVHDLRRWTHDVHKSVDDSPYGGGPGMVMKPTVWGDALDEICTSETLLVVPTPAGYPFTQETAWQWSTEDHLVIACGRYEGIDQRVADDAATRMRVREVSIGDYVLNGGEAAALVIIEAVLRLVPGVLSLLEGPSYTRPPSWRGMDVPPVLLSGDHAKIAAWRAEQSRQRTIERRPDLL', 'B': 'SMKIDVVTIFPEYLQPVGLVDVAVHDLRRWTSVDDSPYGGGPGMVMKPTVWGDALDEICTSETLLVVPTPAGYPFTQETAWQWSTEDHLVIACGRYEGIDQRVADDAATRMRVREVSIGDYVLNGGEAAALVIIEAVLRLVPGSLLEGPSYTRPPSWRGMDVPPVLLSGDHAKIAAWRAEQSRQRTIERRPDLLGFDSP'}
Ligand SMILES: NC1CC(c2ccc3ccn(Cc4ccccc4)c3c2)NN1
JSON saved: json_inputs/6qqw.json
Processing 6d08, PDB file: /mnt/ligandpro/data/talgat/flowdock_data/PDBBind_processed/6d08/6d08_protein_processed.pdb, SDF file: /mnt/ligandpro/data/talgat/flowdock_data/PDBBind_processed/6d08/6d08_ligand.sdf
Found 1 chains: {'A': 'GEFVVEKVLDRRVVKGKVEYLLKWKGGSD

In [10]:
# AlphaFold 3 inference for a single input JSON, restricted to GPU device 3.
# Inputs are mounted from the DockGen json_inputs_template folder; the CPU
# counts for jackhmmer and nhmmer are set explicitly to 16.
docker run --rm -i -t \
    --gpus device=3 \
    --env DB_DIR=/root/public_databases \
    -v /mnt/ligandpro/data/DockGen/json_inputs_template:/root/af_input \
    -v /mnt/ligandpro/soft/protein/alphafold3/infer_output:/root/af_output \
    -v /mnt/ligandpro/soft/protein/alphafold3/models:/root/models \
    -v /mnt/ligandpro/db/AF3:/root/public_databases \
    alphafold3 \
    python run_alphafold.py \
    --model_dir=/root/models \
    --json_path=/root/af_input/1hg0_1_SIN_1.json \
    --output_dir=/root/af_output \
    --jackhmmer_n_cpu=16 \
    --nhmmer_n_cpu=16



# AlphaFold 3 inference for a single input JSON with every GPU exposed to the
# container. Inputs are mounted from the alphafold3/infer_input folder.
docker run --rm -i -t \
    --gpus all \
    --env DB_DIR=/root/public_databases \
    -v /mnt/ligandpro/soft/protein/alphafold3/infer_input:/root/af_input \
    -v /mnt/ligandpro/soft/protein/alphafold3/infer_output:/root/af_output \
    -v /mnt/ligandpro/soft/protein/alphafold3/models:/root/models \
    -v /mnt/ligandpro/db/AF3:/root/public_databases \
    alphafold3 \
    python run_alphafold.py \
    --model_dir=/root/models \
    --json_path=/root/af_input/1hg0_1_SIN_1.json \
    --output_dir=/root/af_output

