In [None]:
import glob
import logging
logging.basicConfig(level=logging.INFO)
import numpy as np
import os
import sys
sys.path.append("/path/to/my_pyrosetta_package/")
import pyrosetta
pyrosetta.init()
import pyrosetta.distributed
import pyrosetta.distributed.io as io
import pyrosetta.distributed.viewer as viewer
import sys
import subprocess

INFO:pyrosetta.rosetta:Found rosetta database at: /home/vesper/my_pyrosetta_package/pyrosetta/database; using it....
INFO:pyrosetta.rosetta:┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta devel 2024.24.post.dev+4.main.f5ae1de8e146ed3da2662da903342c9c1ad0b046 2024-08-12T12:35:30] retrieved from: https://github.com/RosettaCommons/rosetta.git
INFO:rosetta:core.init: Checking

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta devel 2024.24.post.dev+4.main.f5ae1de8e146ed3da2662da903342c9c1ad0b046 2024-08-12T12:35:30] retrieved from: https://github.com/RosettaCommons/rosetta.git


In [None]:
import glob
import os
import subprocess
from rdkit import Chem

# 设置路径
pdb_file = "8tze.pdb" # 输入的 PDB 文件
protein_pdb = "pro.pdb" # 输出的蛋白质 PDB 文件

ligand_files = sorted(glob.glob("ligands/*.mol2")) # 输入的配体文件夹

rosetta_script = "/path/to/install/rosetta/source/scripts/python/public/molfile_to_params.py"

# === 分离蛋白和配体 ===
subprocess.run(f"grep ^ATOM {pdb_file} > {protein_pdb}", shell=True)



# 计算电荷

def get_formal_charge(mol2_file):
    mol = Chem.MolFromMol2File(mol2_file, sanitize=True)
    if mol is None:
        raise ValueError(f"无法读取 {mol2_file}")
    return Chem.GetFormalCharge(mol)


# === 处理配体 ===
for ligand_mol2 in ligand_files:
    ligand_name = os.path.splitext(os.path.basename(ligand_mol2))[0]
    params_file = f"{ligand_name.upper()}.params"

    print(f"\n>>> 处理配体：{ligand_name}")

    charge = get_formal_charge(ligand_mol2)
    subprocess.run(f"acpype -i {ligand_mol2} -n {charge} -a gaff", shell=True)

    # 2. 寻找 ACPYPE 输出的 mol2 文件
    ligand_gaff_candidates = glob.glob(f"{ligand_name}.acpype/*_bcc_gaff.mol2")
    if not ligand_gaff_candidates:
        raise FileNotFoundError(f"未找到 {ligand_name}.acpype 下的 *_bcc_gaff.mol2，ACPYPE 可能失败！")
    ligand_gaff = ligand_gaff_candidates[0]

    # 3. 修正 GAFF 原子类型
    with open(ligand_gaff, "r") as f:
        content = f.readlines()

    fix_dict = {
        "c3": "C", "ca": "C", "cg": "C", "ch": "C", "cd": "C", "cc": "C","cp": "C", "c": "C",
        "nd": "N", "na": "N", "n3": "N", "n2": "N", "n4": "N", "NH": "N", "n": "N",  "nh": "N", "nc": "N", "NC": "N",
        "o3": "O",
        "o": "O", "o2": "O",
        "f": "F",
        "s6": "S", "s4": "S",
        "cl": "Cl",
        "br": "Br",
        "i": "I",
        "p3": "P",
        "h1": "H", "h4": "H", "ha": "H", "hn": "H", "HC": "H", "hc": "H", "ho": "H", "hs": "H", "hx": "H",
    }

    fixed_content = []
    for line in content:
        split_line = line.split()
        if len(split_line) > 5:
            atom_type = split_line[5]
            if atom_type in fix_dict:
                split_line[5] = fix_dict[atom_type]
                line = " ".join(split_line) + "\n"
        fixed_content.append(line)

    with open(ligand_gaff, "w") as f:
        f.writelines(fixed_content)

    # 4. 生成 Rosetta params 文件
    rosetta_cmd = f"python {rosetta_script} -n {ligand_name.upper()} --extra_torsion_output {ligand_gaff} --clobber"
    result = subprocess.run(rosetta_cmd, shell=True, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(f"错误: `molfile_to_params.py` 运行失败！\n{result.stderr}")

    if not os.path.exists(params_file):
        raise RuntimeError(f"错误: `{params_file}` 未生成！")

    print(f"{params_file} 生成成功")

print("\n所有配体处理完成！")


INFO:rdkit:Enabling RDKit 2024.03.5 jupyter extensions



>>> 处理配体：1919888-06-4
| ACPYPE: AnteChamber PYthon Parser interfacE v. 2023.10.27 (c) 2025 AWSdS |
==> ... charge set to 0
==> Executing Antechamber...
==> * Antechamber OK *
==> * Parmchk OK *
==> Executing Tleap...
==> * Tleap OK *
==> Removing temporary files...
==> Using OpenBabel v.3.1.0

==> Writing NEW PDB file

==> Writing CNS/XPLOR files

==> Writing GROMACS files

==> Disambiguating lower and uppercase atomtypes in GMX top file, even if identical.

==> Writing GMX dihedrals for GMX 4.5 and higher.

==> Writing CHARMM files

==> Writing pickle file 1919888-06-4.pkl
==> Removing temporary files...
Total time of execution: 1m 12s
1919888-06-4.params 生成成功

>>> 处理配体：918505-61-0
| ACPYPE: AnteChamber PYthon Parser interfacE v. 2023.10.27 (c) 2025 AWSdS |
==> ... charge set to -1
==> Executing Antechamber...
==> * Antechamber OK *
==> * Parmchk OK *
==> Executing Tleap...
==> * Tleap OK *
==> Removing temporary files...
==> Using OpenBabel v.3.1.0

==> Writing NEW PDB file

==> Wri

In [3]:
from pyrosetta import init, pose_from_pdb
from pyrosetta.toolbox import cleanATOM

# 初始化 PyRosetta
init()

# 清理 PDB 文件，只保留 ATOM 行
cleanATOM("pro.pdb")  # 生成的是 pro.clean.pdb

# 加载原始和清理后的 Pose
pose = pose_from_pdb("pro.pdb")
pose_clean = pose_from_pdb("pro.clean.pdb")


INFO:pyrosetta.rosetta:Found rosetta database at: /home/vesper/my_pyrosetta_package/pyrosetta/database; using it....
INFO:pyrosetta.rosetta:┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta devel 2024.24.post.dev+4.main.f5ae1de8e146ed3da2662da903342c9c1ad0b046 2024-08-12T12:35:30] retrieved from: https://github.com/RosettaCommons/rosetta.git
INFO:rosetta:core.init: Checking

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta devel 2024.24.post.dev+4.main.f5ae1de8e146ed3da2662da903342c9c1ad0b046 2024-08-12T12:35:30] retrieved from: https://github.com/RosettaCommons/rosetta.git


INFO:rosetta:core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
INFO:rosetta:core.chemical.GlobalResidueTypeSet: Total time to initialize 0.513777 seconds.
INFO:rosetta:core.import_pose.import_pose: File 'pro.pdb' automatically determined to be of type PDB
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 3 because of missing atom number 6 atom name  CG1
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 4 because of missing atom number 6 atom name  OG
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 5 because of missing atom number 6 atom name  CG
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 12 because of missing atom number 6 atom name  CG
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 25 because of missing atom number 6 atom name  CG
INFO:rosetta:core.pack.pack_missing_sidechains: packing residue number 26 be

In [4]:
import difflib

def find_difference_difflib(str1, str2):
    d = difflib.Differ()
    diff = list(d.compare(str1, str2))
    return '|'.join(diff)

# Differences between uncleaned and cleaned sequences
find_difference_difflib(pose.sequence(),
                        pose_clean.sequence())

# non-canonical amino acids and hetatms are spelled out more explicitly now
pose.annotated_sequence()


# non-canonical amino acids and hetero atoms (`hetatms`) are spelled out more explicitly now
pose_clean.annotated_sequence()

'G[GLY:NtermProteinFull]IISRRDVEYFKLLEKFQIALPIGEEYLLVPSSLSDHRPVIELPHCENSEIIIRLYEMPYFPMGFWSRLINRLLEISPYMLLRPNRMYWRQGIYLNWSPEAYCLVGSEVLDNHPESFLKITVPSCRKGCILLGQVVDHIDSLMEEWFPGLLEIETLLKKWALYSFNDGEEHQKILLDDLMKKAEEGDLLVNPDQPRLTIPISQIAPDLILADLPRNIMLNNDELEFEQAPEFLLGDGSFGSVYRAAYEGEEVAVKIFNKHTSLRLLRQELVVLCHLHHPSLISLLAAGIRPRMLVMELASKGSLDRLLQQDKASLTRTLQHRIALHVADGLRYLHSAMIIYRDLKPHNVLLFTLYPNAAIIAKIADYGIPGFRAPEVARGYNQQADVYSFGLLLYDILTTGGRIVLPDPVKEYGCAPWPMVEKLIKQCLKENPQERPTSAQVFDILNSAELVCLTRRILLPKNVIVECMVATHHASIWLGCGHTDRGQLSFLDLNTEGYTSEEVADSRILCLALVHLESWIVSGTQSGTLLVINTEDGKHTLEKMTDSVTCLYCNFLLVGTADGKLAIFEDKTVKLKGAAPLKILNIGNVSTPLMCLSESNVMWGGCGTKIFSFSNDFTIQKLIETRTSQLFSYAAFSDSNIITVVVDTALYIAKQNSPVVEVWDKKTEKLCGLIDCVHFLREVMSGRVKTLCLQKNTALWIGTGGGHILLLDLSTRRLIRVIYNFCNSVRVMMTAQLGSLKNVMLVLGYNREIQSCLTVWDINLPHEVQNLEKHIEVRKELAEKMR[ARG:CtermProteinFull]'

In [5]:
# Iterate over atoms and identify heteroatoms
for residue_id in range(1, pose.size() + 1):
    residue = pose.residue(residue_id)
    if residue.is_metal():
        print(f"HETATM found in residue {residue_id}: {residue.name3()}")

In [6]:
# Iterate over atoms and identify heteroatoms
for residue_id in range(1, pose_clean.size() + 1):
    residue = pose_clean.residue(residue_id)
    if residue.is_metal():
        print(f"HETATM found in residue {residue_id}: {residue.name3()}")

In [7]:
# Count the number of chains in the pose
print(pose_clean.num_chains())

1


In [None]:
# Access the information corresponding to the PDB file
print(pose_clean.pdb_info())   # 看pose id 的方法

PDB file name: pro.clean.pdb
 Pose Range  Chain    PDB Range  |   #Residues         #Atoms

0001 -- 0009    A 1624  -- 1632  |   0009 residues;    00149 atoms
0010 -- 0081    A 1649  -- 1720  |   0072 residues;    01229 atoms
0082 -- 0153    A 1727  -- 1798  |   0072 residues;    01172 atoms
0154 -- 0369    A 1805  -- 2020  |   0216 residues;    03500 atoms
0370 -- 0380    A 2036  -- 2046  |   0011 residues;    00163 atoms
0381 -- 0405    A 2050  -- 2074  |   0025 residues;    00395 atoms
0406 -- 0474    A 2092  -- 2160  |   0069 residues;    01122 atoms
0475 -- 0517    A 2165  -- 2207  |   0043 residues;    00647 atoms
0518 -- 0539    A 2212  -- 2233  |   0022 residues;    00323 atoms
0540 -- 0554    A 2236  -- 2250  |   0015 residues;    00233 atoms
0555 -- 0600    A 2261  -- 2306  |   0046 residues;    00704 atoms
0601 -- 0685    A 2313  -- 2397  |   0085 residues;    01350 atoms
0686 -- 0752    A 2411  -- 2477  |   0067 residues;    01085 atoms
0753 -- 0787    A 2488  -- 2522  |   

In [9]:
for k, v in pyrosetta.rosetta.core.pose.conf2pdb_chain(pose_clean).items():
    print(f"Chain {v}: {pose_clean.chain_sequence(k)}")

Chain A: GIISRRDVEYFKLLEKFQIALPIGEEYLLVPSSLSDHRPVIELPHCENSEIIIRLYEMPYFPMGFWSRLINRLLEISPYMLLRPNRMYWRQGIYLNWSPEAYCLVGSEVLDNHPESFLKITVPSCRKGCILLGQVVDHIDSLMEEWFPGLLEIETLLKKWALYSFNDGEEHQKILLDDLMKKAEEGDLLVNPDQPRLTIPISQIAPDLILADLPRNIMLNNDELEFEQAPEFLLGDGSFGSVYRAAYEGEEVAVKIFNKHTSLRLLRQELVVLCHLHHPSLISLLAAGIRPRMLVMELASKGSLDRLLQQDKASLTRTLQHRIALHVADGLRYLHSAMIIYRDLKPHNVLLFTLYPNAAIIAKIADYGIPGFRAPEVARGYNQQADVYSFGLLLYDILTTGGRIVLPDPVKEYGCAPWPMVEKLIKQCLKENPQERPTSAQVFDILNSAELVCLTRRILLPKNVIVECMVATHHASIWLGCGHTDRGQLSFLDLNTEGYTSEEVADSRILCLALVHLESWIVSGTQSGTLLVINTEDGKHTLEKMTDSVTCLYCNFLLVGTADGKLAIFEDKTVKLKGAAPLKILNIGNVSTPLMCLSESNVMWGGCGTKIFSFSNDFTIQKLIETRTSQLFSYAAFSDSNIITVVVDTALYIAKQNSPVVEVWDKKTEKLCGLIDCVHFLREVMSGRVKTLCLQKNTALWIGTGGGHILLLDLSTRRLIRVIYNFCNSVRVMMTAQLGSLKNVMLVLGYNREIQSCLTVWDINLPHEVQNLEKHIEVRKELAEKMR
