In [1]:
import os
import pyrosetta

# Purpose of this notebook: after filtering for good protein scaffolds, PyRosetta is
# used to map the 3ult sequence back onto them. That only requires locating the region
# that is not a continuous run of glycine - that region should be the main body, made
# of valine and glutamate.

# Start the Rosetta runtime once per kernel. ex1/ex2 are Rosetta packer flags
# (extra chi1/chi2 rotamer sampling, per the Rosetta option documentation).
pyrosetta.init("--ex1 --ex2")

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python311.Release 2024.42+release.3366cf78a3df04339d1982e94531b77b098ddb99 2024-10-11T08:24:04] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

'After filter out the good protein scaffold, I use pyrosetta to map the 3ult sequence back.\nThat is just need to find out the region that is not continuous glycine - that should be\nthe main body made of valine and glutamamte.\n'

In [3]:

def one_letter_to_three(letter):
    """Translate a one-letter amino-acid code into its three-letter name.

    Only the 20 canonical amino acids are known, and only as upper-case letters.

    Args:
        letter: One-letter amino-acid code, e.g. ``'V'``.

    Returns:
        The upper-case three-letter code (e.g. ``'VAL'``), or ``None`` when
        ``letter`` is not one of the 20 recognised codes.
    """
    # Table is laid out alphabetically by three-letter name.
    three_letter_by_code = {
        'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP',
        'C': 'CYS', 'Q': 'GLN', 'E': 'GLU', 'G': 'GLY',
        'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS',
        'M': 'MET', 'F': 'PHE', 'P': 'PRO', 'S': 'SER',
        'T': 'THR', 'W': 'TRP', 'Y': 'TYR', 'V': 'VAL',
    }
    try:
        return three_letter_by_code[letter]
    except KeyError:
        # Unknown code: callers receive None rather than an exception.
        return None


In [4]:
def detect_glycine_caps(pose):
    """Automatically detect the N cap and C cap lengths based on glycine residues in PyRosetta.

    The N cap is the uninterrupted run of glycine starting at residue 1; the C cap
    is the uninterrupted run of glycine ending at the last residue. The two runs
    never overlap: for a pose made only of glycine the whole chain is reported as
    the N cap and the C cap length is 0, so ``n_cap_len + c_cap_len`` can never
    exceed the number of residues.

    Args:
        pose: Object exposing ``total_residue()`` and ``residue(i).name1()`` with
            1-based residue indices (a PyRosetta pose).

    Returns:
        Tuple ``(n_cap_len, c_cap_len)`` of residue counts.
    """
    # Otherwise you can use the (MDanalysis method) in 20250101_filter_cap_1.ipynb
    # or directly reading from pdb file in 20241226_clean_pdb_sequence_pyrosetta_repack.ipynb
    total = pose.total_residue()

    # Detect N cap (continuous glycine residues at the start)
    n_cap_len = 0
    for resi in range(1, total + 1):
        if pose.residue(resi).name1() != 'G':
            break
        n_cap_len += 1

    # Detect C cap (continuous glycine residues at the end). The scan stops where
    # the N cap ends so residues already assigned to the N cap are not counted again.
    c_cap_len = 0
    for resi in range(total, n_cap_len, -1):
        if pose.residue(resi).name1() != 'G':
            break
        c_cap_len += 1

    return n_cap_len, c_cap_len


In [5]:
def modify_main_body_sequence(pose, input_sequence, n_cap_len, c_cap_len):
    """Replace only the main body sequence, leaving the N and C caps unchanged.

    The main body is residues ``n_cap_len + 1`` through
    ``pose.total_residue() - c_cap_len`` (1-based, inclusive). Residues whose
    one-letter code already matches the target are left alone; the others are
    replaced in place on ``pose``.

    Every replacement is planned and validated before the pose is touched, so a
    bad input sequence never leaves the pose half-mutated.

    Args:
        pose: PyRosetta pose to modify in place.
        input_sequence: One-letter amino-acid codes, one per main-body residue.
        n_cap_len: Number of residues at the start of the pose to leave unchanged.
        c_cap_len: Number of residues at the end of the pose to leave unchanged.

    Raises:
        ValueError: If ``input_sequence`` does not have exactly one letter per
            main-body residue, or if a residue that needs replacing has a letter
            that ``one_letter_to_three`` does not recognise.
    """
    main_body_start = n_cap_len + 1
    main_body_end = pose.total_residue() - c_cap_len

    # Calculate the expected main body length
    expected_main_body_length = main_body_end - main_body_start + 1
    print(f"Detected N cap length: {n_cap_len}, C cap length: {c_cap_len}")
    print(f"Main body start: {main_body_start}, Main body end: {main_body_end}")
    print(f"Expected main body length: {expected_main_body_length}")
    print(f"Input sequence length: {len(input_sequence)}")

    # Check if input sequence length matches the main body length
    if len(input_sequence) != expected_main_body_length:
        raise ValueError("Input sequence length does not match the number of residues in the main body.")

    # First pass: work out which residues actually change and validate their codes.
    planned_mutations = []  # (pose residue index, three-letter residue name)
    unsupported = []
    for resi, target_residue in enumerate(input_sequence, start=main_body_start):
        # Only mutate if the target residue is different from the current residue
        if pose.residue(resi).name1() == target_residue:
            continue
        target_residue_3letter = one_letter_to_three(target_residue)
        if target_residue_3letter is None:
            unsupported.append(f"{target_residue!r} at pose residue {resi}")
        else:
            planned_mutations.append((resi, target_residue_3letter))

    if unsupported:
        raise ValueError(
            "Input sequence contains unsupported residue codes: " + ", ".join(unsupported)
        )

    if not planned_mutations:
        # Sequence already matches; nothing to replace.
        return

    # Second pass: apply the replacements. The residue type set is looked up once
    # because it does not depend on the residue being replaced.
    residue_type_set = pose.residue_type_set_for_pose()
    for resi, target_residue_3letter in planned_mutations:
        new_residue = pyrosetta.rosetta.core.conformation.ResidueFactory.create_residue(
            residue_type_set.name_map(target_residue_3letter)
        )
        # Third argument is replace_residue's orient_backbone flag (see the PyRosetta
        # Pose.replace_residue docs); True is kept as in the original call.
        pose.replace_residue(resi, new_residue, True)


In [6]:
def process_folder(input_folder, output_folder, input_sequence):
    """Process each PDB file in the input folder, modify the main body sequence, and save it to output folder.

    For every regular file in ``input_folder`` whose name ends in ``.pdb`` the pose
    is loaded, its glycine caps are detected, the main body is replaced with
    ``input_sequence`` and the result is written to
    ``output_folder/modified_<original name>``. Files are handled in sorted name
    order so repeated runs process (and log) them identically.

    Args:
        input_folder: Directory containing the scaffold ``.pdb`` files.
        output_folder: Directory for the modified structures; created if missing.
        input_sequence: One-letter main-body sequence applied to every pose.

    Raises:
        ValueError: Propagated from ``modify_main_body_sequence`` when a pose's
            main body does not fit ``input_sequence``; processing stops at that file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    for pdb_file in sorted(os.listdir(input_folder)):
        input_pdb_path = os.path.join(input_folder, pdb_file)
        # Skip anything that is not a .pdb file, including directories named *.pdb.
        if not pdb_file.endswith(".pdb") or not os.path.isfile(input_pdb_path):
            continue

        output_pdb_path = os.path.join(output_folder, f"modified_{pdb_file}")

        # Load the pose and detect cap regions
        pose = pyrosetta.pose_from_file(input_pdb_path)
        n_cap_len, c_cap_len = detect_glycine_caps(pose)

        # Modify only the main body sequence
        modify_main_body_sequence(pose, input_sequence, n_cap_len, c_cap_len)

        # Save the modified pose
        pose.dump_pdb(output_pdb_path)
        print(f"Processed {pdb_file} -> {output_pdb_path}")


In [7]:
# Example usage
# All cap folders live under one calculation directory; change the location here only.
cap_base_dir = '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/1_cap_calculation'
n_cap_folder = os.path.join(cap_base_dir, 'N_cap')
c_cap_folder = os.path.join(cap_base_dir, 'C_cap')
output_n_folder = os.path.join(cap_base_dir, 'N_cap_add_3ultback')
output_c_folder = os.path.join(cap_base_dir, 'C_cap_add_3ultback')

# Define the sequence to replace the main body with (one-letter codes; it must have
# exactly one letter per main-body residue of every scaffold). Swap in another
# sequence here if needed.
main_body_sequence = "PNTISGSNNTVRSGSKNVLAGNDNTVISGDNNSVSGSNNTVVSGNDNTVTGSNHVVSGTNHIVTDNNNNVSGNDNNVSGSFHTVSGGHNTVSGSNNTVSGSNHVVSGSNKVVTD"

# Process N cap and C cap folders
process_folder(n_cap_folder, output_n_folder, main_body_sequence)
process_folder(c_cap_folder, output_c_folder, main_body_sequence)

core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 0.796137 seconds.
core.import_pose.import_pose: File '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/1_cap_calculation/N_cap/Ncap_3ult_cropNT_001_cropCT_114_extendlen_033_cycle_01_3.pdb' automatically determined to be of type PDB
core.pack.pack_missing_sidechains: packing residue number 34 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 35 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 36 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 37 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 38 because of missing atom number 5 atom name  CB
core.pack.pack_m