Before starting the notebook, activate the conda environment: `conda activate getcontact`

This notebook contains several scripts:
1. Clean and renumber the PDB files in a folder
2. Output the protein sequence from a PDB file
3. Repack the protein sequence using PyRosetta

In [21]:
from genericpath import isdir
from os.path import join
from os import makedirs, listdir
from pymol import cmd # type: ignore

def clean_and_renumber_pdb(input_file, output_file, start_resi=1, chains_to_keep="A"):
    """
    Clean a PDB file with PyMOL and shift its numbering.

    Steps, in order:
    - Remove water (HOH), common ions (NA, CL, MG, CA, ZN) and every HETATM record.
    - Keep only atoms whose alternate location is blank or 'A'.
    - Keep only the chain(s) named in ``chains_to_keep``.
    - Shift all residue numbers by one constant offset so that the residue of the
      first remaining atom becomes ``start_resi``. Gaps and insertion codes of the
      original numbering are kept.
    - Sort the atoms and number them 1..N.

    Args:
        input_file: Path of the PDB file to load.
        output_file: Path the cleaned structure is written to.
        start_resi: Number given to the residue of the first remaining atom.
        chains_to_keep: Chain identifier(s) inserted into a PyMOL ``chain``
            selection (default "A"); a falsy value keeps every chain.

    Raises:
        ValueError: If no atoms are left after filtering.
    """
    cmd.load(input_file, "protein")
    try:
        # Remove water, ions, and unwanted heteroatoms
        cmd.remove("resn HOH or resn NA+CL+MG+CA+ZN or hetatm")

        # Keep only alternate locations 'A' or empty
        cmd.remove("not alt ''+A")

        # Keep only the specified chain(s)
        if chains_to_keep:
            cmd.remove(f"not chain {chains_to_keep}")

        # Read the integer residue numbers (resv) so insertion codes such as "27A"
        # do not break the int() conversion that the string property resi would need.
        residue_numbers = []
        cmd.iterate("all", "residue_numbers.append(resv)",
                    space={"residue_numbers": residue_numbers})
        if not residue_numbers:
            raise ValueError(
                f"No atoms left in {input_file} after cleaning "
                f"(chains_to_keep={chains_to_keep!r})."
            )

        # Shift every residue so the first remaining one is numbered start_resi
        offset = residue_numbers[0] - start_resi
        cmd.alter("all", f"resv = resv - ({offset})")

        # Sort first so the serial numbers below follow the final atom order
        cmd.sort()

        # PyMOL stores the PDB serial number in the atom property ID. The counter
        # lives in a mutable list because a plain name assigned inside an alter
        # expression is not carried over from one atom to the next.
        serial_counter = [0]
        cmd.alter("all", "serial_counter[0] += 1; ID = serial_counter[0]",
                  space={"serial_counter": serial_counter})

        # Save the cleaned and renumbered PDB file
        cmd.save(output_file, "protein")
    finally:
        # Always clear the session so a failure cannot leak atoms into the next file
        cmd.delete("all")


def batch_clean_and_renumber(input_folder, output_folder, chains_to_keep="A"):
    """
    Run clean_and_renumber_pdb on every ``.pdb`` file in a folder.

    Each ``<name>.pdb`` in ``input_folder`` is written to
    ``<output_folder>/<name>_cleaned.pdb``. Files are processed in sorted
    name order so repeated runs print and write in the same sequence.

    Args:
        input_folder: Folder that is scanned (not recursively) for ``.pdb`` files.
        output_folder: Destination folder; created if it does not exist.
        chains_to_keep: Forwarded to clean_and_renumber_pdb.
    """
    suffix = ".pdb"
    makedirs(output_folder, exist_ok=True)

    for file_name in sorted(listdir(input_folder)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing extension so dots inside the name survive
        # (e.g. "model.v2.pdb" -> "model.v2_cleaned.pdb").
        stem = file_name[:-len(suffix)]
        input_path = join(input_folder, file_name)
        output_path = join(output_folder, f"{stem}_cleaned.pdb")
        print(f"Processing: {file_name}")
        clean_and_renumber_pdb(input_path, output_path, chains_to_keep=chains_to_keep)
        print(f"Saved cleaned file: {output_path}")


# Example usage: clean every PDB file in the input folder, keeping chain A only
input_folder = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/input"
output_folder = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output"
batch_clean_and_renumber(
    input_folder=input_folder,
    output_folder=output_folder,
    chains_to_keep="A",
)


Processing: 3ult.pdb
Saved cleaned file: /home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned.pdb


In [22]:
#Output protein sequences
def parse_pdb_sequence(file_path, chain_id='A'):
    """
    Read the one-letter amino-acid sequence of one chain from a PDB file.

    Only ATOM records are considered. A residue is identified by columns 23-27
    (residue number plus insertion code), so it contributes a single letter no
    matter how many atoms it has; residues appear in order of first occurrence.
    Residue names outside the 20 standard amino acids are skipped, as are
    records too short to contain the chain column.

    Args:
        file_path: Path of the PDB file to read.
        chain_id: Chain identifier compared against column 22 (default 'A').

    Returns:
        The sequence as a string; empty if the chain has no matching residues.
    """
    three_to_one = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    chain_column = 21  # 0-based index of the chain identifier

    residues = {}
    with open(file_path, 'r') as handle:
        for line in handle:
            # A truncated record has no chain column; skip it instead of
            # failing with IndexError.
            if not line.startswith('ATOM') or len(line) <= chain_column:
                continue
            if line[chain_column].strip() != chain_id:
                continue
            res_name = line[17:20].strip()
            if res_name in three_to_one:
                residues[line[22:27].strip()] = three_to_one[res_name]
    return "".join(residues.values())

# Print the chain-A sequence of the cleaned structure together with its length.
# Provide the path to your PDB file
pdb_file_path = '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned.pdb'

sequence = parse_pdb_sequence(pdb_file_path)
print(f"Sequence of chain A: {sequence}")
# Measure the parsed sequence itself rather than a pasted copy of it
print(len(sequence))

Sequence of chain A: PNTISGSNNTVRSGSKNVLAGNDNTVISGDNNSVSGSNNTVVSGNDNTVTGSNHVVSGTNHIVTDNNNNVSGNDNNVSGSFHTVSGGHNTVSGSNNTVSGSNHVVSGSNKVVTD
114


In [1]:
# Alternative: read the protein sequence with Biopython instead of parsing ATOM lines by hand
import Bio.PDB

pdb_file_path = '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned.pdb'

structure = Bio.PDB.PDBParser(QUIET=True).get_structure('protein', pdb_file_path)
ppb = Bio.PDB.PPBuilder()
# Join the sequences of all peptides the builder returns into one string
peptide_sequences = [str(pp.get_sequence()) for pp in ppb.build_peptides(structure)]
sequence = "".join(peptide_sequences)
if sequence:
    print(sequence)
else:
    print(f"No valid sequence in {pdb_file_path}. Skipping.")

PNTISGSNNTVRSGSKNVLAGNDNTVISGDNNSVSGSNNTVVSGNDNTVTGSNHVVSGTNHIVTDNNNNVSGNDNNVSGSFHTVSGGHNTVSGSNNTVSGSNHVVSGSNKVVTD


In [24]:
# Collect the sequence of every PDB file in a folder into a dictionary
# keyed by file name.
input_pdb_folder = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/input"
fasta_list = {}
for file_name in listdir(input_pdb_folder):
    if file_name.endswith(".pdb"):
        # Join with the folder that was listed; the previous `input_folder` only
        # worked because an earlier cell happened to define the same path.
        input_path = join(input_pdb_folder, file_name)
        fasta_list[file_name] = parse_pdb_sequence(input_path)
print(fasta_list)

{'3ult.pdb': 'PNTISGSNNTVRSGSKNVLAGNDNTVISGDNNSVSGSNNTVVSGNDNTVTGSNHVVSGTNHIVTDNNNNVSGNDNNVSGSFHTVSGGHNTVSGSNNTVSGSNHVVSGSNKVVTD'}


In [None]:
# Repack a protein structure (optionally after changing its sequence) with PyRosetta.
import pyrosetta # type: ignore
# Initialise PyRosetta before any pose is loaded below; the option string is passed as given.
# NOTE(review): ex1/ex2 presumably request extra rotamer sampling for the packer,
# and Rosetta flags are usually written with a single dash — confirm against the Rosetta option docs.
pyrosetta.init("--ex1 --ex2")
def one_letter_to_three(letter):
    """
    Translate a one-letter amino-acid code into its three-letter code.

    Covers the 20 standard amino acids, upper case only.
    Returns None when ``letter`` is not one of those codes.
    """
    code_pairs = (
        ('A', 'ALA'), ('R', 'ARG'), ('N', 'ASN'), ('D', 'ASP'), ('C', 'CYS'),
        ('Q', 'GLN'), ('E', 'GLU'), ('G', 'GLY'), ('H', 'HIS'), ('I', 'ILE'),
        ('L', 'LEU'), ('K', 'LYS'), ('M', 'MET'), ('F', 'PHE'), ('P', 'PRO'),
        ('S', 'SER'), ('T', 'THR'), ('W', 'TRP'), ('Y', 'TYR'), ('V', 'VAL'),
    )
    lookup = dict(code_pairs)
    try:
        return lookup[letter]
    except KeyError:
        # Unknown code: signal it with None rather than raising
        return None

def modify_protein_sequence(input_pdb: str, output_pdb: str, input_sequence: str) -> None:
    """
    Thread ``input_sequence`` onto the structure in ``input_pdb``, repack it and save it.

    Every position whose current one-letter code differs from the requested one
    is replaced by the requested residue type; afterwards the whole pose is
    repacked with the default score function and written to ``output_pdb``.
    PyRosetta must already be initialised (``pyrosetta.init``) before calling.

    Args:
        input_pdb: Path of the structure to load.
        output_pdb: Path the modified pose is dumped to.
        input_sequence: One-letter sequence with exactly one letter per pose residue.

    Raises:
        ValueError: If the sequence length differs from the number of residues
            in the pose, or if the sequence contains a letter that
            one_letter_to_three cannot translate.
    """
    # Load the pose from the PDB file
    pose = pyrosetta.pose_from_file(input_pdb)
    n_residues = pose.total_residue()

    # The sequence is mapped position by position, so the lengths must agree
    if len(input_sequence) != n_residues:
        raise ValueError(
            "Input sequence length does not match the number of residues in the PDB file."
            f" (sequence: {len(input_sequence)}, pose: {n_residues})"
        )

    # Translate everything up front so an unsupported letter is reported clearly
    # and before any residue of the pose has been replaced.
    target_names = [one_letter_to_three(letter) for letter in input_sequence]
    unsupported = sorted(
        {letter for letter, name in zip(input_sequence, target_names) if name is None}
    )
    if unsupported:
        raise ValueError(
            f"Input sequence contains unsupported residue letter(s): {', '.join(unsupported)}"
        )

    # Map the input sequence onto the pose (pose residue numbering starts at 1)
    for resi, (target_letter, target_name) in enumerate(
        zip(input_sequence, target_names), start=1
    ):
        # Only mutate if the target residue is different from the current residue
        if pose.residue(resi).name1() == target_letter:
            continue
        new_residue = pyrosetta.rosetta.core.conformation.ResidueFactory.create_residue(
            pose.residue_type_set_for_pose().name_map(target_name)
        )
        # Third argument is replace_residue's orient-backbone flag, kept at True.
        # NOTE(review): name_map() is given the plain residue name — confirm that
        # terminal residues keep the variants they need after replacement.
        pose.replace_residue(resi, new_residue, True)

    # Repack the structure: command-line packer options, restricted to repacking
    scorefxn = pyrosetta.rosetta.core.scoring.get_score_function()
    pack_mover = pyrosetta.rosetta.protocols.minimization_packing.PackRotamersMover(scorefxn)
    tf = pyrosetta.rosetta.core.pack.task.TaskFactory()
    tf.push_back(pyrosetta.rosetta.core.pack.task.operation.InitializeFromCommandline())
    tf.push_back(pyrosetta.rosetta.core.pack.task.operation.RestrictToRepacking())
    pack_mover.task_factory(tf)
    pack_mover.apply(pose)

    # Dump the modified pose to the defined file
    pose.dump_pdb(output_pdb)


# Example usage: thread a 114-letter V/E test pattern onto the cleaned structure
dammy_seq = "EVE" + "V" * 9 + "E" * 90 + "V" * 8 + "EVEE"
input_pdb = '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned.pdb'
output_pdb = '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned_VE.pdb'
input_sequence = dammy_seq  # Replace with the actual input sequence
modify_protein_sequence(
    input_pdb=input_pdb,
    output_pdb=output_pdb,
    input_sequence=input_sequence,
)

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta PyRosetta4.conda.ubuntu.cxx11thread.serialization.Ubuntu.python311.Release 2024.42+release.3366cf78a3df04339d1982e94531b77b098ddb99 2024-10-11T08:24:04] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.ubuntu.cxx11thread.ser

core.import_pose.import_pose: File '/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/output/3ult_cleaned.pdb' automatically determined to be of type PDB
core.scoring.ScoreFunctionFactory: SCOREFUNCTION: ref2015
core.pack.task: Packer task: initialize from command line()
core.pack.pack_rotamers: built 7598 rotamers at 114 positions.
core.pack.pack_rotamers: Requesting all available threads for interaction graph computation.
core.pack.interaction_graph.interaction_graph_factory: Instantiating DensePDInteractionGraph
core.pack.rotamer_set.RotamerSets: Completed interaction graph pre-calculation in 1 available threads (1 had been requested).
