In [2]:
import os
import glob
from Bio.PDB import PPBuilder
from Bio.PDB import PDBParser, PDBIO

In [None]:
def split_pdb_by_chain(input_dir, output_dir):
    """
    Split each PDB file in the input directory into one file per chain.

    Every file in ``input_dir`` whose name ends with ``.pdb`` is parsed and
    each chain of each model is written to ``output_dir`` as
    ``<name>_Chain_<chain id>.pdb``. When a structure contains more than one
    model, the files are named ``<name>_Model_<model id>_Chain_<chain id>.pdb``
    instead, so that chains sharing an id across models no longer overwrite
    one another.

    Parameters:
    - input_dir: Directory containing the PDB files.
    - output_dir: Directory where the split PDB files will be saved
      (created if it does not exist).
    """
    parser = PDBParser(QUIET=True)
    io = PDBIO()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so files are processed (and reported) in a reproducible order.
    for pdb_file in sorted(os.listdir(input_dir)):
        if not pdb_file.endswith('.pdb'):
            continue

        stem = pdb_file[:-4]  # file name without the ".pdb" extension
        pdb_path = os.path.join(input_dir, pdb_file)
        structure = parser.get_structure(stem, pdb_path)

        # Only tag file names with the model when it is needed to keep them
        # unique; single-model structures keep the original naming scheme.
        multi_model = len(structure) > 1

        for model in structure:
            model_tag = f"_Model_{model.id}" if multi_model else ""

            for chain in model:
                output_filename = f"{stem}{model_tag}_Chain_{chain.id}.pdb"
                output_path = os.path.join(output_dir, output_filename)

                # Save the current chain to a new PDB file
                io.set_structure(chain)
                io.save(output_path)
                print(f"Saved {output_filename}")

# Locations used by the chain-splitting step.
input_directory = 'pdb_files/'       # where the original PDB files are read from
output_directory = 'split_chains/'   # where the per-chain PDB files are written

# Run the split; arguments are passed by keyword to make the roles explicit.
split_pdb_by_chain(
    input_dir=input_directory,
    output_dir=output_directory,
)

In [None]:
# Extract the amino-acid sequence of every single-chain PDB file in INPUT_DIR
# and write it to OUTPUT_DIR as "<name>.fa" (one FASTA record per file).
INPUT_DIR = "split_chains/"            # or your `output_dir`
OUTPUT_DIR = "sequences/"
FASTA_LINE_WIDTH = 10                  # residues written per sequence line

os.makedirs(OUTPUT_DIR, exist_ok=True)

parser = PDBParser(QUIET=True)
ppb = PPBuilder()

# Sorted so files are processed (and reported) in a reproducible order.
for pdb_file in sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdb"))):
    structure = parser.get_structure("struct", pdb_file)

    # Use the first chain of the first model. The defaults keep a file with
    # no model or no chain from raising StopIteration and aborting the batch.
    model = next(structure.get_models(), None)
    chain = next(model.get_chains(), None) if model is not None else None

    # The peptide fragments returned by PPBuilder are concatenated directly;
    # no gap marker is inserted between fragments.
    seq = ""
    if chain is not None:
        seq = "".join(str(pp.get_sequence()) for pp in ppb.build_peptides(chain))

    if not seq:
        print(f"No sequence found in {pdb_file}. Skipping...")
        continue

    # splitext strips only the extension; str.replace would also remove any
    # ".pdb" that happens to occur elsewhere in the file name.
    base_name = os.path.splitext(os.path.basename(pdb_file))[0]
    fasta_file = os.path.join(OUTPUT_DIR, f"{base_name}.fa")

    with open(fasta_file, "w") as out_f:
        out_f.write(f">{base_name}\n")
        out_f.writelines(
            seq[i:i + FASTA_LINE_WIDTH] + "\n"
            for i in range(0, len(seq), FASTA_LINE_WIDTH)
        )

    print(f" Extracted sequence for {base_name} -> {fasta_file}")

# Summary: counts every .fa file present in OUTPUT_DIR, including any that
# were already there before this run.
fasta_files = glob.glob(os.path.join(OUTPUT_DIR, "*.fa"))
print(f" Found {len(fasta_files)} FASTA files in {OUTPUT_DIR}.")