<a href="https://colab.research.google.com/github/Siddhartha96123/GCHS/blob/main/GCHS_CHAIN_EXTRACT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hello Fellow Curious Human.

This is a simple tool that splits both your reference and target PDBs into their constituent chains (one PDB file per chain) and lets you download the results as ZIP files. Please follow the steps below in order.

STEP 1. Sort out all computational pre-requisites.

In [None]:
!pip install biopython

Now for the core step: splitting the PDB files themselves.

Please run the code cell below and, when prompted, upload the reference and target PDBs that need to be split into their constituent chains.

In [None]:
import os
from google.colab import files
import shutil
import time  # For adding delay between downloads

def extract_protein_chains(pdb_file, output_dir):
    """
    Extract protein chains from a PDB file and save them as separate PDB files in the specified output directory.

    Only lines starting with ``ATOM`` are considered. Each such line is grouped
    by the chain identifier in column 22 and written, in the order it appears
    in the input, to ``chain_<id>.pdb`` inside ``output_dir``. ``ATOM`` lines
    that are too short to contain a chain identifier are skipped instead of
    raising ``IndexError``.

    :param pdb_file: Path to the input PDB file
    :param output_dir: Directory where the chain PDB files will be saved (created if it does not exist)
    :return: Number of chains extracted from the PDB file
    """
    chain_column = 21  # Chain identifier is in column 22 (index 21 in 0-indexed)
    chains = {}

    # Collect the ATOM records of every chain, preserving file order
    with open(pdb_file, 'r') as f:
        for line in f:
            if not line.startswith("ATOM"):
                continue
            # A truncated record has no chain column; indexing it would either
            # raise IndexError or pick up the trailing newline as a "chain".
            if len(line.rstrip("\r\n")) <= chain_column:
                continue
            chains.setdefault(line[chain_column], []).append(line)

    # Create a directory to store the chain PDB files
    os.makedirs(output_dir, exist_ok=True)

    # Write each chain's data to a separate PDB file
    for chain_id, chain_data in chains.items():
        pdb_filename = os.path.join(output_dir, f"chain_{chain_id}.pdb")
        # Distinct handle name so the ``pdb_file`` parameter is not shadowed
        with open(pdb_filename, 'w') as chain_file:
            chain_file.writelines(chain_data)
        print(f"Chain {chain_id} saved as {pdb_filename}")

    # Return the number of chains extracted
    return len(chains)

def _upload_single_pdb(role):
    """
    Run one upload dialog and return the name of the first uploaded file.

    :param role: Human-readable role of the file ("reference" or "target"), used in the error message
    :return: Name of the first file in the upload result
    :raises ValueError: If the upload result contains no files
    """
    uploaded_files = files.upload()
    if not uploaded_files:
        # Without this guard, picking the first key would fail with an opaque IndexError
        raise ValueError(f"No {role} PDB file was uploaded.")
    # Only the first uploaded file is used; any additional files are ignored
    return next(iter(uploaded_files))


def upload_pdb_files():
    """
    Allows the user to upload the reference and target PDB files.

    Two upload dialogs are shown in sequence: first for the reference
    structure, then for the target structure.

    :return: Tuple ``(ref_pdb, target_pdb)`` with the names of the uploaded files
    :raises ValueError: If either upload yields no file
    """
    print("Please upload the reference PDB file:")
    ref_pdb = _upload_single_pdb("reference")
    print(f"Reference PDB file uploaded: {ref_pdb}")

    print("\nPlease upload the target PDB file:")
    target_pdb = _upload_single_pdb("target")
    print(f"Target PDB file uploaded: {target_pdb}")

    return ref_pdb, target_pdb

def process_pdb_files():
    """
    Run the full workflow: upload, split into chains, zip, and download.

    The reference and target PDB files are uploaded, each is split into
    per-chain PDB files in its own output directory, both directories are
    archived as ZIP files, and the two archives are offered for download.
    """
    # Upload PDB files
    ref_pdb, target_pdb = upload_pdb_files()

    ref_output_dir = "ref_chains_pdb_files"
    target_output_dir = "target_chains_pdb_files"

    # Clear results of any earlier run so the ZIPs contain only the chains of
    # the files uploaded now (re-running the cell would otherwise mix them).
    for stale_dir in (ref_output_dir, target_output_dir):
        shutil.rmtree(stale_dir, ignore_errors=True)

    # Extract chains for the reference PDB file
    ref_chain_count = extract_protein_chains(ref_pdb, ref_output_dir)
    print(f"\n{ref_chain_count} chains extracted from the reference PDB file.\n")

    # Extract chains for the target PDB file
    target_chain_count = extract_protein_chains(target_pdb, target_output_dir)
    print(f"{target_chain_count} chains extracted from the target PDB file.\n")

    # Zip the output directories for downloading; make_archive returns the
    # path of the archive it created.
    ref_zip = shutil.make_archive(ref_output_dir, 'zip', ref_output_dir)
    target_zip = shutil.make_archive(target_output_dir, 'zip', target_output_dir)

    # Provide download links for the user
    print("Preparing to download ZIP files...\n")

    files.download(ref_zip)

    # Adding a short delay before the second download to ensure both downloads work
    time.sleep(2)  # You can adjust this time if necessary

    files.download(target_zip)

# Run the processing: executing this cell starts the whole workflow
# (upload both PDB files, split them into chains, zip, and download).
process_pdb_files()


