In [None]:
#Download the PDB listed in the Excel file
import os
import pandas as pd
import requests

# Function to download PDB file
def download_pdb(pdb_code, save_directory, timeout=30):
    """Download one PDB entry from RCSB and save it as ``<pdb_code>.pdb``.

    Args:
        pdb_code: PDB identifier used to build the download URL and file name.
        save_directory: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answered with HTTP 200 and the file was written,
        False on any other status code or on a network error.
    """
    download_url = f"https://files.rcsb.org/download/{pdb_code}.pdb"
    try:
        # Without a timeout a single stalled connection blocks the whole batch.
        response = requests.get(download_url, timeout=timeout)
    except requests.RequestException as error:
        # Report network problems as a failed download so the caller can
        # record the ID instead of the loop aborting.
        print(f"Failed to download {pdb_code}.pdb ({error})")
        return False

    if response.status_code != 200:
        print(f"Failed to download {pdb_code}.pdb")
        return False

    save_path = os.path.join(save_directory, f"{pdb_code}.pdb")
    with open(save_path, "wb") as file:
        file.write(response.content)
    print(f"Downloaded {pdb_code}.pdb")
    return True

# Load the spreadsheet that lists the structures to fetch
excel_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Mn.xlsx"
df = pd.read_excel(excel_file)

# One download is attempted per entry of the "PDB ID" column
pdb_codes = df["PDB ID"].tolist()

# Target folder for the .pdb files, created if it does not exist yet
pdb_directory = "L:/Zn-installer_rawdata/241111_Mn_Final/"
os.makedirs(pdb_directory, exist_ok=True)

# IDs whose download did not succeed
failed_pdb_ids = []

# Try every ID in order; only the failures are remembered
for pdb_code in pdb_codes:
    downloaded = download_pdb(pdb_code, pdb_directory)
    if downloaded:
        continue
    failed_pdb_ids.append(pdb_code)

# Write the failed IDs to their own workbook
failed_excel_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Mn_failed.xlsx"
failed_df = pd.DataFrame({"PDB ID": failed_pdb_ids})
failed_df.to_excel(failed_excel_file, index=False)
print(f"Failed PDB IDs saved to {failed_excel_file}")

In [None]:
#Classify the PDB files based on the number of manganese ions in the asymmetric unit
import os
import pandas as pd
from Bio.PDB import PDBParser
from collections import defaultdict
import shutil

def categorize_metal_atoms(filename):
    """Split the atoms of one PDB file into Mn atoms and other metal atoms.

    Every atom of every model, chain and residue is inspected and sorted by
    its ``element`` attribute.

    Args:
        filename: Path to the PDB file. The text before the first '.' of the
            base name is used as the structure id.

    Returns:
        A ``(manganese_atoms, other_metal_atoms)`` tuple of atom lists.
    """
    # 'MN' is matched before this tuple is consulted, so it is not listed here.
    non_mn_metals = ('ZN', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI',
                     'CD', 'YB', 'NI', 'PR', 'HG')

    structure_id = os.path.basename(filename).split('.')[0]
    structure = PDBParser(QUIET=True).get_structure(structure_id, filename)

    manganese_atoms, other_metal_atoms = [], []

    # NOTE(review): atoms are gathered from every model, so a multi-model
    # file contributes one hit per model -- confirm this is intended.
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    symbol = atom.element
                    if symbol == 'MN':
                        manganese_atoms.append(atom)
                        continue
                    if symbol in non_mn_metals:
                        other_metal_atoms.append(atom)

    return manganese_atoms, other_metal_atoms

def save_metal_atoms_to_excel(directory):
    """Write one Excel row per metal atom found in the .pdb files of a folder.

    Two workbooks are created inside ``directory``: 'mn_atoms2.xlsx' with one
    row per Mn atom and 'other_metals2.xlsx' with one row per other metal
    atom. Both have the columns 'PDB ID' and 'Name of metal atoms'.

    Args:
        directory: Folder that is scanned for files ending in '.pdb'.

    Returns:
        Path of the Mn workbook.
    """
    column_names = ['PDB ID', 'Name of metal atoms']
    mn_rows = []
    other_rows = []

    for file_name in [f for f in os.listdir(directory) if f.endswith('.pdb')]:
        entry_id = file_name.split('.')[0]
        mn_atoms, other_metals = categorize_metal_atoms(os.path.join(directory, file_name))
        # One ('<id>', 'MN') row for each Mn atom of this file
        mn_rows.extend((entry_id, 'MN') for _ in mn_atoms)
        other_rows.extend((entry_id, atom.element) for atom in other_metals)

    output_file_mn_atoms = os.path.join(directory, 'mn_atoms2.xlsx')
    output_file_other_metals = os.path.join(directory, 'other_metals2.xlsx')

    pd.DataFrame(mn_rows, columns=column_names).to_excel(output_file_mn_atoms, index=False)
    pd.DataFrame(other_rows, columns=column_names).to_excel(output_file_other_metals, index=False)

    print(f"MN atoms categorized have been saved to '{output_file_mn_atoms}'.")
    print(f"Other metals categorized have been saved to '{output_file_other_metals}'.")

    return output_file_mn_atoms

def count_and_categorize_mn_atoms(excel_file, pdb_directory):
    """Count Mn rows per PDB ID and copy each PDB file into a count-based folder.

    The workbook is expected to have the columns 'PDB ID' and
    'Name of metal atoms'. The per-ID counts are saved to
    'mn_count_results.xlsx' inside ``pdb_directory``. Each ``<id>.pdb`` found
    there is then copied to 'metal_count_1_mn2' (exactly one Mn row) or
    'metal_count_greater_than_1_mn2' (more than one).

    Args:
        excel_file: Workbook with one row per metal atom.
        pdb_directory: Folder holding the .pdb files; outputs go here too.
    """
    table = pd.read_excel(excel_file)

    # Tally the rows labelled 'MN' for every PDB ID, keeping first-seen order
    mn_counts_dict = {}
    for pdb_id, metal_atom_name in zip(table['PDB ID'], table['Name of metal atoms']):
        if metal_atom_name == 'MN':
            mn_counts_dict[pdb_id] = mn_counts_dict.get(pdb_id, 0) + 1

    output_excel = os.path.join(pdb_directory, 'mn_count_results.xlsx')
    pd.DataFrame(
        list(mn_counts_dict.items()), columns=['PDB ID', 'Metal Count']
    ).to_excel(output_excel, index=False)
    print(f"Metal atom counts have been saved to '{output_excel}'.")

    # Copy every counted structure into the folder matching its Mn count
    for pdb_id, count in mn_counts_dict.items():
        pdb_file = os.path.join(pdb_directory, f"{pdb_id}.pdb")
        if not os.path.isfile(pdb_file):
            print(f"Warning: PDB file {pdb_file} not found.")
            continue

        if count < 1:
            continue
        folder_name = 'metal_count_1_mn2' if count == 1 else 'metal_count_greater_than_1_mn2'
        destination_dir = os.path.join(pdb_directory, folder_name)

        os.makedirs(destination_dir, exist_ok=True)
        shutil.copy(pdb_file, os.path.join(destination_dir, f"{pdb_id}.pdb"))
        print(f"File '{pdb_id}.pdb' copied to '{destination_dir}'.")

# Specify the directories
# This folder is both the input (its *.pdb files are scanned) and the output
# location: the Excel reports and the 'metal_count_*_mn2' sub-folders are
# created inside it.
pdb_directory = 'L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/'

# Save categorized metal atoms to Excel and get the path of the Excel file
mn_atoms_excel_file = save_metal_atoms_to_excel(pdb_directory)

# Count Mn atoms based on the Excel file and categorize PDB files
count_and_categorize_mn_atoms(mn_atoms_excel_file, pdb_directory)

In [None]:
#Classification of mononuclear from heteronuclear proteins
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser

def calculate_distance(atom1, atom2):
    """Return the Euclidean distance between the ``coord`` of two atoms.

    Both ``coord`` attributes must unpack into exactly three components.
    """
    ax, ay, az = atom1.coord
    bx, by, bz = atom2.coord
    squared_sum = (ax - bx)**2 + (ay - by)**2 + (az - bz)**2
    return squared_sum**0.5

def categorize_pdb_files(directory, distance_threshold=5.0):
    """Sort single-Mn PDB files into 'mono' and 'mono-hetero' groups.

    A file is 'mono-hetero' when its only Mn atom has another metal atom
    within ``distance_threshold``; otherwise it is 'mono'. Files that do not
    contain exactly one Mn atom are reported and left out of both groups
    (previously they were silently labelled 'mono'). The two ID lists are
    handed to ``save_results``.

    Args:
        directory: Folder that is scanned for files ending in '.pdb'.
        distance_threshold: Maximum Mn-metal distance (same unit as the atom
            coordinates) for the other metal to count as a neighbour.
    """
    # Element symbols treated as "other metals"; Mn is handled separately.
    # NOTE(review): 'CU' is absent here although the later multi-Mn cells
    # include it -- confirm whether copper should be considered as well.
    other_metals = {'ZN', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI',
                    'CD', 'YB', 'NI', 'PR', 'HG'}

    mono_hetero_results = []
    mono_results = []

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        try:
            pdb_id = pdb_file.split('.')[0]
            structure = parser.get_structure(pdb_id, os.path.join(directory, pdb_file))

            # Store Mn and other metal atoms
            manganese_atoms = []
            other_metal_atoms = []
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element
                            if element_symbol == 'MN':
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metals:
                                other_metal_atoms.append(atom)

            # The mono / mono-hetero distinction is only defined for a single
            # Mn atom, so anything else must not end up in either list.
            if len(manganese_atoms) != 1:
                print(f"Skipping {pdb_file}: expected exactly one Mn atom, "
                      f"found {len(manganese_atoms)}.")
                continue

            manganese_atom = manganese_atoms[0]
            has_other_metals_near_manganese = any(
                calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                for metal_atom in other_metal_atoms
            )

            if has_other_metals_near_manganese:
                mono_hetero_results.append(pdb_id)
            else:
                mono_results.append(pdb_id)

        except Exception as e:
            # Best effort: a file that cannot be parsed is reported and skipped.
            print(f"Error processing {pdb_file}: {e}")

    # Save results to separate Excel files
    save_results(mono_hetero_results, mono_results, directory)

def save_results(mono_hetero_results, mono_results, directory):
    """Write the two ID lists to Excel and copy the matching PDB files.

    Creates the sub-folders 'mono_hetero2' and 'mono2' inside ``directory``,
    stores each ID list as a one-column workbook ('PDB ID') in its folder and
    copies ``<id>.pdb`` from ``directory`` into that folder when the source
    file exists. Any error is printed instead of being raised.
    """
    output_dir_mono_hetero = os.path.join(directory, 'mono_hetero2')
    output_dir_mono = os.path.join(directory, 'mono2')

    # (IDs, destination folder, workbook name) -- handled in this order
    groups = (
        (mono_hetero_results, output_dir_mono_hetero, 'mono_hetero2.xlsx'),
        (mono_results, output_dir_mono, 'mono2.xlsx'),
    )

    try:
        # Folders first, so both exist before anything is written
        for _, folder, _ in groups:
            os.makedirs(folder, exist_ok=True)

        # Build both tables, then write both workbooks
        tables = [pd.DataFrame(ids, columns=['PDB ID']) for ids, _, _ in groups]
        for table, (_, folder, workbook) in zip(tables, groups):
            table.to_excel(os.path.join(folder, workbook), index=False)

        # Finally copy the structures that are present in the source folder
        for ids, folder, _ in groups:
            for pdb_id in ids:
                src_file = os.path.join(directory, f"{pdb_id}.pdb")
                if os.path.isfile(src_file):
                    shutil.copy(src_file, os.path.join(folder, f"{pdb_id}.pdb"))

        print("PDB files have been copied to the corresponding folders.")
        print(f"Results categorized as 'mono_hetero' have been saved to '{output_dir_mono_hetero}/mono_hetero2.xlsx'.")
        print(f"Results categorized as 'mono' have been saved to '{output_dir_mono}/mono2.xlsx'.")

    except PermissionError as e:
        print(f"PermissionError: {e}")
    except Exception as e:
        print(f"Error creating directories or saving results: {e}")

# Specify the directory containing the PDB files
# NOTE(review): the Mn-count classification cell in this notebook writes its
# single-Mn files to 'L:/.../Final_3/metal_count_1_mn2', whereas this path
# points at drive J: and a folder named 'metal_count_1' -- confirm that this
# is the intended input folder.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_1/'

# Categorize and copy PDB files based on the presence of Mn and other metals within 5 Å
# (5.0 is the default distance_threshold of categorize_pdb_files)
categorize_pdb_files(pdb_directory)

In [None]:
#Classify structures with more than one Mn (M>1) by whether their Mn ions are isolated (mononuclear) or sit in homo-/heteronuclear sites
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser
import numpy as np

# Function to calculate the distance between two atoms
def calculate_distance(atom1, atom2):
    """Return the norm of the difference between the two atoms' coordinates."""
    offset = atom1.get_coord() - atom2.get_coord()
    return np.linalg.norm(offset)

# Function to find and categorize Mn atoms based on their proximity to other metal ions (including other Mn atoms)
def categorize_pdb_files(directory, distance_threshold=5.0):
    """Classify PDB files by how many of their Mn atoms are isolated.

    An Mn atom is "isolated" when no other metal atom (another Mn included)
    lies within ``distance_threshold`` of it. Each file ends up in one of
    three lists, which are passed to ``save_results``:

    * all_isolated -- every Mn atom is isolated
    * mixed        -- some, but not all, Mn atoms are isolated
    * other        -- no Mn atom is isolated

    Args:
        directory: Folder that is scanned for files ending in '.pdb'.
        distance_threshold: Maximum distance (same unit as the atom
            coordinates) for two metals to count as neighbours.
    """
    manganese_ion = 'MN'
    # Element symbols are compared after .strip(), so potassium has to be
    # listed as 'K'; the former 'K ' entry could never match and potassium
    # neighbours were ignored. Mn itself is matched separately above.
    other_metals = {'CU', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI',
                    'CD', 'YB', 'NI', 'PR', 'HG', 'ZN'}

    all_isolated_results = []
    mixed_results = []
    other_results = []

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        try:
            pdb_id = pdb_file.split('.')[0]
            structure = parser.get_structure(pdb_id, os.path.join(directory, pdb_file))
            manganese_atoms = []
            other_metal_atoms = []

            # Collect all Mn atoms and other metal atoms in the structure
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element.strip()
                            if element_symbol == manganese_ion:
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metals:
                                other_metal_atoms.append(atom)

            # Every metal atom is a potential neighbour of every Mn atom
            all_metal_atoms = manganese_atoms + other_metal_atoms

            isolated_mn_count = 0
            for manganese_atom in manganese_atoms:
                has_neighbour = any(
                    metal_atom != manganese_atom  # never compare an atom with itself
                    and calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                    for metal_atom in all_metal_atoms
                )
                if not has_neighbour:
                    isolated_mn_count += 1

            # Classify the file based on the number of isolated Mn atoms.
            # A file without any Mn atom also satisfies the first test (0 == 0).
            if isolated_mn_count == len(manganese_atoms):
                all_isolated_results.append(pdb_id)
                print(f"{pdb_id} classified as all_isolated.")
            elif isolated_mn_count > 0:
                mixed_results.append(pdb_id)
                print(f"{pdb_id} classified as mixed.")
            else:
                other_results.append(pdb_id)
                print(f"{pdb_id} classified as other.")

        except Exception as e:
            # Best effort: a file that cannot be processed is reported and skipped.
            print(f"Error processing {pdb_file}: {e}")

    # Save results
    save_results(all_isolated_results, mixed_results, other_results, directory)

# Function to save categorized results to Excel and copy PDB files
def save_results(all_isolated_results, mixed_results, other_results, directory):
    """Store the three ID lists as workbooks and copy the matching PDB files.

    For each of the categories 'all_isolated', 'mixed' and 'other' a
    sub-folder of that name is created in ``directory``. It receives
    '<category>.xlsx' (one 'PDB ID' column) and a copy of every ``<id>.pdb``
    that exists in ``directory``.
    """
    categories = (
        ('all_isolated', all_isolated_results),
        ('mixed', mixed_results),
        ('other', other_results),
    )
    folders = {label: os.path.join(directory, label) for label, _ in categories}

    # Create directories for storing results
    for folder in folders.values():
        os.makedirs(folder, exist_ok=True)

    # Save results to Excel files
    for label, ids in categories:
        workbook = os.path.join(folders[label], f"{label}.xlsx")
        pd.DataFrame(ids, columns=['PDB ID']).to_excel(workbook, index=False)

    # Copy corresponding PDB files to their respective directories
    for label, ids in categories:
        for pdb_id in ids:
            src_file = os.path.join(directory, f"{pdb_id}.pdb")
            if os.path.isfile(src_file):
                shutil.copy(src_file, folders[label])

    print(f"Results saved: {len(all_isolated_results)} in 'all_isolated', {len(mixed_results)} in 'mixed', {len(other_results)} in 'other'.")

# Main script execution
# NOTE(review): the Mn-count classification cell in this notebook writes its
# multi-Mn files to 'L:/.../Final_3/metal_count_greater_than_1_mn2', whereas
# this path points at drive J: and 'metal_count_greater_than_1' -- confirm
# that this is the intended input folder.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/'
# Uses the default distance_threshold of 5.0
categorize_pdb_files(pdb_directory)

In [None]:
#Extract the mononuclear binding sites from homo and hetero nuclear
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser, PDBIO, Select
import numpy as np

# Function to calculate the distance between two atoms
def calculate_distance(atom1, atom2):
    """Return the Euclidean norm of the vector between the two atoms."""
    first_position = atom1.get_coord()
    second_position = atom2.get_coord()
    return np.linalg.norm(first_position - second_position)

# Function to process mixed PDB files and remove non-isolated Mn atoms
def process_mixed_pdb_files(directory, distance_threshold=5.0):
    """Rewrite each PDB file so that only its isolated Mn atoms remain.

    An Mn atom is "isolated" when no other metal atom (another Mn included)
    lies within ``distance_threshold`` of it. For every file with at least
    one isolated Mn atom the structure is saved to ``<id>.pdb`` inside
    ``directory`` -- i.e. normally over the input file -- with all
    non-isolated Mn atoms removed. Files without an isolated Mn atom are left
    untouched.

    Args:
        directory: Folder that is scanned for files ending in '.pdb'.
        distance_threshold: Maximum distance (same unit as the atom
            coordinates) for two metals to count as neighbours.
    """
    manganese_ion = 'MN'
    # Element symbols are compared after .strip(), so potassium has to be
    # listed as 'K'; the former 'K ' entry could never match and potassium
    # neighbours were ignored. Mn itself is matched separately.
    other_metals = {'CU', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI',
                    'CD', 'YB', 'NI', 'PR', 'HG', 'ZN'}

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        pdb_id = pdb_file.split('.')[0]
        pdb_file_path = os.path.join(directory, pdb_file)
        try:
            structure = parser.get_structure(pdb_id, pdb_file_path)
            manganese_atoms = []
            other_metal_atoms = []

            # Collect all Mn atoms and other metal atoms in the structure
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element.strip()
                            if element_symbol == manganese_ion:
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metals:
                                other_metal_atoms.append(atom)

            # Every metal atom is a potential neighbour of every Mn atom
            all_metal_atoms = manganese_atoms + other_metal_atoms

            # Keep the Mn atoms that have no metal neighbour within range
            isolated_manganese_atoms = [
                manganese_atom
                for manganese_atom in manganese_atoms
                if not any(
                    metal_atom != manganese_atom  # never compare an atom with itself
                    and calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                    for metal_atom in all_metal_atoms
                )
            ]

            # Save PDB file with only isolated Mn atoms
            if isolated_manganese_atoms:
                io = PDBIO()
                io.set_structure(structure)
                io.save(os.path.join(directory, f"{pdb_id}.pdb"), select=IsolatedMnSelector(isolated_manganese_atoms))

        except Exception as e:
            # Best effort: a file that cannot be processed is reported and skipped.
            print(f"Error processing {pdb_file}: {e}")

# Selector class to filter isolated Mn atoms
class IsolatedMnSelector(Select):
    """Atom filter for PDBIO: keep non-Mn atoms and the listed Mn atoms only."""

    def __init__(self, isolated_atoms):
        # Mn atoms that are allowed to stay in the written file
        self.isolated_atoms = isolated_atoms

    def accept_atom(self, atom):
        """Return True if ``atom`` is a listed atom or is not an Mn atom."""
        if atom in self.isolated_atoms:
            return True
        return atom.element.strip() != 'MN'

# Main script execution
# Caution: process_mixed_pdb_files saves each result as '<id>.pdb' inside this
# same folder, so the input files here are overwritten.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/'
# Uses the default distance_threshold of 5.0
process_mixed_pdb_files(pdb_directory)


In [None]:
import os
import pandas as pd
from Bio import PDB
import numpy as np
import logging

# --- Constants ---
# Input/Output Configuration
# Folder scanned for *.pdb files and folder receiving the Excel output
# (both currently point at the same location).
PDB_DIRECTORY = "D:/250414_Final/Mn/2His_1Asp"
OUTPUT_DIR = "D:/250414_Final/Mn/2His_1Asp"
# Changed prefix to indicate the format change
# Output files are named '<prefix><coordination number>.xlsx'.
OUTPUT_FILENAME_PREFIX = "original_format_output_"

# Biochemical Constants
# Residue name of the metal ion; also used as the atom name inside that residue.
METAL_ID = 'MN'
# Residue names that may be reported as ligands.
# NOTE(review): ILE, LEU, PRO and TRP are not listed -- confirm this is intended.
ALLOWED_RESIDUES = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
    'LYS', 'MET', 'PHE', 'SER', 'THR', 'TYR', 'VAL'
}

# Distance thresholds (in Angstroms) for MN coordination [min_dist, max_dist]
# Using the dictionary provided in the script
# Keys are '<residue name>_<atom name>'. An atom without a key is never
# counted as coordinating: get_distance_threshold returns (0, 0) for it and
# the caller skips thresholds whose max_dist is <= 0.
# NOTE(review): PHE and VAL are in ALLOWED_RESIDUES but have no entry here,
# and CYS/MET only list their sulfur atom (no backbone N/O) -- confirm.
MN_DISTANCE_THRESHOLDS = {
    'HIS_ND1': (0, 2.75), 'HIS_NE2': (0, 2.75), 'HIS_N': (0, 2.75), 'HIS_O': (0, 2.85),
    'GLU_OE1': (0, 2.85), 'GLU_OE2': (0, 2.85), 'GLU_N': (0, 2.75), 'GLU_O': (0, 2.85),
    'ASP_OD1': (0, 2.85), 'ASP_OD2': (0, 2.85), 'ASP_N': (0, 2.75), 'ASP_O': (0, 2.85),
    'ALA_N': (0, 2.75),   'ALA_O': (0, 2.85),
    'CYS_SG': (0, 2.75),
    'MET_SD': (0, 2.75),
    'ARG_NH1': (0, 2.75), 'ARG_NH2': (0, 2.75), 'ARG_NE': (0, 2.75), 'ARG_N': (0, 2.75), 'ARG_O': (0, 2.85),
    'ASN_OD1': (0, 2.85), 'ASN_ND2': (0, 2.75), 'ASN_N': (0, 2.75), 'ASN_O': (0, 2.85),
    'GLN_OE1': (0, 2.85), 'GLN_NE2': (0, 2.75), 'GLN_N': (0, 2.75), 'GLN_O': (0, 2.85),
    'GLY_N': (0, 2.75),   'GLY_O': (0, 2.85),
    'LYS_NZ': (0, 2.75),  'LYS_N': (0, 2.75), 'LYS_O': (0, 2.85),
    'SER_OG': (0, 2.85),  'SER_N': (0, 2.75), 'SER_O': (0, 2.85),
    'THR_OG1': (0, 2.85), 'THR_N': (0, 2.75), 'THR_O': (0, 2.85),
    'TYR_OH': (0, 2.85),  'TYR_N': (0, 2.75), 'TYR_O': (0, 2.85),
}

# Define base columns matching the desired "original" output structure (image)
# 8 columns before binding atoms start
# The first five describe the metal ion, the last three the ligand residue;
# 'Binding atomN' columns are appended after these when the data is saved.
BASE_OUTPUT_COLUMNS = [
    'Entry ID', 'PDB ID', 'Metal Chain ID', 'Metal Residue number', 'Metal',
    'Chain ID', 'Residue number', 'Residue name' # These refer to the specific ligand in the row
]

# --- Helper Functions (Keep as they are in the provided script) ---

def calculate_distance(atom1, atom2):
    """Return the Euclidean distance between two atoms, or infinity on error.

    Both arguments must provide ``get_coord()``. Any exception raised while
    computing the distance is logged and ``float('inf')`` is returned instead.
    """
    try:
        delta = atom1.get_coord() - atom2.get_coord()
        squared_length = np.sum(delta * delta)
        return np.sqrt(squared_length)
    except Exception as e:
        # The PDB id is not known at this level, so only the atoms are logged
        logging.error(f"Error calculating distance between {atom1} and {atom2}: {e}")
        # Infinity can never fall inside a distance window
        return float('inf')

def get_distance_threshold(residue_name, atom_name, thresholds):
    """Return the ``(min, max)`` distance window for one ligand atom.

    The window is looked up under the key ``'<residue_name>_<atom_name>'``.
    The previous CYS/MET special cases repeated this same lookup and could
    never be reached, so a single lookup is equivalent.

    Args:
        residue_name: Residue name of the ligand, e.g. 'HIS'.
        atom_name: Atom name inside that residue, e.g. 'NE2'.
        thresholds: Mapping from '<residue>_<atom>' keys to (min, max) tuples.

    Returns:
        The stored tuple, or ``(0, 0)`` when the mapping has no such key.
    """
    return thresholds.get(f"{residue_name}_{atom_name}", (0, 0))


# --- MODIFIED Core Logic Function ---
# Renamed to avoid confusion, but uses the same internal logic as the provided script's find_coordination_sites
# The CHANGE is in how the final data is structured and returned.
def find_coordination_sites_for_original_format(structure, pdb_id, metal_atom_name, thresholds, allowed_ligands):
    """Find the residues coordinating each metal ion, one row per metal-ligand pair.

    A residue named ``metal_atom_name`` is treated as a metal ion. Every atom
    of every residue in ``allowed_ligands`` is tested against the distance
    window returned by ``get_distance_threshold``; a residue with at least one
    atom inside its window is a ligand of that metal. The coordination number
    of a metal is the number of such residues (not the number of atoms).

    Args:
        structure: Parsed structure, iterable as model -> chain -> residue -> atom.
        pdb_id: Entry id (taken from the file name by the caller).
        metal_atom_name: Residue name and atom name of the metal, e.g. 'MN'.
        thresholds: Mapping '<residue>_<atom>' -> (min_dist, max_dist).
        allowed_ligands: Collection of residue names accepted as ligands.

    Returns:
        A dictionary where keys are coordination numbers and values are lists
        of tuples. Each tuple represents ONE metal-ligand interaction:
        (entry_id, pdb_code, metal_chain, metal_resnum, metal_name,
         ligand_chain, ligand_resnum, ligand_name, atom1, atom2, ...)
    """
    coordination_sites_by_number = {}

    # The header dictionary built by Bio.PDB carries an 'idcode' entry that is
    # an empty string when the file has no HEADER record (e.g. files rewritten
    # with PDBIO), so a key-based default would never trigger. Fall back to
    # the file-derived id whenever the code is missing or empty.
    header = getattr(structure, 'header', None) or {}
    pdb_code = header.get('idcode') or pdb_id

    for model in structure:
        for chain in model:
            for metal_residue in chain:
                if metal_residue.get_resname() != metal_atom_name:
                    continue
                try:
                    metal_atom = metal_residue[metal_atom_name]
                except KeyError:
                    logging.warning(f"Metal residue {metal_residue.id} in {pdb_id} "
                                    f"doesn't contain a '{metal_atom_name}' atom. Skipping.")
                    continue

                metal_chain_id = metal_residue.get_parent().id
                metal_res_num = metal_residue.id[1]
                metal_name = metal_residue.get_resname()

                # Ligands of *this* metal, keyed by (chain id, residue number)
                # so that a residue met again in another model is merged.
                # NOTE(review): ligands are searched in every model, not only
                # in the metal's own model -- confirm this is intended for
                # multi-model files.
                coordinating_ligands_details = {}

                for ligand_model in structure:
                    for ligand_chain in ligand_model:
                        for ligand_residue in ligand_chain:
                            # Skip the metal itself and residue types that are not allowed
                            if ligand_residue == metal_residue or ligand_residue.get_resname() not in allowed_ligands:
                                continue

                            ligand_res_name = ligand_residue.get_resname()
                            ligand_chain_id = ligand_chain.id
                            ligand_res_num = ligand_residue.id[1]
                            residue_unique_id = (ligand_chain_id, ligand_res_num)

                            # Names of the atoms of this residue inside their window
                            coordinating_atoms_in_ligand = []
                            for ligand_atom in ligand_residue:
                                ligand_atom_name = ligand_atom.get_id()
                                min_dist, max_dist = get_distance_threshold(ligand_res_name, ligand_atom_name, thresholds)
                                if max_dist <= 0:
                                    continue  # no threshold defined for this atom
                                distance = calculate_distance(metal_atom, ligand_atom)
                                if min_dist <= distance <= max_dist:
                                    coordinating_atoms_in_ligand.append(ligand_atom_name)

                            if not coordinating_atoms_in_ligand:
                                continue

                            # First sighting creates the entry; later ones only add atoms
                            details = coordinating_ligands_details.setdefault(
                                residue_unique_id,
                                {
                                    "ligand_chain_id": ligand_chain_id,
                                    "ligand_res_num": ligand_res_num,
                                    "ligand_res_name": ligand_res_name,
                                    "binding_atoms": set(),
                                },
                            )
                            details["binding_atoms"].update(coordinating_atoms_in_ligand)

                coordination_number = len(coordinating_ligands_details)
                if coordination_number == 0:
                    continue

                # One output row per ligand; binding atoms sorted for stable output
                output_rows_for_this_metal = [
                    (
                        pdb_id,                              # Entry ID (from filename)
                        pdb_code,                            # PDB ID (from header)
                        metal_chain_id,                      # Metal Chain ID
                        metal_res_num,                       # Metal Residue number
                        metal_name,                          # Metal Name
                        ligand_details["ligand_chain_id"],   # Ligand Chain ID
                        ligand_details["ligand_res_num"],    # Ligand Residue number
                        ligand_details["ligand_res_name"],   # Ligand Residue name
                    ) + tuple(sorted(ligand_details["binding_atoms"]))
                    for ligand_details in coordinating_ligands_details.values()
                ]
                coordination_sites_by_number.setdefault(coordination_number, []).extend(output_rows_for_this_metal)

    return coordination_sites_by_number


# --- MODIFIED Saving Function ---
# Renamed function to clarify its purpose
def save_coordination_data_as_original_format(all_coordination_data, output_dir, file_prefix, base_columns):
    """Write one Excel file per coordination number, one row per metal-ligand pair.

    For every coordination number the rows are de-duplicated and sorted, the
    'Binding atomN' columns are sized to the longest row, and the table is
    saved as ``<file_prefix><coordination_number>.xlsx`` in ``output_dir``.
    Failures while saving are logged, not raised.

    Args:
        all_coordination_data: Mapping coordination number -> list of row tuples.
        output_dir: Folder the Excel files are written to.
        file_prefix: File name prefix placed before the coordination number.
        base_columns: Names of the leading fixed columns of every row.
    """
    if not all_coordination_data:
        logging.warning("No coordination data found to save.")
        return

    num_base_cols = len(base_columns)

    for coordination_number, sites_list in all_coordination_data.items():
        if not sites_list:
            logging.info(f"No sites found for coordination number {coordination_number}. Skipping file.")
            continue

        logging.info(f"Processing data for coordination number {coordination_number}...")

        # Drop exact duplicates; sorting makes the output order reproducible
        unique_sites_list = sorted(set(sites_list))

        # Longest binding-atom tail in this group. The list cannot be empty
        # here, so max() needs no empty-sequence handling.
        max_binding_atoms = max(len(site_tuple) - num_base_cols for site_tuple in unique_sites_list)

        # Define columns: base columns + Binding atomX columns
        columns = list(base_columns)
        columns.extend([f'Binding atom{i}' for i in range(1, max_binding_atoms + 1)])

        # Rows with fewer binding atoms than the maximum get empty cells
        try:
            df_coordination = pd.DataFrame(unique_sites_list, columns=columns).fillna(np.nan)
        except Exception as e:
            # Fall back to unnamed columns rather than losing the data
            logging.error(f"Error creating DataFrame for coord num {coordination_number} with specific columns: {e}. Using default columns.")
            df_coordination = pd.DataFrame(unique_sites_list)

        output_filename = f"{file_prefix}{coordination_number}.xlsx"
        output_excel_path = os.path.join(output_dir, output_filename)

        try:
            df_coordination.to_excel(output_excel_path, index=False)
            logging.info(f"Saved data for coordination number {coordination_number} to {output_excel_path}")
        except Exception as e:
            logging.error(f"Failed to save Excel file {output_excel_path}: {e}")

# --- Main Execution (Calls the modified functions) ---

def main():
    """Analyse every .pdb file in PDB_DIRECTORY and write the pooled results to Excel.

    Files that fail to parse or analyse are logged (with traceback) and skipped;
    the rows of all remaining files are pooled per coordination number and
    handed to save_coordination_data_as_original_format.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    pooled_sites = {}  # {coord_num: [metal-ligand row tuples from all files]}
    pdb_parser = PDB.PDBParser(QUIET=True)

    logging.info(f"Starting PDB analysis in directory: {PDB_DIRECTORY}")

    n_processed = 0
    for filename in os.listdir(PDB_DIRECTORY):
        # Only PDB files are analysed (extension check is case-insensitive)
        if not filename.lower().endswith('.pdb'):
            continue

        full_path = os.path.join(PDB_DIRECTORY, filename)
        pdb_id = filename.split('.')[0]
        logging.info(f"Processing {filename} (ID: {pdb_id})...")

        try:
            structure = pdb_parser.get_structure(pdb_id, full_path)
            per_file_sites = find_coordination_sites_for_original_format(
                structure, pdb_id, METAL_ID, MN_DISTANCE_THRESHOLDS, ALLOWED_RESIDUES
            )
            # Results arrive already grouped by coordination number; pool them
            for coord_num, rows in per_file_sites.items():
                pooled_sites.setdefault(coord_num, []).extend(rows)
            n_processed += 1
        except Exception as e:
            logging.error(f"Could not process file {filename}. Error: {e}", exc_info=True)

    logging.info(f"Finished processing {n_processed} PDB files.")

    logging.info("Saving aggregated results to Excel files (Original Row Format)...")
    save_coordination_data_as_original_format(
        pooled_sites, OUTPUT_DIR, OUTPUT_FILENAME_PREFIX, BASE_OUTPUT_COLUMNS
    )

    logging.info('Script execution completed.')


if __name__ == "__main__":
    main()

In [None]:
#Download the FASTA sequences of the PDB entries listed in the Excel file
import pandas as pd
import requests

# Load Excel file with PDB IDs (the sheet must contain a "PDB ID" column)
input_file = "/content/2His_1Glu_PDB_list.xlsx"  # Update with correct path
df = pd.read_excel(input_file)

# Extract PDB IDs: blank cells are dropped and duplicates collapsed, so each
# entry is requested only once
pdb_ids = df["PDB ID"].dropna().unique().tolist()

# Function to download FASTA from RCSB
def download_fasta(pdb_id):
    """Fetch the FASTA text of one PDB entry from RCSB.

    Args:
        pdb_id: Entry identifier inserted into the RCSB FASTA URL.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (non-200 status, timeout, or any other network error).
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    try:
        # Without a timeout a single stalled connection would hang the whole batch
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and carry on so one bad entry does not abort the remaining downloads
        print(f"❌ Failed to download {pdb_id}: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"❌ Failed to download {pdb_id}")
    return None

# Download FASTA sequences and concatenate them into a single file
fasta_file = "/content/2His_1Glu_coordination_3_pdb_list.fasta"  # Update path
with open(fasta_file, "w") as fasta_output:
    for pdb_id in pdb_ids:
        fasta_data = download_fasta(pdb_id)
        if fasta_data:
            # The downloaded text already carries its own ">" header lines
            # (the header-parsing cell reads the "|Chain(s) ..." field from them),
            # so it is written as-is. Prepending another ">{pdb_id}" line would
            # create a record with an empty sequence.
            if not fasta_data.endswith("\n"):
                fasta_data += "\n"  # keep consecutive entries on separate lines
            fasta_output.write(fasta_data)

print(f"✅ FASTA file saved to: {fasta_file}")

✅ FASTA file saved to: /content/2His_1Glu_coordination_3_pdb_list.fasta


In [None]:
#Parse the FASTA headers into one row per (PDB ID, chain ID) together with the chain sequence
import pandas as pd
from Bio import SeqIO

fasta_path = "/content/2His_1Glu_coordination_3_pdb_list.fasta"  # path to your FASTA file
parsed_output_path = "/content/2His_1Glu_coordination_3_pdb_list_parsed.xlsx"


def _chain_ids_from_header(full_header):
    """Return the chain IDs named in the second '|' field of a FASTA header.

    The field is expected to look like "Chain A" or "Chains A, B". An empty
    list is returned when the header has no such field.
    """
    fields = full_header.split("|")
    if len(fields) < 2:
        return []

    chain_info = fields[1].strip()
    for prefix in ("Chains ", "Chain "):
        if chain_info.startswith(prefix):
            # Strip only the leading keyword, never text further inside the field
            chain_info = chain_info[len(prefix):]
            break
    else:
        return []

    chain_ids = []
    for token in chain_info.split(","):
        token = token.strip()
        # NOTE(review): RCSB headers can list a chain as "A[auth B]"; presumably
        # B is the author chain ID that PDB-format files use, so B is kept.
        # Confirm against the chain-ID table this output is merged with.
        if token.endswith("]") and "[auth " in token:
            token = token[token.index("[auth ") + len("[auth "):-1].strip()
        chain_ids.append(token)
    return chain_ids


records = []
for record in SeqIO.parse(fasta_path, "fasta"):
    full_header = record.description
    # The entry ID is everything before the first "_" (e.g. "1ABC_1|..." -> "1ABC")
    pdb_id = full_header.split("_")[0]
    sequence = str(record.seq)

    # One output row per chain that shares this sequence
    for single_chain in _chain_ids_from_header(full_header):
        records.append((pdb_id, single_chain, sequence))

df = pd.DataFrame(records, columns=["PDB ID", "Chain ID", "Sequence"])
df.to_excel(parsed_output_path, index=False)
# Report the file that was actually written
print(f"✅ File saved: {parsed_output_path}")


✅ File saved: 3His_sequence_parsed2.xlsx


In [None]:
#Only extract the sequence of metal-binding chain
import pandas as pd

# Inputs: per-chain sequences and the table of (PDB ID, Chain ID) pairs to keep
file1_path = "/content/2His_1Glu_coordination_3_pdb_list_parsed.xlsx"
file2_path = "/content/2His_1Glu_pdb_chain_ID.xlsx"
output_path = "/content/2His_1Glu_coordination_3_pdb_list_sequences.xlsx"

df_main = pd.read_excel(file1_path)
df_match = pd.read_excel(file2_path)

# Inner join: only rows whose 'PDB ID' and 'Chain ID' occur in both tables survive
merged_df = df_main.merge(df_match, how='inner', on=['PDB ID', 'Chain ID'])

# Write the filtered table
merged_df.to_excel(output_path, index=False)




In [None]:
#Global sequence alignment
import pandas as pd
from Bio import pairwise2
from Bio.Align import substitution_matrices
import time

# Input file path
input_file = '/content/2His_1Glu_coordination_3_pdb_list_sequences.xlsx'

# Read the table and force the sequence column to text
df = pd.read_excel(input_file)
df["Sequence"] = df["Sequence"].astype(str)

unique_sequences = []    # (PDB ID, sequence) representatives accepted so far
excluded_sequences = []  # (PDB ID, sequence) entries rejected as too similar

start_time = time.time()
threshold = 30.0  # Identity threshold (%)

# Substitution matrix used to score the global alignments
blosum62 = substitution_matrices.load("BLOSUM62")

# Greedy filter: a sequence is kept only if it stays below the identity
# threshold against every representative accepted before it.
for i, row in df.iterrows():
    pdb_id = row["PDB ID"]
    sequence = row["Sequence"]
    is_unique = True

    for unique_pdb_id, unique_sequence in unique_sequences:
        # Global alignment with gap-open -10 and gap-extend -0.5; only the first alignment is used
        alignments = pairwise2.align.globalds(
            sequence, unique_sequence, blosum62, -10, -0.5, one_alignment_only=True
        )
        aligned_seq1, aligned_seq2, score, start, end = alignments[0]

        # Identity = matching non-gap columns over the full alignment length (gaps included)
        alignment_length = len(aligned_seq1)
        identical = 0
        for a, b in zip(aligned_seq1, aligned_seq2):
            if a != '-' and b != '-' and a == b:
                identical += 1
        percent_identity = (identical / alignment_length) * 100

        if percent_identity < threshold:
            continue

        # The first representative at or above the threshold rejects this sequence
        is_unique = False
        excluded_sequences.append((pdb_id, sequence))
        print(f"PDB ID {pdb_id} excluded (identity: {percent_identity:.2f}%) with {unique_pdb_id}")
        break

    if is_unique:
        unique_sequences.append((pdb_id, sequence))
        print(f"PDB ID {pdb_id} is unique and saved.")

    # Linear extrapolation of the remaining run time from the rows done so far
    elapsed_time = time.time() - start_time
    remaining_time = (elapsed_time / (i + 1)) * (len(df) - i - 1)
    print(f"Progress: {i+1}/{len(df)} | Elapsed: {elapsed_time:.2f}s | Remaining: {remaining_time:.2f}s")

# Save results
df_unique = pd.DataFrame(unique_sequences, columns=["PDB ID", "Sequence"])
df_excluded = pd.DataFrame(excluded_sequences, columns=["PDB ID", "Sequence"])

df_unique.to_excel("/content/2His_1Glu_unique_30.xlsx", index=False)
df_excluded.to_excel("/content/2His_1Glu_blo_exclude_30.xlsx", index=False)

print("Sequence identity analysis completed.")



In [None]:
#Copy the PDB file based on unique_sequence
import pandas as pd
import shutil
import os

# Path to the input Excel file listing the PDB IDs to copy
input_file_unique = "L:/Zn-installer_rawdata/241020_Mn_Final/Final_3/Final_3.xlsx"

# Path to the source directory containing PDB files
# (read as a module-level name inside copy_pdb_files)
source_dir = "L:/Zn-installer_rawdata/241020_Mn_Final/metal_count_greater_than_1/mixed/"

# Path to the destination directory
destination_dir_unique = "L:/Zn-installer_rawdata/241020_Mn_Final/Final_3/"

# Create the destination directory if it does not exist
os.makedirs(destination_dir_unique, exist_ok=True)

# Function to copy PDB files based on a DataFrame
def copy_pdb_files(df, destination_dir):
    """Copy the PDB file of every entry in df['PDB ID'] into destination_dir.

    Files are looked up as "<PDB ID>.pdb" in the module-level ``source_dir``.
    Each entry is reported on stdout as either copied or not found; missing
    files are skipped.
    """
    for pdb_id in df['PDB ID'].tolist():
        source_file = os.path.join(source_dir, f"{pdb_id}.pdb")
        if not os.path.exists(source_file):
            print(f"File not found: {pdb_id}.pdb")
            continue

        destination_file = os.path.join(destination_dir, f"{pdb_id}.pdb")
        shutil.copy(source_file, destination_file)
        print(f"Copied: {pdb_id}.pdb to {destination_dir}")

# Read the Excel file (must contain a 'PDB ID' column) using pandas
df_unique = pd.read_excel(input_file_unique)


# Copy the PDB file of every listed entry into the destination directory;
# entries without a matching file are only reported, not treated as errors
copy_pdb_files(df_unique, destination_dir_unique)

In [None]:
#structure alignment (If necessary)
import os
import itertools
import pandas as pd
from pymol2 import PyMOL

# Change this to your local folder with PDBs
pdb_folder = r"/content/drive/MyDrive/All/Zn_2His_1Glu"  # <-- change this
pdb_files = [f for f in os.listdir(pdb_folder) if f.endswith(".pdb")]

# NOTE: the second argument of each join below starts with "/", i.e. it is an
# absolute path on POSIX, and os.path.join drops every component that precedes
# an absolute one. The results are therefore written under
# /content/drive/MyDrive/250413_Final/, not inside pdb_folder.
all_rmsd_path = os.path.join(pdb_folder, "/content/drive/MyDrive/250413_Final/2His_1Glu_pairwise_rmsd.xlsx")
similar_rmsd_path = os.path.join(pdb_folder, "/content/drive/MyDrive/250413_Final/2His_1Glu_similar_structures.xlsx")

results = []          # one row per successfully aligned pair
similar_results = []  # subset of rows with RMSD below 1.0

# ✅ Start PyMOL instance properly
with PyMOL() as pymol:
    pymol.start()
    cmd = pymol.cmd

    # Load each structure as an object named after the file (without extension)
    for pdb_file in pdb_files:
        full_path = os.path.join(pdb_folder, pdb_file)
        obj_name = os.path.splitext(pdb_file)[0]
        cmd.load(full_path, obj_name)

    # Align every unordered pair of structures once
    for pdb1, pdb2 in itertools.combinations(pdb_files, 2):
        obj1 = os.path.splitext(pdb1)[0]
        obj2 = os.path.splitext(pdb2)[0]
        try:
            # First element of the align result is used as the RMSD
            # (presumably the value after PyMOL's refinement cycles; confirm in the PyMOL docs)
            rmsd = cmd.align(obj1, obj2)[0]
            print(f"{pdb1} vs {pdb2} -> RMSD = {rmsd:.3f}")
            row = {
                "Structure 1": pdb1,
                "Structure 2": pdb2,
                "RMSD (Å)": round(rmsd, 3)
            }
            results.append(row)
            # The similarity cut-off is applied to the unrounded RMSD
            if rmsd < 1.0:
                similar_results.append(row)
        except Exception as e:
            # A failed pair is reported and left out of both tables
            print(f"Failed: {pdb1} vs {pdb2} — {e}")

# Save results
df_all = pd.DataFrame(results)
df_similar = pd.DataFrame(similar_results)
df_all.to_excel(all_rmsd_path, index=False)
df_similar.to_excel(similar_rmsd_path, index=False)

# ✅ Extract unique representative PDBs by clustering similar ones
#
# Structures linked by a "similar" pair, directly or through a chain of pairs,
# belong to one cluster. Each cluster is represented by its alphabetically
# first file name, so the result is deterministic and every cluster keeps
# exactly one member.
def _cluster_root(parent, item):
    """Follow the parent links of `item` up to the root of its cluster."""
    while parent[item] != item:
        item = parent[item]
    return item


# Every structure starts as the root of its own single-member cluster
_parent = {name: name for name in pdb_files}

for _, row in df_similar.iterrows():
    a, b = row["Structure 1"], row["Structure 2"]
    root_a = _cluster_root(_parent, a)
    root_b = _cluster_root(_parent, b)
    if root_a == root_b:
        continue  # already in the same cluster
    # Always hang the larger name under the smaller one, so the root of a
    # cluster is its alphabetically first member
    if root_b < root_a:
        root_a, root_b = root_b, root_a
    _parent[root_b] = root_a

# Collect the members of each cluster under its root
_clusters = {}
for name in pdb_files:
    _clusters.setdefault(_cluster_root(_parent, name), set()).add(name)

# Groups of mutually linked structures (clusters with more than one member)
similar_groups = [members for members in _clusters.values() if len(members) > 1]

# One representative (the root) per cluster, singletons included
unique_pdbs = set(_clusters)

# Save unique list
unique_pdb_list = sorted(unique_pdbs)
df_unique = pd.DataFrame({"Unique PDB": unique_pdb_list})
# The second argument is an absolute path, so os.path.join ignores pdb_folder here
unique_pdb_path = os.path.join(pdb_folder, "/content/drive/MyDrive/250413_Final/2His_1Glu_unique_pdb.xlsx")
df_unique.to_excel(unique_pdb_path, index=False)

print(f"✅ Unique PDBs saved to: {unique_pdb_path}")


In [None]:
import os
import pandas as pd
from Bio import PDB
import numpy as np
import logging

# --- Constants ---
# Input/Output Configuration
# Input PDB files are read from, and result workbooks written to, the same folder.
PDB_DIRECTORY = "D:/250414_Final/Mn/2His_1Asp"
OUTPUT_DIR = "D:/250414_Final/Mn/2His_1Asp"
# Changed prefix to indicate the format change
# (output files are named "<prefix><coordination number>.xlsx")
OUTPUT_FILENAME_PREFIX = "original_format_output_"

# Biochemical Constants
# Used both as the residue name and as the atom name when locating metal ions.
METAL_ID = 'MN'
# Residue types considered as potential ligands; any other residue name is skipped.
# ILE, LEU, PRO and TRP are not in this set.
ALLOWED_RESIDUES = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
    'LYS', 'MET', 'PHE', 'SER', 'THR', 'TYR', 'VAL'
}

# Distance thresholds (in Angstroms) for MN coordination [min_dist, max_dist]
# Using the dictionary provided in the script
# Keys are "<residue name>_<atom name>". A residue/atom pair without a key gets
# the (0, 0) fallback from get_distance_threshold and is skipped by the site
# search. PHE and VAL are in ALLOWED_RESIDUES but have no key here, and CYS/MET
# only list their sulfur atoms, so those other atoms can never be reported.
MN_DISTANCE_THRESHOLDS = {
    'HIS_ND1': (0, 2.75), 'HIS_NE2': (0, 2.75), 'HIS_N': (0, 2.75), 'HIS_O': (0, 2.85),
    'GLU_OE1': (0, 2.85), 'GLU_OE2': (0, 2.85), 'GLU_N': (0, 2.75), 'GLU_O': (0, 2.85),
    'ASP_OD1': (0, 2.85), 'ASP_OD2': (0, 2.85), 'ASP_N': (0, 2.75), 'ASP_O': (0, 2.85),
    'ALA_N': (0, 2.75),   'ALA_O': (0, 2.85),
    'CYS_SG': (0, 2.75),
    'MET_SD': (0, 2.75),
    'ARG_NH1': (0, 2.75), 'ARG_NH2': (0, 2.75), 'ARG_NE': (0, 2.75), 'ARG_N': (0, 2.75), 'ARG_O': (0, 2.85),
    'ASN_OD1': (0, 2.85), 'ASN_ND2': (0, 2.75), 'ASN_N': (0, 2.75), 'ASN_O': (0, 2.85),
    'GLN_OE1': (0, 2.85), 'GLN_NE2': (0, 2.75), 'GLN_N': (0, 2.75), 'GLN_O': (0, 2.85),
    'GLY_N': (0, 2.75),   'GLY_O': (0, 2.85),
    'LYS_NZ': (0, 2.75),  'LYS_N': (0, 2.75), 'LYS_O': (0, 2.85),
    'SER_OG': (0, 2.85),  'SER_N': (0, 2.75), 'SER_O': (0, 2.85),
    'THR_OG1': (0, 2.85), 'THR_N': (0, 2.75), 'THR_O': (0, 2.85),
    'TYR_OH': (0, 2.85),  'TYR_N': (0, 2.75), 'TYR_O': (0, 2.85),
}

# Define base columns matching the desired "original" output structure
# 8 columns before binding atoms start: the first five describe the metal ion,
# the last three the ligand residue of the row.
BASE_OUTPUT_COLUMNS = [
    'Entry ID', 'PDB ID', 'Metal Chain ID', 'Metal Residue number', 'Metal',
    'Chain ID', 'Residue number', 'Residue name' # These refer to the specific ligand in the row
]

# --- Helper Functions (Keep as they are in the provided script) ---

def calculate_distance(atom1, atom2):
    """Return the straight-line distance between two atoms.

    Both arguments must provide ``get_coord()``. If the coordinates cannot be
    read or subtracted, the error is logged and infinity is returned so the
    pair can never satisfy a distance threshold.
    """
    try:
        delta = atom1.get_coord() - atom2.get_coord()
        return np.sqrt(np.square(delta).sum())
    except Exception as e:
        # The PDB id is not available here, so only the two atoms are reported
        logging.error(f"Error calculating distance between {atom1} and {atom2}: {e}")
        return float('inf')

def get_distance_threshold(residue_name, atom_name, thresholds):
    """
    Look up the (min, max) distance threshold for one residue/atom pair.

    Args:
        residue_name: Residue name, e.g. 'HIS'.
        atom_name: Atom name within that residue, e.g. 'NE2'.
        thresholds: Mapping keyed by "<residue_name>_<atom_name>" whose values
            are (min_dist, max_dist) tuples.

    Returns:
        The tuple stored under "<residue_name>_<atom_name>", or (0, 0) when the
        mapping has no entry for that pair.
    """
    # Every special case (carboxylate oxygens, CYS/MET sulfur) is keyed exactly
    # like any other atom, so one lookup with a (0, 0) fallback covers them all.
    return thresholds.get(f"{residue_name}_{atom_name}", (0, 0))


# --- MODIFIED Core Logic Function ---
# Renamed to avoid confusion, but uses the same internal logic as the provided script's find_coordination_sites
# The CHANGE is in how the final data is structured and returned.
def _collect_coordinating_ligands(model, metal_residue, metal_atom, thresholds, allowed_ligands):
    """Find the residues of `model` that have at least one atom within threshold of `metal_atom`.

    Returns a dict keyed by (chain id, residue number) whose values hold the
    ligand chain id, residue number, residue name and the set of binding atom names.
    """
    ligands = {}
    for ligand_chain in model:
        for ligand_residue in ligand_chain:
            # Skip the metal itself and residue types that are not candidate ligands
            if ligand_residue == metal_residue or ligand_residue.get_resname() not in allowed_ligands:
                continue

            ligand_res_name = ligand_residue.get_resname()
            binding_atoms = set()
            for ligand_atom in ligand_residue:
                ligand_atom_name = ligand_atom.get_id()
                min_dist, max_dist = get_distance_threshold(ligand_res_name, ligand_atom_name, thresholds)
                if max_dist <= 0:
                    continue  # no threshold defined for this residue/atom pair
                if min_dist <= calculate_distance(metal_atom, ligand_atom) <= max_dist:
                    binding_atoms.add(ligand_atom_name)

            if not binding_atoms:
                continue

            residue_key = (ligand_chain.id, ligand_residue.id[1])
            if residue_key in ligands:
                # Same chain and residue number seen before: merge the binding atoms
                ligands[residue_key]["binding_atoms"].update(binding_atoms)
            else:
                ligands[residue_key] = {
                    "ligand_chain_id": ligand_chain.id,
                    "ligand_res_num": ligand_residue.id[1],
                    "ligand_res_name": ligand_res_name,
                    "binding_atoms": binding_atoms,
                }
    return ligands


def find_coordination_sites_for_original_format(structure, pdb_id, metal_atom_name, thresholds, allowed_ligands):
    """
    Identify metal coordination sites and format them as one row per metal-ligand pair.

    Every residue named `metal_atom_name` that contains an atom of the same
    name is treated as a metal ion. Ligands are searched only within the model
    the metal belongs to, so coordinates of different models are never mixed.

    Args:
        structure: Parsed structure; iterated as model -> chain -> residue -> atom.
        pdb_id: Identifier derived from the file name (written as 'Entry ID').
        metal_atom_name: Residue name and atom name of the metal ion, e.g. 'MN'.
        thresholds: Mapping passed on to get_distance_threshold.
        allowed_ligands: Collection of residue names accepted as ligands.

    Returns:
        A dictionary where keys are coordination numbers (number of ligand
        residues of a metal) and values are lists of tuples. Each tuple
        represents ONE metal-ligand interaction:
        (entry_id, pdb_code, metal_chain, metal_resnum, metal_name,
         ligand_chain, ligand_resnum, ligand_name, atom1, atom2, ...)
        with the binding atom names sorted alphabetically. Metals without any
        ligand are omitted.
    """
    coordination_sites_by_number = {}

    # The header may carry an empty idcode (e.g. file without a HEADER record);
    # `or` falls back to the file-name id in that case as well as when the key is absent.
    pdb_code = structure.header.get('idcode') or pdb_id

    for model in structure:
        for chain in model:
            for metal_residue in chain:
                if metal_residue.get_resname() != metal_atom_name:
                    continue

                try:
                    metal_atom = metal_residue[metal_atom_name]
                except KeyError:
                    logging.warning(f"Metal residue {metal_residue.id} in {pdb_id} "
                                    f"doesn't contain a '{metal_atom_name}' atom. Skipping.")
                    continue

                ligands = _collect_coordinating_ligands(
                    model, metal_residue, metal_atom, thresholds, allowed_ligands
                )
                coordination_number = len(ligands)
                if coordination_number == 0:
                    continue

                # Columns shared by every row of this metal ion
                metal_columns = (
                    pdb_id,                          # Entry ID (from filename)
                    pdb_code,                        # PDB ID (from header)
                    metal_residue.get_parent().id,   # Metal Chain ID
                    metal_residue.id[1],             # Metal Residue number
                    metal_residue.get_resname(),     # Metal Name
                )
                rows = [
                    metal_columns
                    + (details["ligand_chain_id"], details["ligand_res_num"], details["ligand_res_name"])
                    + tuple(sorted(details["binding_atoms"]))
                    for details in ligands.values()
                ]
                coordination_sites_by_number.setdefault(coordination_number, []).extend(rows)

    return coordination_sites_by_number


# --- MODIFIED Saving Function ---
# Renamed function to clarify its purpose
def save_coordination_data_as_original_format(all_coordination_data, output_dir, file_prefix, base_columns):
    """
    Write one Excel workbook per coordination number, one row per metal-ligand pair.

    Args:
        all_coordination_data: {coordination number: list of row tuples}.
        output_dir: Directory the workbooks are written to.
        file_prefix: File name prefix; the coordination number and ".xlsx" are appended.
        base_columns: Names of the fixed leading columns; every extra tuple
            element becomes a 'Binding atomN' column.

    Nothing is returned. Failures while building or saving a table are logged
    and do not stop the remaining coordination numbers.
    """
    if not all_coordination_data:
        logging.warning("No coordination data found to save.")
        return

    for coordination_number, sites_list in all_coordination_data.items():
        if not sites_list:
            logging.info(f"No sites found for coordination number {coordination_number}. Skipping file.")
            continue

        logging.info(f"Processing data for coordination number {coordination_number}...")

        # Drop exact duplicate rows and order the remainder deterministically
        deduped_rows = sorted(set(sites_list))

        # Widest binding-atom tail among the rows decides how many extra columns exist
        n_fixed = len(base_columns)
        n_atom_columns = 0
        if deduped_rows:
            try:
                n_atom_columns = max(len(site_row) - n_fixed for site_row in deduped_rows)
            except ValueError:
                logging.error(f"Inconsistent tuple length found for coord_num {coordination_number}. Check data.")
                n_atom_columns = 0

        columns = [*base_columns, *(f'Binding atom{i}' for i in range(1, n_atom_columns + 1))]

        # Rows with fewer binding atoms than the widest one are padded with NaN
        try:
            df_coordination = pd.DataFrame(deduped_rows, columns=columns).fillna(np.nan)
        except Exception as e:
            # Fall back to pandas' default column labels rather than losing the data
            logging.error(f"Error creating DataFrame for coord num {coordination_number} with specific columns: {e}. Using default columns.")
            df_coordination = pd.DataFrame(deduped_rows)

        output_excel_path = os.path.join(output_dir, f"{file_prefix}{coordination_number}.xlsx")

        try:
            df_coordination.to_excel(output_excel_path, index=False)
            logging.info(f"Saved data for coordination number {coordination_number} to {output_excel_path}")
        except Exception as e:
            logging.error(f"Failed to save Excel file {output_excel_path}: {e}")

# --- Main Execution (Calls the modified functions) ---

def main():
    """Analyse every .pdb file in PDB_DIRECTORY and write the pooled results to Excel.

    Files that fail to parse or analyse are logged (with traceback) and skipped;
    the rows of all remaining files are pooled per coordination number and
    handed to save_coordination_data_as_original_format.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    pooled_sites = {}  # {coord_num: [metal-ligand row tuples from all files]}
    pdb_parser = PDB.PDBParser(QUIET=True)

    logging.info(f"Starting PDB analysis in directory: {PDB_DIRECTORY}")

    n_processed = 0
    for filename in os.listdir(PDB_DIRECTORY):
        # Only PDB files are analysed (extension check is case-insensitive)
        if not filename.lower().endswith('.pdb'):
            continue

        full_path = os.path.join(PDB_DIRECTORY, filename)
        pdb_id = filename.split('.')[0]
        logging.info(f"Processing {filename} (ID: {pdb_id})...")

        try:
            structure = pdb_parser.get_structure(pdb_id, full_path)
            per_file_sites = find_coordination_sites_for_original_format(
                structure, pdb_id, METAL_ID, MN_DISTANCE_THRESHOLDS, ALLOWED_RESIDUES
            )
            # Results arrive already grouped by coordination number; pool them
            for coord_num, rows in per_file_sites.items():
                pooled_sites.setdefault(coord_num, []).extend(rows)
            n_processed += 1
        except Exception as e:
            logging.error(f"Could not process file {filename}. Error: {e}", exc_info=True)

    logging.info(f"Finished processing {n_processed} PDB files.")

    logging.info("Saving aggregated results to Excel files (Original Row Format)...")
    save_coordination_data_as_original_format(
        pooled_sites, OUTPUT_DIR, OUTPUT_FILENAME_PREFIX, BASE_OUTPUT_COLUMNS
    )

    logging.info('Script execution completed.')


if __name__ == "__main__":
    main()

In [None]:
#change the format of excel file
import pandas as pd

# Load the long-format table (one row per metal-ligand pair)
coordination_df = pd.read_excel('L:/Zn-installer_rawdata/241020_Mn_Final/Final_3/new_3.xlsx')

# Strip stray single quotes from the column names
coordination_df.columns = coordination_df.columns.str.replace("'", "")

# One group per metal ion: entry, metal chain and metal residue number
grouped = coordination_df.groupby(['Entry ID', 'Metal Chain ID', 'Metal Residue number'])

# Build one wide row per metal ion: the three key values followed by four
# fields for each of its ligand rows (only the first binding atom is kept)
ligand_fields = ['Chain ID', 'Residue number', 'Residue name', 'Binding atom1']
horizontal_data = []
for name, group in grouped:
    row = list(name)
    for _, data in group.iterrows():
        row += [data[field] for field in ligand_fields]
    horizontal_data.append(row)

# The longest row decides how many numbered ligand column sets are needed
max_columns = max(len(row) for row in horizontal_data)
columns = ['Entry ID', 'Metal Chain ID', 'Metal Residue number']
for i in range((max_columns - 3) // 4):
    columns += [f'Chain ID{i+1}', f'Residue_number{i+1}', f'Residue_name{i+1}', f'Binding atom{i+1}']

# Create the horizontally arranged DataFrame
horizontal_df = pd.DataFrame(horizontal_data, columns=columns)

# Save the resulting DataFrame to a new Excel file
horizontal_df.to_excel('L:/Zn-installer_rawdata/241020_Mn_Final/Final_3/new_3_format.xlsx', index=False)

In [None]:
#calculate the geometric parameter
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings
from math import acos, degrees

# Load the wide-format Excel file (one row per metal site; the processing loop
# reads 'Entry ID', 'Chain ID1..3' and 'Residue_number1..3' from it)
excel_file = 'L:/Zn-installer_rawdata/241111_Mn_Final//Final/new_3_format.xlsx'
df = pd.read_excel(excel_file)

# Specify the directory containing PDB files ("<Entry ID>.pdb" is looked up here)
pdb_directory = 'L:/Zn-installer_rawdata/241111_Mn_Final//Final/'

def parse_pdb_id(pdb_id):
    """Return the identifier with leading and trailing whitespace removed."""
    cleaned_id = pdb_id.strip()
    return cleaned_id

def calculate_distance(coord1, coord2):
    """Return the Euclidean norm of the difference between two coordinate arrays."""
    separation = coord1 - coord2
    return np.linalg.norm(separation)

def calculate_angle(vector1, vector2):
    """
    Calculate the angle between two vectors.

    Args:
        vector1, vector2: Coordinate arrays of equal length.

    Returns:
        The angle in degrees, in the range [0, 180].
    """
    unit_vector1 = vector1 / np.linalg.norm(vector1)
    unit_vector2 = vector2 / np.linalg.norm(vector2)
    # Rounding can push the dot product of (anti)parallel unit vectors slightly
    # outside [-1, 1], where acos raises "math domain error"; clip it back.
    dot_product = np.clip(np.dot(unit_vector1, unit_vector2), -1.0, 1.0)
    angle = degrees(acos(dot_product))
    return angle

# Add columns for distances and angles
for i in range(1, 4):
    df[f'Calpha_X_{i}'] = None
    df[f'Calpha_Y_{i}'] = None
    df[f'Calpha_Z_{i}'] = None
    df[f'Cbeta_X_{i}'] = None
    df[f'Cbeta_Y_{i}'] = None
    df[f'Cbeta_Z_{i}'] = None
    df[f'Angle_{i}'] = None  # created for the output layout; not filled by this cell

for i in range(3):
    for j in range(i + 1, 3):
        df[f'Calpha_Distance_{i+1}_{j+1}'] = None
        df[f'Cbeta_Distance_{i+1}_{j+1}'] = None
        df[f'Angle_{i+1}_{j+1}'] = None

# The parser holds no per-file state that this loop relies on, so build it once
parser = PDBParser(QUIET=True)

# Calculate distances, angles, and save alpha and beta carbon coordinates
for index, row in df.iterrows():
    pdb_id = parse_pdb_id(row['Entry ID'])
    chain_ids = [row['Chain ID1'], row['Chain ID2'], row['Chain ID3']]
    residues = [row['Residue_number1'], row['Residue_number2'], row['Residue_number3']]

    pdb_filename = f"{pdb_directory}/{pdb_id}.pdb"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = parser.get_structure(pdb_id, pdb_filename)

    # Only the first model of the structure is used
    model = structure[0]
    residue_coords = []

    for i in range(3):
        ca_coord = None
        cb_coord = None
        try:
            residue = model[chain_ids[i]][int(residues[i])]
        except (KeyError, ValueError, TypeError):
            # Unknown chain/residue, or a blank cell that cannot be turned into
            # a residue number: leave this ligand's columns empty
            print(f"Residue {i+1} of {pdb_id} (chain {chain_ids[i]}, number {residues[i]}) not found; skipped.")
            residue = None

        if residue is not None:
            # CA and CB are looked up independently so that a residue without
            # CB (e.g. glycine) still contributes its CA coordinates
            if 'CA' in residue:
                ca_coord = np.array(residue['CA'].get_coord())
                df.at[index, f'Calpha_X_{i+1}'] = ca_coord[0]
                df.at[index, f'Calpha_Y_{i+1}'] = ca_coord[1]
                df.at[index, f'Calpha_Z_{i+1}'] = ca_coord[2]

            if 'CB' in residue:
                cb_coord = np.array(residue['CB'].get_coord())
                df.at[index, f'Cbeta_X_{i+1}'] = cb_coord[0]
                df.at[index, f'Cbeta_Y_{i+1}'] = cb_coord[1]
                df.at[index, f'Cbeta_Z_{i+1}'] = cb_coord[2]

        residue_coords.append((ca_coord, cb_coord))

    # Pairwise quantities for the three ligand residues
    for i in range(3):
        for j in range(i + 1, 3):
            ca_coord_i, cb_coord_i = residue_coords[i]
            ca_coord_j, cb_coord_j = residue_coords[j]

            if ca_coord_i is not None and ca_coord_j is not None:
                alpha_distance = calculate_distance(ca_coord_i, ca_coord_j)
                df.at[index, f'Calpha_Distance_{i+1}_{j+1}'] = alpha_distance

            if cb_coord_i is not None and cb_coord_j is not None:
                beta_distance = calculate_distance(cb_coord_i, cb_coord_j)
                df.at[index, f'Cbeta_Distance_{i+1}_{j+1}'] = beta_distance

            if ca_coord_i is not None and cb_coord_i is not None and ca_coord_j is not None and cb_coord_j is not None:
                # Angle between the CA(i)->CA(j) vector and the CB(i)->CB(j) vector
                vector_i = ca_coord_j - ca_coord_i
                vector_j = cb_coord_j - cb_coord_i
                angle = calculate_angle(vector_i, vector_j)
                df.at[index, f'Angle_{i+1}_{j+1}'] = angle

# Save the updated DataFrame to a new Excel file
df.to_excel('L:/Zn-installer_rawdata/241111_Mn_Final//Final/Geometric_parameters.xlsx', index=False)

In [None]:
#2D_Final (density map)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio.PDB import PDBParser, is_aa
from math import acos, degrees

def calculate_distance(coord1, coord2):
    """Return the Euclidean distance between two coordinate arrays."""
    separation = coord1 - coord2
    return np.linalg.norm(separation)

def calculate_angle(coord1, coord2, coord3):
    """Return the angle at ``coord2`` between ``coord1`` and ``coord3``, in degrees.

    The three arguments are coordinate arrays; ``coord2`` is the vertex.
    There is no guard for coincident points: a zero-length arm makes the
    cosine a division by zero.
    """
    # Both arms of the angle start at the vertex.
    arm_a = coord1 - coord2
    arm_b = coord3 - coord2

    norm_product = np.linalg.norm(arm_a) * np.linalg.norm(arm_b)
    cos_theta = np.dot(arm_a, arm_b) / norm_product

    # Rounding can push the ratio marginally outside [-1, 1], which acos rejects.
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    return degrees(acos(cos_theta))

def extract_zn_coordinates(pdb_directory, pdb_entry, metal_chain_id, metal_residue_number, metal='FE'):
    """Return the coordinates of one metal ion in a PDB file, or ``None``.

    Despite the historical ``zn`` in its name, the function looks for the ion
    named by ``metal``; the default ``'FE'`` keeps the iron lookup this cell
    has always performed.

    Args:
        pdb_directory: Directory holding ``<pdb_entry>.pdb``.
        pdb_entry: Entry ID, used as the file stem and as the structure ID.
        metal_chain_id: Chain ID of the metal residue.
        metal_residue_number: Sequence number of the metal residue
            (compared with ``residue.id[1]``).
        metal: String that both the residue name and the atom element must
            equal.

    Returns:
        ``atom.coord`` of the first matching atom in model 0, or ``None``
        when the file is missing or nothing matches.
    """
    pdb_file = os.path.join(pdb_directory, f"{pdb_entry}.pdb")

    # Check for the file first so a missing entry costs no parser work.
    if not os.path.exists(pdb_file):
        print(f"PDB file for {pdb_entry} not found in the directory.")
        return None

    structure = PDBParser(QUIET=True).get_structure(pdb_entry, pdb_file)

    # Only the first model is searched.
    for chain in structure[0]:
        if chain.id != metal_chain_id:
            continue
        for residue in chain:
            if residue.id[1] != metal_residue_number:
                continue
            # strip() in case the parser left PDB column padding on the name.
            if residue.resname.strip() != metal:
                continue
            for atom in residue:
                if atom.element == metal:
                    return atom.coord
    return None

def extract_coordinates(structure, chain_id, residue_number, atom_name):
    """Return the coordinates of a named atom in an amino-acid residue, or ``None``.

    Searches model 0 of ``structure`` for a residue in chain ``chain_id``
    whose sequence number (``residue.id[1]``) equals ``residue_number`` and
    which ``is_aa`` accepts. The first such residue that contains
    ``atom_name`` supplies the result.
    """
    first_model = structure[0]
    for chain in first_model:
        if chain.id != chain_id:
            continue
        for residue in chain:
            # The number is tested first, so is_aa only runs on candidates.
            if residue.id[1] != residue_number or not is_aa(residue):
                continue
            if atom_name in residue:
                return residue[atom_name].coord
    return None

def _find_metal_coord(structure, chain_id, residue_number, metal):
    """Return the coord of the first ``metal`` atom in the given residue of model 0, or None."""
    for chain in structure[0]:
        if chain.id != chain_id:
            continue
        for residue in chain:
            # strip() in case the parser left PDB column padding on the name.
            if residue.id[1] != residue_number or residue.resname.strip() != metal:
                continue
            for atom in residue:
                if atom.element == metal:
                    return atom.coord
    return None


def process_pdb_file(pdb_directory, pdb_entry, metal_chain_id, metal_residue_number, residues_info, metal='FE'):
    """Measure CA/CB-to-metal distances and CA-metal-CB angles for one entry.

    The PDB file is parsed once; the same parsed structure is used for the
    metal lookup and for the residue atoms. Output keys keep the historical
    ``Zn`` wording regardless of which metal is searched.

    Args:
        pdb_directory: Directory holding ``<pdb_entry>.pdb``.
        pdb_entry: Entry ID, used as the file stem and as the structure ID.
        metal_chain_id: Chain ID of the metal residue.
        metal_residue_number: Sequence number of the metal residue.
        residues_info: Iterable of dicts with ``'chain_id'`` and
            ``'residue_number'`` keys, one per coordinating residue.
        metal: String that the metal's residue name and element must equal
            (default ``'FE'``).

    Returns:
        A dict with ``'Entry ID'`` plus three CA distances, three CB
        distances and three angles, or ``None`` when the file or the metal is
        missing or when the number of residues with both CA and CB is not
        exactly three.
    """
    pdb_file = os.path.join(pdb_directory, f"{pdb_entry}.pdb")
    if not os.path.exists(pdb_file):
        print(f"PDB file for {pdb_entry} not found in the directory.")
        return None

    # Single parse, shared by the metal lookup and the residue lookups below.
    structure = PDBParser(QUIET=True).get_structure(pdb_entry, pdb_file)

    zn_coords = _find_metal_coord(structure, metal_chain_id, metal_residue_number, metal)
    if zn_coords is None:
        print(f"Zn atom not found in {pdb_entry}.")
        return None

    # One (CA distance, CB distance, angle) triple per fully resolved residue.
    measurements = []
    for residue_info in residues_info:
        chain_id = residue_info['chain_id']
        residue_number = residue_info['residue_number']
        ca_coords = extract_coordinates(structure, chain_id, residue_number, 'CA')
        cb_coords = extract_coordinates(structure, chain_id, residue_number, 'CB')

        if ca_coords is None or cb_coords is None:
            # Keep scanning so every missing residue is reported.
            print(f"CA or CB atom not found for residue {residue_number} in chain {chain_id} of {pdb_entry}.")
            continue

        measurements.append((
            calculate_distance(ca_coords, zn_coords),
            calculate_distance(cb_coords, zn_coords),
            calculate_angle(ca_coords, zn_coords, cb_coords),
        ))

    # The output schema has exactly three slots per quantity.
    if len(measurements) != 3:
        return None

    ca_zn_distances, cb_zn_distances, ca_zn_cb_angles = zip(*measurements)
    return {
        'Entry ID': pdb_entry,
        'Calpha_Zn_Dist1': ca_zn_distances[0],
        'Calpha_Zn_Dist2': ca_zn_distances[1],
        'Calpha_Zn_Dist3': ca_zn_distances[2],
        'Cbeta_Zn_Dist1': cb_zn_distances[0],
        'Cbeta_Zn_Dist2': cb_zn_distances[1],
        'Cbeta_Zn_Dist3': cb_zn_distances[2],
        'CA-Zn-CB Angle_1': ca_zn_cb_angles[0],
        'CA-Zn-CB Angle_2': ca_zn_cb_angles[1],
        'CA-Zn-CB Angle_3': ca_zn_cb_angles[2],
    }

def calculate_and_trim_probability_map(df, calpha_cols, cbeta_cols, calpha_bins, cbeta_bins, npy_file='prob_map.npy', excel_file='precomputed_prob_map.xlsx'):
    """Build a 2D probability map of CA distance versus CB distance.

    The values of all ``calpha_cols`` are stacked into one series, likewise
    all ``cbeta_cols``; the two stacks are paired by position and binned
    with ``np.histogram2d``. Counts are divided by their total.

    Args:
        df: DataFrame containing the distance columns.
        calpha_cols: Column names holding CA distances.
        cbeta_cols: Column names holding CB distances, in matching order.
        calpha_bins: Bin edges for the CA axis (rows).
        cbeta_bins: Bin edges for the CB axis (columns).
        npy_file: Path for the raw map saved with ``np.save``; ``None``
            skips this output.
        excel_file: Path for the formatted table; ``None`` skips this output.

    Returns:
        DataFrame whose first column ``'Unnamed: 0'`` holds the lower CA bin
        edges and whose remaining columns are labelled by the lower CB bin
        edges.
    """
    all_calpha_distances = pd.concat([df[col] for col in calpha_cols], ignore_index=True)
    all_cbeta_distances = pd.concat([df[col] for col in cbeta_cols], ignore_index=True)

    hist, calpha_bins_edges, cbeta_bins_edges = np.histogram2d(
        all_calpha_distances, all_cbeta_distances, bins=[calpha_bins, cbeta_bins]
    )

    # Without this guard an empty histogram would normalise to 0/0 = NaN.
    total = hist.sum()
    if total > 0:
        prob_map = hist / total
    else:
        print("Warning: no sample falls inside the bins; probability map is all zeros.")
        prob_map = np.zeros_like(hist)

    # Rows/columns are labelled by the lower edge of each bin; empty bins stay.
    prob_map_df = pd.DataFrame(prob_map, index=calpha_bins_edges[:-1], columns=cbeta_bins_edges[:-1])

    # Move the row labels into a leading column named 'Unnamed: 0'.
    formatted_prob_map_df = prob_map_df.reset_index()
    formatted_prob_map_df.columns = ['Unnamed: 0'] + list(formatted_prob_map_df.columns[1:])

    if excel_file is not None:
        formatted_prob_map_df.to_excel(excel_file, index=False)
        print(f"Trimmed probability map saved to {excel_file}")

    if npy_file is not None:
        np.save(npy_file, prob_map)
        print(f"Probability map saved as {npy_file}")

    return formatted_prob_map_df

def main():
    """Measure every entry in the input sheet and build the 2D probability map.

    Reads one row per entry from the geometric-parameters workbook, runs
    ``process_pdb_file`` on each, writes the successful measurements to an
    Excel file and, if there are any, passes them to
    ``calculate_and_trim_probability_map``.
    """
    pdb_directory = 'D:/250414_Final/FE/2His_1Asp/'  # Update this path to the directory where PDB files are stored
    excel_file_path = 'D:/250414_Final/FE/Fe_2His_1Asp_geometric_parameters.xlsx'

    # Load the Excel sheet data
    sheet_data = pd.read_excel(excel_file_path, sheet_name='Sheet1')

    all_results = []
    for _, row in sheet_data.iterrows():
        # Columns are suffixed 1..3, one per coordinating residue.
        residues_info = [
            {'chain_id': row[f'Chain ID{i}'], 'residue_number': row[f'Residue_number{i}']}
            for i in (1, 2, 3)
        ]

        result = process_pdb_file(
            pdb_directory,
            row['Entry ID'],
            row['Metal Chain ID'],
            row['Metal Residue number'],
            residues_info,
        )
        if result:
            all_results.append(result)

    # Convert results to DataFrame and save
    results_df = pd.DataFrame(all_results)
    print(results_df)
    results_df.to_excel('D:/250414_Final/FE/2His_1Asp_distance_angle.xlsx', index=False)

    # With no rows the frame has no distance columns, so the map step would
    # fail on a missing column; stop here instead.
    if results_df.empty:
        print("No entry produced a complete set of measurements; probability map skipped.")
        return

    # Calculate and save the probability map
    calpha_cols = ['Calpha_Zn_Dist1', 'Calpha_Zn_Dist2', 'Calpha_Zn_Dist3']
    cbeta_cols = ['Cbeta_Zn_Dist1', 'Cbeta_Zn_Dist2', 'Cbeta_Zn_Dist3']
    # NOTE(review): np.arange with a fractional step may include or drop the
    # end point depending on rounding - confirm the edges are the intended ones.
    calpha_bins = np.arange(4.6, 7.0, 0.2).tolist()
    cbeta_bins = np.arange(3.6, 6.1, 0.2).tolist()
    npy_file = 'D:/250414_Final/FE/Fe_2His_1Asp_distance_angle_0.2.npy'
    excel_file = 'D:/250414_Final/FE/Fe_2His_1Asp_distance_distance_angle_0.2.xlsx'
    calculate_and_trim_probability_map(results_df, calpha_cols, cbeta_cols, calpha_bins, cbeta_bins, npy_file, excel_file)


if __name__ == "__main__":
    main()


In [None]:
#3D_Final

import os
import numpy as np
import pandas as pd
from Bio.PDB import PDBParser, is_aa
from math import acos, degrees

# Helper functions to calculate distances and angles
def calculate_distance(coord1, coord2):
    """Euclidean length of the vector separating ``coord1`` and ``coord2``."""
    delta = coord1 - coord2
    length = np.linalg.norm(delta)
    return length

def calculate_angle(coord1, coord2, coord3):
    """Angle in degrees at vertex ``coord2`` spanned by ``coord1`` and ``coord3``."""
    to_first, to_third = coord1 - coord2, coord3 - coord2
    lengths = np.linalg.norm(to_first) * np.linalg.norm(to_third)
    # Clamp so rounding error cannot take the cosine outside acos's domain.
    bounded = np.clip(np.dot(to_first, to_third) / lengths, -1.0, 1.0)
    return degrees(acos(bounded))

# Load the metal-ion coordinates (the function keeps its historical "zn" name but searches for FE)
def extract_zn_coordinates(pdb_directory, pdb_entry, metal_chain_id, metal_residue_number, metal='FE'):
    """Return the coordinates of one metal ion in a PDB file, or ``None``.

    The ``zn`` in the name is historical: the ion searched for is the one
    named by ``metal``, and the default ``'FE'`` preserves the iron lookup
    this cell has always performed.

    Args:
        pdb_directory: Directory holding ``<pdb_entry>.pdb``.
        pdb_entry: Entry ID, used as the file stem and as the structure ID.
        metal_chain_id: Chain ID of the metal residue.
        metal_residue_number: Sequence number of the metal residue
            (compared with ``residue.id[1]``).
        metal: String that both the residue name and the atom element must
            equal.

    Returns:
        ``atom.coord`` of the first matching atom in model 0, or ``None``
        when the file is missing or nothing matches.
    """
    pdb_file = os.path.join(pdb_directory, f"{pdb_entry}.pdb")

    # Check for the file first so a missing entry costs no parser work.
    if not os.path.exists(pdb_file):
        print(f"PDB file for {pdb_entry} not found.")
        return None

    structure = PDBParser(QUIET=True).get_structure(pdb_entry, pdb_file)

    # Only the first model is searched.
    for chain in structure[0]:
        if chain.id != metal_chain_id:
            continue
        for residue in chain:
            if residue.id[1] != metal_residue_number:
                continue
            # strip() in case the parser left PDB column padding on the name.
            if residue.resname.strip() != metal:
                continue
            for atom in residue:
                if atom.element == metal:
                    return atom.coord
    return None

# Load atom coordinates
def extract_coordinates(structure, chain_id, residue_number, atom_name):
    """Look up one atom's coordinates in model 0 of ``structure``.

    A residue qualifies when it sits in chain ``chain_id``, its sequence
    number (``residue.id[1]``) equals ``residue_number``, ``is_aa`` accepts
    it and it contains ``atom_name``. Returns the ``coord`` of that atom in
    the first qualifying residue, or ``None`` if there is none.
    """
    qualifying = (
        residue
        for chain in structure[0]
        if chain.id == chain_id
        for residue in chain
        if residue.id[1] == residue_number and is_aa(residue) and atom_name in residue
    )
    hit = next(qualifying, None)
    if hit is None:
        return None
    return hit[atom_name].coord

# Function to create the 3D density map
def calculate_3d_density_map_from_data(df, calpha_cols, cbeta_cols, angle_cols, calpha_bins, cbeta_bins, angle_bins, npy_file='3d_prob_map.npy', excel_file='3d_precomputed_prob_map.xlsx'):
    """Build a 3D probability map over CA distance, CB distance and angle.

    The columns of each group are stacked into one series; the three stacks
    are paired by position and binned with ``np.histogramdd``. Counts are
    divided by their total.

    Args:
        df: DataFrame containing the measurement columns.
        calpha_cols: Column names holding CA distances.
        cbeta_cols: Column names holding CB distances.
        angle_cols: Column names holding the angles.
        calpha_bins: Bin edges for the CA-distance axis.
        cbeta_bins: Bin edges for the CB-distance axis.
        angle_bins: Bin edges for the angle axis.
        npy_file: Path for the raw 3D array saved with ``np.save``; ``None``
            skips this output.
        excel_file: Path for the flattened table; ``None`` skips this output.

    Returns:
        DataFrame with one row per bin: the three bin centres
        (``'Calpha_Zn_Dist'``, ``'Cbeta_Zn_Dist'``, ``'CA-Zn-CB_Angle'``) and
        ``'Probability'``. ``None`` if the three stacked series differ in
        length.
    """
    all_calpha_distances = pd.concat([df[col] for col in calpha_cols], ignore_index=True)
    all_cbeta_distances = pd.concat([df[col] for col in cbeta_cols], ignore_index=True)
    all_angles = pd.concat([df[col] for col in angle_cols], ignore_index=True)

    # Samples are matched by position, so the three stacks must line up.
    if not (len(all_calpha_distances) == len(all_cbeta_distances) == len(all_angles)):
        print("Data series lengths for distances and angles do not match.")
        return None

    hist, edges = np.histogramdd(
        (all_calpha_distances, all_cbeta_distances, all_angles),
        bins=[calpha_bins, cbeta_bins, angle_bins]
    )

    # Without this guard an empty histogram would normalise to 0/0 = NaN.
    total = np.sum(hist)
    if total > 0:
        prob_map = hist / total
    else:
        print("Warning: no sample falls inside the bins; probability map is all zeros.")
        prob_map = np.zeros_like(hist)

    # Report whether the map sums to 1.
    prob_sum = np.sum(prob_map)
    if np.isclose(prob_sum, 1.0):
        print(f"Verification passed: Sum of probabilities = {prob_sum:.6f}")
    else:
        print(f"Warning: Sum of probabilities is not 1 (actual: {prob_sum:.6f})")

    # Bin centres label the rows; from_product iterates in the same order as
    # the flattened array (last axis fastest).
    calpha_centers = 0.5 * (edges[0][:-1] + edges[0][1:])
    cbeta_centers = 0.5 * (edges[1][:-1] + edges[1][1:])
    angle_centers = 0.5 * (edges[2][:-1] + edges[2][1:])
    index = pd.MultiIndex.from_product(
        [calpha_centers, cbeta_centers, angle_centers],
        names=['Calpha_Zn_Dist', 'Cbeta_Zn_Dist', 'CA-Zn-CB_Angle'],
    )
    prob_map_df = pd.DataFrame(prob_map.flatten(), index=index, columns=['Probability']).reset_index()

    if excel_file is not None:
        prob_map_df.to_excel(excel_file, index=False)
        print(f"3D probability map saved to {excel_file}")

    if npy_file is not None:
        np.save(npy_file, prob_map)
        print(f"3D probability map saved as {npy_file}")

    return prob_map_df

# Function to automatically generate bins with edges
def generate_bins_with_auto_edge(start, stop, step):
    """
    Build histogram bin edges from ``start`` to ``stop`` in increments of ``step``.

    Edges are ``start + i * step`` for every whole step that fits in the
    range. If the last stepped edge lands on ``stop`` (up to floating-point
    rounding) it is set to exactly ``stop``; otherwise ``stop`` is appended
    as a final, shorter bin.

    The edges are computed from an integer step count rather than
    ``np.arange`` because a fractional step there can overshoot ``stop`` by a
    rounding error or leave a sliver bin just below it.

    Args:
        start: First edge.
        stop: Last edge; always the final element of the result.
        step: Positive bin width.

    Returns:
        List of increasing edges beginning at ``start`` and ending at ``stop``.

    Raises:
        ValueError: If ``step`` is not positive or ``stop`` is not greater
            than ``start``.
    """
    if step <= 0:
        raise ValueError("step must be positive")
    if stop <= start:
        raise ValueError("stop must be greater than start")

    # Relative slack that absorbs rounding in (stop - start) / step.
    tolerance = 1e-9
    whole_steps = int((stop - start) / step + tolerance)
    bins = [start + i * step for i in range(whole_steps + 1)]

    if stop - bins[-1] > tolerance * step:
        # The stepped edges end short of stop: close the range with a partial bin.
        bins.append(stop)
    else:
        # The last stepped edge is stop up to rounding: make it exact.
        bins[-1] = stop
    return bins

# Bin edges for the two distances and the angle. The helper steps from
# `start` towards `stop` and appends `stop` when the stepped edges end below it.
calpha_bins = generate_bins_with_auto_edge(start=4.6, stop=7.0, step=0.2)  # CA distance axis: 4.6 towards 7.0
cbeta_bins = generate_bins_with_auto_edge(start=3.6, stop=6.1, step=0.2)   # CB distance axis: 3.6 towards 6.1
angle_bins = generate_bins_with_auto_edge(start=0, stop=20, step=0.5)      # angle axis: 0 towards 20

# Names of the measurement columns, three per quantity
calpha_cols = ['Calpha_Zn_Dist1', 'Calpha_Zn_Dist2', 'Calpha_Zn_Dist3']
cbeta_cols = ['Cbeta_Zn_Dist1', 'Cbeta_Zn_Dist2', 'Cbeta_Zn_Dist3']
angle_cols = ['CA-Zn-CB Angle_1', 'CA-Zn-CB Angle_2', 'CA-Zn-CB Angle_3']

# Where the probability map is written
npy_file ='D:/250414_Final/FE/2His_1Asp_prob_map_adjusted3.npy'
excel_file = 'D:/250414_Final/FE/2His_1Asp_0.2_distance_angle_0.5.xlsx'

# Read the previously calculated distances and angles
calculated_data_path = 'D:/250414_Final/FE/2His_1Asp_distance_angle.xlsx'  # Update this path as needed
calculated_df = pd.read_excel(calculated_data_path)

# Build the 3D probability map and write both output files
calculate_3d_density_map_from_data(
    calculated_df,
    calpha_cols,
    cbeta_cols,
    angle_cols,
    calpha_bins,
    cbeta_bins,
    angle_bins,
    npy_file=npy_file,
    excel_file=excel_file,
)


