In [None]:
#Download the PDB listed in the Excel file
import os
import pandas as pd
import requests

# Function to download PDB file
def download_pdb(pdb_code, save_directory, timeout=30):
    """Download one PDB entry from RCSB and store it as ``<pdb_code>.pdb``.

    Args:
        pdb_code: Identifier used to build both the download URL and the
            local file name.
        save_directory: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole batch.

    Returns:
        True if the server answered with HTTP 200 and the file was written,
        False on any other status code or on a network error.
    """
    download_url = f"https://files.rcsb.org/download/{pdb_code}.pdb"

    try:
        response = requests.get(download_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed
        # download so the caller can record the ID and continue.
        print(f"Failed to download {pdb_code}.pdb ({exc})")
        return False

    if response.status_code != 200:
        print(f"Failed to download {pdb_code}.pdb")
        return False

    save_path = os.path.join(save_directory, f"{pdb_code}.pdb")
    with open(save_path, "wb") as file:
        file.write(response.content)
    print(f"Downloaded {pdb_code}.pdb")
    return True

# --- Locations used by this cell ---------------------------------------
# Input workbook listing the entries to fetch (column "PDB ID")
excel_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Mn.xlsx"
# Folder that receives the downloaded .pdb files
pdb_directory = "L:/Zn-installer_rawdata/241111_Mn_Final/"
# Workbook that receives the IDs whose download failed
failed_excel_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Mn_failed.xlsx"

# --- Load the list of PDB codes ----------------------------------------
df = pd.read_excel(excel_file)
pdb_codes = df["PDB ID"].tolist()

# Make sure the target folder exists before any file is written
os.makedirs(pdb_directory, exist_ok=True)

# --- Download every entry, remembering the ones that fail --------------
failed_pdb_ids = []
for pdb_code in pdb_codes:
    succeeded = download_pdb(pdb_code, pdb_directory)
    if succeeded:
        continue
    failed_pdb_ids.append(pdb_code)

# --- Report the failures -----------------------------------------------
failed_df = pd.DataFrame({"PDB ID": failed_pdb_ids})
failed_df.to_excel(failed_excel_file, index=False)
print(f"Failed PDB IDs saved to {failed_excel_file}")

In [None]:
#Classify the PDB files based on the number of manganese ions in the asymmetric unit
import os
import pandas as pd
from Bio.PDB import PDBParser
from collections import defaultdict
import shutil

def categorize_metal_atoms(filename):
    """Split the metal atoms of one PDB file into Mn and non-Mn lists.

    The structure id handed to the parser is the file's base name up to the
    first dot. Every atom of every model, chain and residue is visited and
    sorted by its ``element`` attribute.

    Args:
        filename: Path of the PDB file to parse.

    Returns:
        A ``(manganese_atoms, other_metal_atoms)`` tuple of atom lists.
    """
    # Elements counted as "other metal". An atom whose element is 'MN' is
    # always taken by the first branch below, so the 'MN' entry never matches.
    # NOTE(review): 'CU' is absent here although later cells of this file
    # include it in their metal lists — confirm whether that is intended.
    non_mn_symbols = {'ZN', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI',
                      'CD', 'YB', 'NI', 'PR', 'HG', 'MN'}

    pdb_id = os.path.basename(filename).split('.')[0]
    structure = PDBParser(QUIET=True).get_structure(pdb_id, filename)

    manganese_atoms, other_metal_atoms = [], []

    every_atom = (
        atom
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    )
    for atom in every_atom:
        symbol = atom.element
        if symbol == 'MN':
            manganese_atoms.append(atom)
        elif symbol in non_mn_symbols:
            other_metal_atoms.append(atom)

    return manganese_atoms, other_metal_atoms

def save_metal_atoms_to_excel(directory):
    """Write one Excel row per metal atom found in the ``.pdb`` files of a folder.

    Two workbooks are written into ``directory``: ``mn_atoms2.xlsx`` (one row
    per Mn atom) and ``other_metals2.xlsx`` (one row per non-Mn metal atom).
    Both use the columns 'PDB ID' and 'Name of metal atoms'.

    Args:
        directory: Folder that is scanned for files ending in '.pdb' and that
            also receives the two workbooks.

    Returns:
        Path of the Mn workbook.
    """
    header = ['PDB ID', 'Name of metal atoms']
    mn_rows = []
    other_rows = []

    for entry in os.listdir(directory):
        if not entry.endswith('.pdb'):
            continue
        entry_id = entry.split('.')[0]
        mn_atoms, other_metals = categorize_metal_atoms(os.path.join(directory, entry))

        # One ('<id>', 'MN') row for each Mn atom in the file
        mn_rows.extend((entry_id, 'MN') for _ in mn_atoms)
        # One ('<id>', '<element>') row for each other metal atom
        for metal in other_metals:
            other_rows.append((entry_id, metal.element))

    output_file_mn_atoms = os.path.join(directory, 'mn_atoms2.xlsx')
    output_file_other_metals = os.path.join(directory, 'other_metals2.xlsx')

    pd.DataFrame(mn_rows, columns=header).to_excel(output_file_mn_atoms, index=False)
    pd.DataFrame(other_rows, columns=header).to_excel(output_file_other_metals, index=False)

    print(f"MN atoms categorized have been saved to '{output_file_mn_atoms}'.")
    print(f"Other metals categorized have been saved to '{output_file_other_metals}'.")

    return output_file_mn_atoms

def count_and_categorize_mn_atoms(excel_file, pdb_directory):
    """Count Mn rows per PDB ID and copy each PDB file into a count-based folder.

    The workbook is expected to provide the columns 'PDB ID' and
    'Name of metal atoms'. The per-ID counts are written to
    ``mn_count_results.xlsx`` inside ``pdb_directory``. Each file
    ``<PDB ID>.pdb`` is then copied to ``metal_count_1_mn2`` (exactly one Mn)
    or ``metal_count_greater_than_1_mn2`` (more than one).

    Args:
        excel_file: Workbook with one row per metal atom.
        pdb_directory: Folder holding the PDB files; also the parent of the
            output workbook and of the destination folders.
    """
    table = pd.read_excel(excel_file)

    # Tally rows labelled 'MN', keyed by PDB ID in order of first appearance
    mn_counts = defaultdict(int)
    for entry_id, metal_name in zip(table['PDB ID'], table['Name of metal atoms']):
        if metal_name == 'MN':
            mn_counts[entry_id] += 1
    mn_counts_dict = dict(mn_counts)

    output_excel = os.path.join(pdb_directory, 'mn_count_results.xlsx')
    pd.DataFrame(
        list(mn_counts_dict.items()), columns=['PDB ID', 'Metal Count']
    ).to_excel(output_excel, index=False)
    print(f"Metal atom counts have been saved to '{output_excel}'.")

    # Copy each structure into the folder matching its Mn count
    for entry_id, n_mn in mn_counts_dict.items():
        source = os.path.join(pdb_directory, f"{entry_id}.pdb")
        if not os.path.isfile(source):
            print(f"Warning: PDB file {source} not found.")
            continue

        if n_mn == 1:
            folder_name = 'metal_count_1_mn2'
        elif n_mn > 1:
            folder_name = 'metal_count_greater_than_1_mn2'
        else:
            continue
        destination_dir = os.path.join(pdb_directory, folder_name)

        os.makedirs(destination_dir, exist_ok=True)
        shutil.copy(source, os.path.join(destination_dir, f"{entry_id}.pdb"))
        print(f"File '{entry_id}.pdb' copied to '{destination_dir}'.")

# Specify the directories
# This folder is scanned for '*.pdb' files; the Excel summaries and the
# 'metal_count_*_mn2' sub-folders are written inside it as well.
pdb_directory = 'L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/'

# Save categorized metal atoms to Excel and get the path of the Excel file
# (the returned path is the Mn workbook, 'mn_atoms2.xlsx')
mn_atoms_excel_file = save_metal_atoms_to_excel(pdb_directory)

# Count Mn atoms based on the Excel file and categorize PDB files
count_and_categorize_mn_atoms(mn_atoms_excel_file, pdb_directory)

In [None]:
#Classification of mononuclear from heteronuclear proteins
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser

def calculate_distance(atom1, atom2):
    """Return the straight-line distance between two atoms.

    Both arguments must expose a three-component ``coord`` attribute.
    """
    ax, ay, az = atom1.coord
    bx, by, bz = atom2.coord
    squared_length = (ax - bx)**2 + (ay - by)**2 + (az - bz)**2
    return squared_length**0.5

def categorize_pdb_files(directory, distance_threshold=5.0):
    """Sort single-Mn PDB files into 'mono' and 'mono-hetero' and save the result.

    A file is 'mono-hetero' when its only Mn atom has at least one other
    metal atom within ``distance_threshold``; otherwise it is 'mono'. Files
    that do not contain exactly one Mn atom are reported and skipped.
    Previously they fell through into the 'mono' list without any proximity
    check.

    Args:
        directory: Folder scanned for files ending in '.pdb'. It is also
            passed to ``save_results`` as the parent of the output folders.
        distance_threshold: Largest Mn-to-metal distance (in the units of the
            atom coordinates) that still counts as "near".
    """
    # Non-Mn elements treated as metals. 'CU' is included so this cell agrees
    # with the metal lists used by the later isolation cells of this file.
    # The former 'MN' entry was unreachable because Mn is handled first.
    other_metal_symbols = {'CU', 'ZN', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K',
                           'LI', 'CD', 'YB', 'NI', 'PR', 'HG'}

    mono_hetero_results = []
    mono_results = []

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        try:
            pdb_id = pdb_file.split('.')[0]
            structure = parser.get_structure(pdb_id, os.path.join(directory, pdb_file))

            # Store Mn and other metal atoms
            manganese_atoms = []
            other_metal_atoms = []
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element
                            if element_symbol == 'MN':
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metal_symbols:
                                other_metal_atoms.append(atom)

            # The mono / mono-hetero distinction is only defined for files
            # with exactly one Mn atom; anything else is left unclassified.
            if len(manganese_atoms) != 1:
                print(f"Skipping {pdb_file}: expected exactly 1 Mn atom, found {len(manganese_atoms)}.")
                continue

            manganese_atom = manganese_atoms[0]
            has_other_metals_near_manganese = any(
                calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                for metal_atom in other_metal_atoms
            )

            if has_other_metals_near_manganese:
                mono_hetero_results.append(pdb_id)
            else:
                mono_results.append(pdb_id)

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch
            print(f"Error processing {pdb_file}: {e}")

    # Save results to separate Excel files
    save_results(mono_hetero_results, mono_results, directory)

def save_results(mono_hetero_results, mono_results, directory):
    """Write the two ID lists to Excel and copy the matching PDB files.

    Creates ``mono_hetero2`` and ``mono2`` inside ``directory``, writes one
    workbook with a 'PDB ID' column into each, and copies ``<id>.pdb`` from
    ``directory`` into the folder of its category when that file exists.
    Any error is printed rather than raised.
    """
    output_dir_mono_hetero = os.path.join(directory, 'mono_hetero2')
    output_dir_mono = os.path.join(directory, 'mono2')

    # (ids, destination folder, workbook name) for each category
    categories = (
        (mono_hetero_results, output_dir_mono_hetero, 'mono_hetero2.xlsx'),
        (mono_results, output_dir_mono, 'mono2.xlsx'),
    )

    try:
        # Step 1: make sure both destination folders exist
        for _, target_dir, _ in categories:
            os.makedirs(target_dir, exist_ok=True)

        # Step 2: build both tables, then write them
        tables = [pd.DataFrame(ids, columns=['PDB ID']) for ids, _, _ in categories]
        for table, (_, target_dir, workbook) in zip(tables, categories):
            table.to_excel(os.path.join(target_dir, workbook), index=False)

        # Step 3: copy each structure next to its workbook
        for ids, target_dir, _ in categories:
            for pdb_id in ids:
                source = os.path.join(directory, f"{pdb_id}.pdb")
                if not os.path.isfile(source):
                    continue
                shutil.copy(source, os.path.join(target_dir, f"{pdb_id}.pdb"))

        print("PDB files have been copied to the corresponding folders.")
        print(f"Results categorized as 'mono_hetero' have been saved to '{output_dir_mono_hetero}/mono_hetero2.xlsx'.")
        print(f"Results categorized as 'mono' have been saved to '{output_dir_mono}/mono2.xlsx'.")

    except PermissionError as e:
        print(f"PermissionError: {e}")
    except Exception as e:
        print(f"Error creating directories or saving results: {e}")

# Specify the directory containing the PDB files
# NOTE(review): the counting cell earlier in this file copies single-Mn files
# into 'metal_count_1_mn2' under an 'L:' path, while this cell reads
# 'metal_count_1' under 'J:' — confirm this is the intended input folder.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_1/'

# Categorize and copy PDB files based on the presence of Mn and other metals within 5 Å
# (5.0 is the default distance_threshold of categorize_pdb_files)
categorize_pdb_files(pdb_directory)

In [None]:
#Extract the mononuclear with (M>1, extract from hetero and homonuclear proteins)
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser
import numpy as np

# Distance helper shared by the isolation check below
def calculate_distance(atom1, atom2):
    """Return the norm of the difference between the two atoms' coordinates.

    Both arguments must provide ``get_coord()`` returning values that can be
    subtracted from each other and passed to ``np.linalg.norm``.
    """
    offset = atom1.get_coord() - atom2.get_coord()
    return np.linalg.norm(offset)

# Function to find and categorize Mn atoms based on their proximity to other metal ions (including other Mn atoms)
def categorize_pdb_files(directory, distance_threshold=5.0):
    """Classify each PDB file by how many of its Mn atoms are isolated.

    An Mn atom is "isolated" when no other metal atom (another Mn included)
    lies within ``distance_threshold`` of it. Per file:

    * every Mn isolated (and at least one Mn present) -> 'all_isolated'
    * some but not all isolated                        -> 'mixed'
    * none isolated, or no Mn in the file at all       -> 'other'

    The three ID lists are handed to ``save_results`` together with
    ``directory``.

    Args:
        directory: Folder scanned for files ending in '.pdb'.
        distance_threshold: Largest metal-to-metal distance (in the units of
            the atom coordinates) that still counts as a neighbour.
    """
    manganese_ion = 'MN'
    # Compared against the *stripped* element symbol, so entries must not
    # carry padding: the former 'K ' could never match potassium.
    # 'MN' is not listed because Mn atoms are collected separately.
    other_metals = {'CU', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI', 'CD',
                    'YB', 'NI', 'PR', 'HG', 'ZN'}

    all_isolated_results = []
    mixed_results = []
    other_results = []

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        try:
            pdb_id = pdb_file.split('.')[0]
            structure = parser.get_structure(pdb_id, os.path.join(directory, pdb_file))
            manganese_atoms = []
            other_metal_atoms = []

            # Collect all Mn atoms and other metal atoms in the structure
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element.strip()
                            if element_symbol == manganese_ion:
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metals:
                                other_metal_atoms.append(atom)

            # Count the Mn atoms that have no metal neighbour within range
            all_metal_atoms = manganese_atoms + other_metal_atoms
            isolated_mn_count = 0
            for manganese_atom in manganese_atoms:
                has_neighbour = any(
                    metal_atom != manganese_atom  # skip the Mn atom itself
                    and calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                    for metal_atom in all_metal_atoms
                )
                if not has_neighbour:
                    isolated_mn_count += 1

            # Classify the file based on the number of isolated Mn atoms.
            # The explicit emptiness check keeps a file without any Mn from
            # being reported as "all isolated" (0 == 0).
            if manganese_atoms and isolated_mn_count == len(manganese_atoms):
                all_isolated_results.append(pdb_id)
                print(f"{pdb_id} classified as all_isolated.")
            elif isolated_mn_count > 0:
                mixed_results.append(pdb_id)
                print(f"{pdb_id} classified as mixed.")
            else:
                other_results.append(pdb_id)
                print(f"{pdb_id} classified as other.")

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch
            print(f"Error processing {pdb_file}: {e}")

    # Save results
    save_results(all_isolated_results, mixed_results, other_results, directory)

# Persist the three ID lists and copy the matching structures
def save_results(all_isolated_results, mixed_results, other_results, directory):
    """Write each category to its own folder as a workbook plus PDB copies.

    Inside ``directory`` the folders ``all_isolated``, ``mixed`` and ``other``
    are created. Each receives ``<folder>.xlsx`` with a single 'PDB ID' column
    and a copy of every ``<id>.pdb`` that exists in ``directory``.
    """
    output_dir_all_isolated = os.path.join(directory, 'all_isolated')
    output_dir_mixed = os.path.join(directory, 'mixed')
    output_dir_other = os.path.join(directory, 'other')

    # (ids, destination folder, workbook name) for each category
    categories = (
        (all_isolated_results, output_dir_all_isolated, 'all_isolated.xlsx'),
        (mixed_results, output_dir_mixed, 'mixed.xlsx'),
        (other_results, output_dir_other, 'other.xlsx'),
    )

    # All folders first ...
    for _, target_dir, _ in categories:
        os.makedirs(target_dir, exist_ok=True)

    # ... then all workbooks ...
    for ids, target_dir, workbook in categories:
        table = pd.DataFrame(ids, columns=['PDB ID'])
        table.to_excel(os.path.join(target_dir, workbook), index=False)

    # ... and finally the structure files that are actually present
    for ids, target_dir, _ in categories:
        for pdb_id in ids:
            source = os.path.join(directory, f"{pdb_id}.pdb")
            if not os.path.isfile(source):
                continue
            shutil.copy(source, target_dir)

    print(f"Results saved: {len(all_isolated_results)} in 'all_isolated', {len(mixed_results)} in 'mixed', {len(other_results)} in 'other'.")

# Main script execution
# Folder of PDB files to classify; the 'all_isolated', 'mixed' and 'other'
# sub-folders are created inside it by save_results.
# NOTE(review): the counting cell earlier in this file writes to
# 'metal_count_greater_than_1_mn2' under 'L:' — confirm this 'J:' folder is
# the intended input.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/'
categorize_pdb_files(pdb_directory)

In [None]:
#Extract the mononuclear binding sites from homo and hetero nuclear
import os
import shutil
import pandas as pd
from Bio.PDB import PDBParser, PDBIO, Select
import numpy as np

# Distance helper used when deciding whether an Mn atom is isolated
def calculate_distance(atom1, atom2):
    """Return the norm of the difference between the two atoms' coordinates.

    Both arguments must provide ``get_coord()`` returning values that can be
    subtracted from each other and passed to ``np.linalg.norm``.
    """
    separation = atom1.get_coord() - atom2.get_coord()
    return np.linalg.norm(separation)

# Function to process mixed PDB files and remove non-isolated Mn atoms
def process_mixed_pdb_files(directory, distance_threshold=5.0):
    """Rewrite each PDB file so that only its isolated Mn atoms remain.

    An Mn atom is "isolated" when no other metal atom (another Mn included)
    lies within ``distance_threshold`` of it. When a file has at least one
    isolated Mn, the structure is saved to ``<pdb_id>.pdb`` inside
    ``directory`` through ``IsolatedMnSelector``, which keeps all non-Mn
    atoms and the isolated Mn atoms. ``pdb_id`` is the file name up to the
    first dot, so for an ordinary ``<id>.pdb`` input the file is overwritten
    in place. Files without an isolated Mn are left untouched.

    Args:
        directory: Folder scanned for files ending in '.pdb'.
        distance_threshold: Largest metal-to-metal distance (in the units of
            the atom coordinates) that still counts as a neighbour.
    """
    manganese_ion = 'MN'
    # Compared against the *stripped* element symbol, so entries must not
    # carry padding: the former 'K ' could never match potassium.
    # 'MN' is not listed because Mn atoms are collected separately.
    other_metals = {'CU', 'MG', 'CO', 'CA', 'FE', 'PT', 'NA', 'K', 'LI', 'CD',
                    'YB', 'NI', 'PR', 'HG', 'ZN'}

    pdb_files = [f for f in os.listdir(directory) if f.endswith('.pdb')]
    parser = PDBParser(QUIET=True)

    for pdb_file in pdb_files:
        pdb_id = pdb_file.split('.')[0]
        pdb_file_path = os.path.join(directory, pdb_file)
        try:
            structure = parser.get_structure(pdb_id, pdb_file_path)
            manganese_atoms = []
            other_metal_atoms = []

            # Collect all Mn atoms and other metal atoms in the structure
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            element_symbol = atom.element.strip()
                            if element_symbol == manganese_ion:
                                manganese_atoms.append(atom)
                            elif element_symbol in other_metals:
                                other_metal_atoms.append(atom)

            # Keep the Mn atoms that have no metal neighbour within range
            all_metal_atoms = manganese_atoms + other_metal_atoms
            isolated_manganese_atoms = []
            for manganese_atom in manganese_atoms:
                has_neighbour = any(
                    metal_atom != manganese_atom  # skip the Mn atom itself
                    and calculate_distance(manganese_atom, metal_atom) <= distance_threshold
                    for metal_atom in all_metal_atoms
                )
                if not has_neighbour:
                    isolated_manganese_atoms.append(manganese_atom)

            # Save PDB file with only isolated Mn atoms
            if isolated_manganese_atoms:
                io = PDBIO()
                io.set_structure(structure)
                io.save(os.path.join(directory, f"{pdb_id}.pdb"), select=IsolatedMnSelector(isolated_manganese_atoms))

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch
            print(f"Error processing {pdb_file}: {e}")

# Output filter: drops every Mn atom that is not in the isolated list
class IsolatedMnSelector(Select):
    """Selection rule that keeps non-Mn atoms and the listed Mn atoms."""

    def __init__(self, isolated_atoms):
        # Atoms that must survive even though their element is Mn
        self.isolated_atoms = isolated_atoms

    def accept_atom(self, atom):
        """Return True when ``atom`` should be written to the output file."""
        if atom in self.isolated_atoms:
            return True
        return atom.element.strip() != 'MN'

# Main script execution
# Folder whose '.pdb' files are processed; for each file that has at least
# one isolated Mn atom, the output is written into this same folder as
# '<pdb_id>.pdb'.
pdb_directory = 'J:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/'
process_mixed_pdb_files(pdb_directory)


In [None]:
#split 3(final): Filter out coordination # 3 with distance parameter from metalPDB
import os
import pandas as pd
from Bio import PDB
import numpy as np

# Define the set of allowed residues
# Only residues whose name is in this set are examined as possible Mn ligands
# by calculate_coordination_info; every other residue is skipped.
# NOTE(review): ILE, LEU, PRO and TRP are not listed — confirm the omission is intentional.
allowed_residues = {'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'LYS', 'MET', 'PHE', 'SER', 'THR', 'TYR', 'VAL'}

# Maps (residue name, atom name) to the key under which the distance window
# for that donor atom is stored in the thresholds dictionary. Atom/residue
# pairs that are not listed are never treated as Mn ligands.
_MN_DONOR_THRESHOLD_KEYS = {
    ('HIS', 'ND1'): 'HIS_ND1',
    ('HIS', 'NE2'): 'HIS_NE2',
    ('HIS', 'N'): 'HIS_N',
    ('HIS', 'O'): 'HIS_O',
    ('GLU', 'N'): 'GLU_N',
    ('GLU', 'O'): 'GLU_O',
    ('GLU', 'OE1'): 'GLU_OE',
    ('GLU', 'OE2'): 'GLU_OE',
    ('ASP', 'N'): 'ASP_N',
    ('ASP', 'O'): 'ASP_O',
    ('ASP', 'OD1'): 'ASP_OD',
    ('ASP', 'OD2'): 'ASP_OD',
    ('ALA', 'N'): 'ALA_N',
    ('ALA', 'O'): 'ALA_O',
    ('ARG', 'N'): 'ARG_N',
    ('ARG', 'O'): 'ARG_O',
    ('ARG', 'NH1'): 'ARG_NH1',
    ('ARG', 'NH2'): 'ARG_NH2',
    ('ARG', 'NE'): 'ARG_NE',
    ('ASN', 'N'): 'ASN_N',
    ('ASN', 'O'): 'ASN_O',
    ('ASN', 'OD1'): 'ASN_OD1',
    ('ASN', 'ND2'): 'ASN_ND2',
    ('GLN', 'N'): 'GLN_N',
    ('GLN', 'O'): 'GLN_O',
    ('GLN', 'OE1'): 'GLN_OE1',
    ('GLN', 'NE2'): 'GLN_NE2',
    ('LYS', 'N'): 'LYS_N',
    ('LYS', 'O'): 'LYS_O',
    ('LYS', 'NZ'): 'LYS_NZ',
    ('SER', 'N'): 'SER_N',
    ('SER', 'O'): 'SER_O',
    ('SER', 'OG'): 'SER_OG',
    ('THR', 'N'): 'THR_N',
    ('THR', 'O'): 'THR_O',
    ('THR', 'OG1'): 'THR_OG1',
    ('TYR', 'N'): 'TYR_N',
    ('TYR', 'O'): 'TYR_O',
    ('TYR', 'OH'): 'TYR_OH',
    ('GLY', 'N'): 'GLY_N',
    ('GLY', 'O'): 'GLY_O',
    ('CYS', 'SG'): 'CYS',
    ('MET', 'SD'): 'MET',
}


def calculate_coordination_info(structure, mn_distance_thresholds, allowed_residues, pdb_id):
    """Find, for every Mn residue, the protein residues that coordinate it.

    For each residue named 'MN', every atom of every allowed residue in the
    structure is tested: the atom coordinates the Mn when its distance falls
    inside the ``(min, max)`` window configured for that residue/atom pair.

    Args:
        structure: Parsed structure; iterated as models -> chains ->
            residues -> atoms and expected to carry ``header['idcode']``.
        mn_distance_thresholds: Dict of ``(min, max)`` distance windows keyed
            by names such as 'HIS_ND1', 'GLU_OE', 'CYS'.
        allowed_residues: Residue names that may act as ligands.
        pdb_id: Identifier stored in the first column of every output row.

    Returns:
        ``(coordination_info, distances_info)`` where

        * ``coordination_info`` maps the number of coordinating residues of
          an Mn site to a list of row tuples ``(pdb_id, idcode, Mn chain,
          Mn residue number, Mn residue name, chain, residue number, residue
          name, binding atom 1[, 2[, 3]])``;
        * ``distances_info`` lists one tuple per coordinating atom:
          ``(pdb_id, Mn chain, Mn residue number, Mn residue name, chain,
          residue number, residue name, atom name, distance)``.

    Raises:
        KeyError: If a required key is missing from ``mn_distance_thresholds``
            or the Mn residue has no atom named 'MN'.
    """
    coordination_info = {}
    distances_info = []

    def calculate_distance(atom1, atom2):
        # Calculate the Euclidean distance between two atoms
        x1, y1, z1 = atom1.get_coord()
        x2, y2, z2 = atom2.get_coord()
        return ((x1 - x2)**2 + (y1 - y2)**2 + (z1 - z2)**2)**0.5

    for mn_model in structure:
        for mn_chain in mn_model:
            for mn_residue in mn_chain:
                if mn_residue.get_resname() != 'MN':
                    continue

                mn_atom = mn_residue['MN']
                mn_chain_id = mn_chain.get_id()
                mn_resnum = mn_residue.get_id()[1]
                mn_resname = mn_residue.get_resname()

                # A dict is used as an insertion-ordered set so duplicate rows
                # collapse while the output order stays reproducible.
                rows_for_mn = {}

                # NOTE(review): ligands are searched in *all* models, not only
                # in the model holding this Mn — confirm for multi-model files.
                for model in structure:
                    for chain in model:
                        for residue in chain:
                            resname = residue.get_resname()
                            if resname not in allowed_residues:
                                continue

                            resnum = residue.get_id()[1]
                            coord_atoms = []

                            for atom in residue:
                                atom_name = atom.id
                                threshold_key = _MN_DONOR_THRESHOLD_KEYS.get((resname, atom_name))
                                if threshold_key is None:
                                    # No window configured: only an atom sitting
                                    # exactly on the Mn could ever qualify.
                                    min_distance, max_distance = 0, 0
                                else:
                                    min_distance, max_distance = mn_distance_thresholds[threshold_key]

                                distance = calculate_distance(mn_atom, atom)
                                if min_distance <= distance <= max_distance:
                                    coord_atoms.append((atom_name, resnum, distance))
                                    distances_info.append((pdb_id, mn_chain_id, mn_resnum, mn_resname,
                                                           chain.get_id(), resnum, resname, atom_name, distance))

                            if not coord_atoms:
                                continue

                            # Keep at most the three closest atoms. The sort key
                            # is the distance (atom name breaks ties): sorting
                            # by residue number, which is identical for every
                            # atom of one residue, left the choice and order
                            # to chance.
                            closest = sorted(coord_atoms, key=lambda item: (item[2], item[0]))[:3]
                            coord_residue_info = (
                                pdb_id,
                                structure.header['idcode'],
                                mn_chain_id,
                                mn_resnum,
                                mn_resname,
                                chain.get_id(),
                                resnum,
                                resname,
                            ) + tuple(item[0] for item in closest)
                            rows_for_mn[coord_residue_info] = None

                # Group this Mn site's rows under its coordination number
                coordination_info.setdefault(len(rows_for_mn), []).extend(rows_for_mn)

    return coordination_info, distances_info


# Directory containing PDB files
pdb_directory = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/"

# Distance windows (min, max) used to decide whether a donor atom coordinates
# a manganese ion. Keys are '<RESIDUE>_<ATOM>'; the exceptions are 'GLU_OE' /
# 'ASP_OD' (shared by both carboxylate oxygens) and 'CYS' / 'MET' (the SG / SD
# sulfur), as looked up by calculate_coordination_info.
# Values are presumably in Å, the unit of the parsed coordinates — TODO confirm.
mn_distance_thresholds = {
    'HIS_ND1': (0, 2.75),
    'HIS_NE2': (0, 2.75),
    'HIS_N': (0, 2.75),
    'HIS_O': (0, 2.85),
    'GLU_OE': (0, 2.85),
    'ASP_OD': (0, 2.85),
    'ASP_N': (0, 2.75),
    'ASP_O': (0, 2.85),
    'GLU_N': (0, 2.75),
    'GLU_O': (0, 2.85),
    'ALA_N': (0, 2.75),
    'ALA_O': (0, 2.85),
    'CYS': (0, 2.75),
    'MET': (0, 2.75),
    'ARG_NH1': (0, 2.75),
    'ARG_NH2': (0, 2.75),
    'ARG_N': (0, 2.75),
    'ARG_O': (0, 2.85),
    'ARG_NE': (0, 2.75),
    'ASN_OD1': (0, 2.85),
    'ASN_ND2': (0, 2.75),
    'ASN_N': (0, 2.75),
    'ASN_O': (0, 2.85),
    'GLN_OE1': (0, 2.85),
    'GLN_NE2': (0, 2.75),
    'GLN_O': (0, 2.85),
    'GLN_N': (0, 2.75),
    'GLY_N': (0, 2.75),
    'GLY_O': (0, 2.85),
    'LYS_NZ': (0, 2.75),
    'LYS_N': (0, 2.75),
    'LYS_O': (0, 2.85),
    'SER_OG': (0, 2.85),
    'SER_N': (0, 2.75),
    'SER_O': (0, 2.85),
    'THR_OG1': (0, 2.85),
    'THR_N': (0, 2.75),
    'THR_O': (0, 2.85),
    'TYR_OH': (0, 2.85),
    'TYR_N': (0, 2.75),
    'TYR_O': (0, 2.85),
}

# Initialize a dictionary to store coordination information for different coordination numbers
# (coordination number -> list of row tuples, accumulated over all files)
coordination_info_dict = {}
# One tuple per coordinating atom, accumulated over all files
distances_info_list = []

# A single parser instance is reused for every file
parser = PDB.PDBParser(QUIET=True)

# Loop through each PDB file in the specified directory
for filename in os.listdir(pdb_directory):
    if not filename.endswith('.pdb'):
        continue

    pdb_file_path = os.path.join(pdb_directory, filename)
    pdb_id = filename.split('.')[0]  # Extract the PDB ID from the filename

    # Parse the PDB file
    structure = parser.get_structure('protein', pdb_file_path)

    # Calculate coordination information for the current PDB file
    coordination_info, distances_info = calculate_coordination_info(structure, mn_distance_thresholds, allowed_residues, pdb_id)

    # Merge this file's rows into the per-coordination-number accumulator
    for coordination_number, info_list in coordination_info.items():
        coordination_info_dict.setdefault(coordination_number, []).extend(info_list)

    # Add the distances information to the distances_info_list
    distances_info_list.extend(distances_info)

# Columns shared by every row; binding-atom columns are appended per workbook
base_columns = ['Entry ID', 'PDB ID', 'Metal Chain ID', 'Metal Residue number', 'Metal', 'Chain ID', 'Residue number', 'Residue name']

# Create and save Excel files for all coordination numbers
for coordination_number, info_list in coordination_info_dict.items():
    # Rows carry 1-3 binding atoms after the fixed columns; size the header
    # for the longest row so shorter rows are padded with missing values.
    max_columns = max((len(row) for row in info_list), default=0)
    columns = base_columns + [
        f'Binding atom{i}' for i in range(1, max_columns - len(base_columns) + 1)
    ]

    df_coordination = pd.DataFrame(info_list, columns=columns).fillna(np.nan)

    # Written next to the input files; derived from pdb_directory so the two
    # locations cannot drift apart.
    output_excel_path = os.path.join(pdb_directory, f'new_241125_{coordination_number}.xlsx')
    df_coordination.to_excel(output_excel_path, index=False)

print('Data saved to properly formatted Excel files for all coordination numbers.')

Data saved to properly formatted Excel files for all coordination numbers.


In [None]:
#sequence extraction
import pandas as pd
from Bio import SeqIO

import os  # os.path.join is used below; imported here so this cell runs on its own

input_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/new_3.xlsx"  # Path to the input Excel file containing PDB IDs and Chain IDs
pdb_directory = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/"  # Directory where the PDB files are located
output_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/new_3_sequence.xlsx"  # Path to the output Excel file to save the sequences

# Read the input Excel file
input_df = pd.read_excel(input_file)

# Initialize a list to store sequences
sequences_data = []

# Cache of {pdb_path: {chain_id: [sequence, ...]}}. The input table can list the
# same PDB file on many rows, so each file is parsed once instead of once per row.
chain_sequences_by_file = {}

# Iterate through each row in the input DataFrame
for _, row in input_df.iterrows():
    pdb_id = row['Entry ID']
    chain_id = row['Chain ID']

    # Construct the filename of the PDB file
    pdb_file = f"{pdb_id}.pdb"
    pdb_path = os.path.join(pdb_directory, pdb_file)

    try:
        if pdb_path not in chain_sequences_by_file:
            chain_sequences = {}
            for record in SeqIO.parse(pdb_path, "pdb-atom"):
                chain_sequences.setdefault(record.annotations.get("chain"), []).append(str(record.seq))
            # Cache only after a complete parse so a failing file is retried
            # (and reported again) for every row that references it.
            chain_sequences_by_file[pdb_path] = chain_sequences
    except Exception as e:
        # Best effort: report the problem and move on; the row gets no output entry.
        print(f"Error processing {pdb_file}: {e}")
        continue

    # One output row per record whose chain annotation matches the requested chain
    matching_sequences = chain_sequences_by_file[pdb_path].get(chain_id, [])
    if matching_sequences:
        sequences_data.extend(
            {"PDB ID": pdb_id, "Chain ID": chain_id, "Sequence": sequence}
            for sequence in matching_sequences
        )
    else:
        sequences_data.append({"PDB ID": pdb_id, "Chain ID": chain_id, "Sequence": "No sequence found"})

# Convert to DataFrame and save to Excel
output_df = pd.DataFrame(sequences_data)
output_df.to_excel(output_file, index=False)
print("Sequences saved to", output_file)

In [None]:
#Remove sequence redundancy within each PDB ID and set aside sequences containing the letter 'X' (missing amino acids)
import pandas as pd
from Bio import pairwise2
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Workbook produced by the sequence-extraction step
input_file = 'L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/new_3_sequence.xlsx'

# Load it and force every Sequence cell to str so the substring test below is always valid
df = pd.read_excel(input_file)
df["Sequence"] = df["Sequence"].astype(str)

# Result buckets: unique sequences, unique 'X'-containing sequences, and within-PDB duplicates
filtered_sequences = []
mysterious_no_redundancy = []
excluded_sequences = []
similar_sequences = []

# Step 1: within each PDB ID keep the first occurrence of every sequence;
# sequences containing 'X' are collected separately
for pdb_id, group in df.groupby("PDB ID"):
    seen_sequences = set()    # 'X'-free sequences already kept for this PDB ID
    seen_x_sequences = set()  # 'X'-containing sequences already kept for this PDB ID

    for chain_id, sequence in zip(group["Chain ID"], group["Sequence"]):
        entry = (pdb_id, chain_id, sequence)

        if 'X' in sequence:
            # Only the first copy of each distinct 'X' sequence is recorded;
            # repeats are dropped without being listed anywhere
            if sequence not in seen_x_sequences:
                seen_x_sequences.add(sequence)
                mysterious_no_redundancy.append(entry)
        elif sequence in seen_sequences:
            # Same sequence already kept for this PDB ID: record it as redundant
            excluded_sequences.append(entry)
        else:
            seen_sequences.add(sequence)
            filtered_sequences.append(entry)

# All three output tables share this layout
sequence_columns = ["PDB ID", "Chain ID", "Sequence"]

# Unique 'X'-free sequences (first occurrence per PDB ID)
df_filtered_sequences = pd.DataFrame(filtered_sequences, columns=sequence_columns)
filtered_sequence_output = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_sequence3.xlsx"
df_filtered_sequences.to_excel(filtered_sequence_output, index=False)
print(f"Filtered sequences saved to {filtered_sequence_output}")

# Non-redundant 'X'-containing sequences
df_mysterious_no_redundancy = pd.DataFrame(mysterious_no_redundancy, columns=sequence_columns)
mysterious_no_redundancy_output = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/mysterious_no_redundancy3.xlsx"
df_mysterious_no_redundancy.to_excel(mysterious_no_redundancy_output, index=False)
print(f"Non-redundant 'X' containing sequences saved to {mysterious_no_redundancy_output}")

# Redundant sequences that were excluded
df_excluded_sequences = pd.DataFrame(excluded_sequences, columns=sequence_columns)
same_pdb_output = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed//same.xlsx"
df_excluded_sequences.to_excel(same_pdb_output, index=False)
print(f"Excluded redundant sequences saved to {same_pdb_output}")

In [None]:
import pandas as pd

# Workbook holding the non-redundant sequences that contain 'X'
file_path = 'L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/mysterious_no_redundancy3.xlsx'
df = pd.read_excel(file_path)

# Delete every literal 'X' character from the 'Sequence' column (plain text match, no regex)
sequence_column = df['Sequence']
df['Sequence'] = sequence_column.str.replace('X', '', regex=False)

# Write the cleaned table to a new workbook
output_file_path = 'L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/mysterious_no_X.xlsx'
df.to_excel(output_file_path, index=False)

# Report where the result was written
print(f"Modified file saved at: {output_file_path}")


Modified file saved at: J:/Zn-installer_rawdata/241111_Mn_Final/Sequence/combined_3mysterious_no_X.xlsx


In [None]:
#Remove the sequence redundancy
import pandas as pd
from Bio import pairwise2
import time

# Workbook with one unique sequence per PDB ID / chain
input_file = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_sequence3.xlsx"

# Load the table; sequences are forced to str before alignment
df = pd.read_excel(input_file)
df["Sequence"] = df["Sequence"].astype(str)

# Plain-list copies of the two columns
pdb_ids = df['PDB ID'].tolist()
sequences = df['Sequence'].tolist()

# Result containers
unique_sequences = []    # (PDB ID, sequence) pairs kept as representatives
excluded_sequences = []  # (PDB ID, sequence) pairs rejected as too similar
similarity_records = []  # details of the match that caused each rejection

# Timer used for the progress estimate
start_time = time.time()

# Percent identity at or above which a sequence counts as redundant (adjust as needed)
threshold = 50.0

total_rows = len(df)

# Greedy pass: each sequence is compared with the representatives kept so far
for i, pdb_id, sequence in zip(df.index, df["PDB ID"], df["Sequence"]):
    for unique_pdb_id, unique_sequence in unique_sequences:
        # Global alignment; only the first reported alignment is used
        best = pairwise2.align.globalxx(sequence, unique_sequence, one_alignment_only=True)[0]
        aligned_query, aligned_reference = best[0], best[1]

        # Percent identity = identical aligned positions / alignment length
        num_identical = sum(a == b for a, b in zip(aligned_query, aligned_reference))
        percent_identity = (num_identical / len(aligned_query)) * 100

        if percent_identity >= threshold:
            # First sufficiently similar representative wins; stop comparing
            excluded_sequences.append((pdb_id, sequence))
            similarity_records.append((pdb_id, sequence, unique_pdb_id, unique_sequence, percent_identity))
            print(f"PDB ID {pdb_id} has high percent identity with PDB ID {unique_pdb_id}. Sequence excluded from alignment.")
            break
    else:
        # No representative reached the threshold: keep this sequence as a new representative
        unique_sequences.append((pdb_id, sequence))
        print(f"PDB ID {pdb_id} is unique. Sequence saved.")

    # Linear extrapolation of the remaining run time from the average time per row
    elapsed_time = time.time() - start_time
    remaining_time = (elapsed_time / (i + 1)) * (total_rows - i - 1)

    print(f"Progress: {i+1}/{total_rows} | Elapsed Time: {elapsed_time:.2f}s | Remaining Time: {remaining_time:.2f}s")

# Tabulate the three result lists
pair_columns = ["PDB ID", "Sequence"]
df_unique = pd.DataFrame(unique_sequences, columns=pair_columns)
df_excluded = pd.DataFrame(excluded_sequences, columns=pair_columns)
df_similarity = pd.DataFrame(
    similarity_records,
    columns=["PDB ID", "Sequence", "Similar to PDB ID", "Similar Sequence", "Percent Identity"],
)

# Output workbooks
output_file_unique = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_unique.xlsx"
output_file_excluded = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_exclude.xlsx"
output_file_similarity = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_details.xlsx"

# Representatives, rejected sequences, then the per-rejection similarity details
df_unique.to_excel(output_file_unique, index=False)
df_excluded.to_excel(output_file_excluded, index=False)
df_similarity.to_excel(output_file_similarity, index=False)

print("Sequence identity analysis completed and results saved.")

In [None]:
#Copy the PDB file based on unique_sequence
import pandas as pd
import shutil
import os

# Path to the input Excel file of unique sequences (read below for its 'PDB ID' column)
input_file_unique = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/filtered_unique.xlsx"

# Path to the source directory containing PDB files
source_dir = "L:/Zn-installer_rawdata/241111_Mn_Final/Final_3/metal_count_greater_than_1/mixed/"

# Path to the destination directory that receives the copied PDB files
destination_dir_unique = "L:/Zn-installer_rawdata/241111_Mn_Final//Final/"

# Create the destination directory if it does not exist
os.makedirs(destination_dir_unique, exist_ok=True)

# Function to copy PDB files based on a DataFrame
def copy_pdb_files(df, destination_dir, source_directory=None):
    """Copy ``<PDB ID>.pdb`` into ``destination_dir`` for every ID in ``df['PDB ID']``.

    Args:
        df: DataFrame with a 'PDB ID' column.
        destination_dir: Directory that receives the copies.
        source_directory: Directory searched for the ``.pdb`` files. When omitted,
            the module-level ``source_dir`` is used, as before.

    Returns:
        The PDB IDs whose file was not found in the source directory, in
        first-seen order.
    """
    if source_directory is None:
        source_directory = source_dir

    missing_ids = []
    # dict.fromkeys drops repeated IDs while keeping first-seen order, so a PDB
    # file listed on several rows is copied (and reported) only once.
    for pdb_id in dict.fromkeys(df['PDB ID'].tolist()):
        source_file = os.path.join(source_directory, f"{pdb_id}.pdb")
        destination_file = os.path.join(destination_dir, f"{pdb_id}.pdb")
        if os.path.exists(source_file):
            shutil.copy(source_file, destination_file)
            print(f"Copied: {pdb_id}.pdb to {destination_dir}")
        else:
            print(f"File not found: {pdb_id}.pdb")
            missing_ids.append(pdb_id)
    return missing_ids

# Read the Excel file of unique sequences; only its 'PDB ID' column is used by copy_pdb_files
df_unique = pd.read_excel(input_file_unique)


# Copy the PDB file of every listed PDB ID from source_dir into destination_dir_unique
copy_pdb_files(df_unique, destination_dir_unique)

In [None]:
#split 3(final): Filter out coordination # 3 with distance parameter from metalPDB
import os
import pandas as pd
from Bio import PDB
import numpy as np

# Residue names considered as possible metal ligands; calculate_coordination_info
# skips every residue whose name is not in this set
allowed_residues = set(
    "ALA ARG ASN ASP CYS GLN GLU GLY HIS LYS MET PHE SER THR TYR VAL".split()
)

# Maps (residue name, atom name) to the key under which the distance window for
# that donor atom is stored in the thresholds dictionary. Atoms not listed here
# get the window (0, 0).
_DONOR_THRESHOLD_KEYS = {
    ('HIS', 'ND1'): 'HIS_ND1', ('HIS', 'NE2'): 'HIS_NE2', ('HIS', 'N'): 'HIS_N', ('HIS', 'O'): 'HIS_O',
    ('GLU', 'N'): 'GLU_N', ('GLU', 'O'): 'GLU_O', ('GLU', 'OE1'): 'GLU_OE', ('GLU', 'OE2'): 'GLU_OE',
    ('ASP', 'N'): 'ASP_N', ('ASP', 'O'): 'ASP_O', ('ASP', 'OD1'): 'ASP_OD', ('ASP', 'OD2'): 'ASP_OD',
    ('ALA', 'N'): 'ALA_N', ('ALA', 'O'): 'ALA_O',
    ('ARG', 'N'): 'ARG_N', ('ARG', 'O'): 'ARG_O',
    ('ARG', 'NH1'): 'ARG_NH1', ('ARG', 'NH2'): 'ARG_NH2', ('ARG', 'NE'): 'ARG_NE',
    ('ASN', 'N'): 'ASN_N', ('ASN', 'O'): 'ASN_O', ('ASN', 'OD1'): 'ASN_OD1', ('ASN', 'ND2'): 'ASN_ND2',
    ('GLN', 'N'): 'GLN_N', ('GLN', 'O'): 'GLN_O', ('GLN', 'OE1'): 'GLN_OE1', ('GLN', 'NE2'): 'GLN_NE2',
    ('LYS', 'N'): 'LYS_N', ('LYS', 'O'): 'LYS_O', ('LYS', 'NZ'): 'LYS_NZ',
    ('SER', 'N'): 'SER_N', ('SER', 'O'): 'SER_O', ('SER', 'OG'): 'SER_OG',
    ('THR', 'N'): 'THR_N', ('THR', 'O'): 'THR_O', ('THR', 'OG1'): 'THR_OG1',
    ('TYR', 'N'): 'TYR_N', ('TYR', 'O'): 'TYR_O', ('TYR', 'OH'): 'TYR_OH',
    ('GLY', 'N'): 'GLY_N', ('GLY', 'O'): 'GLY_O',
    ('CYS', 'SG'): 'CYS',
    ('MET', 'SD'): 'MET',
}


def _atom_distance(atom1, atom2):
    """Return the Euclidean distance between the coordinates of two atoms."""
    x1, y1, z1 = atom1.get_coord()
    x2, y2, z2 = atom2.get_coord()
    return ((x1 - x2)**2 + (y1 - y2)**2 + (z1 - z2)**2)**0.5


def _distance_window(resname, atom_name, thresholds):
    """Return the (min, max) distance window for an atom; (0, 0) if it is not a listed donor."""
    threshold_key = _DONOR_THRESHOLD_KEYS.get((resname, atom_name))
    if threshold_key is None:
        return 0, 0
    return thresholds[threshold_key]


def calculate_coordination_info(structure, mn_distance_thresholds, allowed_residues, pdb_id):
    """Collect the residues whose donor atoms lie within range of each MN residue.

    For every residue named 'MN' in ``structure``, every residue whose name is
    in ``allowed_residues`` is scanned and each atom is tested against the
    distance window configured for its (residue, atom) type.

    Args:
        structure: Parsed structure, iterated as models -> chains -> residues ->
            atoms; ``structure.header['idcode']`` is copied into each result row.
        mn_distance_thresholds: Mapping from threshold key (e.g. 'HIS_ND1',
            'GLU_OE', 'CYS') to a ``(min_distance, max_distance)`` pair.
        allowed_residues: Container of residue names eligible as ligands.
        pdb_id: Identifier stored as the first field of every result row.

    Returns:
        A ``(coordination_info, distances_info)`` pair.

        ``coordination_info`` maps the number of coordinating residues found for
        an MN to a list of tuples ``(pdb_id, idcode, metal chain id, metal
        residue number, metal residue name, chain id, residue number, residue
        name, binding atom name, ...)`` with at most three binding atom names.
        An MN with no contacts still creates the key ``0`` (with no rows).

        ``distances_info`` is a list of tuples ``(pdb_id, metal chain id, metal
        residue number, metal residue name, chain id, residue number, residue
        name, atom name, distance)``, one per atom within range.

    Raises:
        KeyError: If ``mn_distance_thresholds`` is a dict lacking the key of a
            donor atom that is encountered.
    """
    coordination_info = {}
    distances_info = []

    for mn_model in structure:
        for mn_chain in mn_model:
            for mn_residue in mn_chain:
                if mn_residue.get_resname() != 'MN':
                    continue

                mn_atom = mn_residue['MN']
                coordination_info_for_mn = set()

                # NOTE(review): ligands are searched in every model, not only in
                # the model that contains this MN (behaviour kept from the
                # original) -- confirm this is intended for multi-model files.
                for model in structure:
                    for chain in model:
                        for residue in chain:
                            resname = residue.get_resname()
                            if resname not in allowed_residues:
                                continue

                            # Unique (atom name, residue number, distance) contacts of this residue
                            coord_atoms = set()

                            for atom in residue:
                                atom_name = atom.id
                                min_distance, max_distance = _distance_window(
                                    resname, atom_name, mn_distance_thresholds
                                )
                                distance = _atom_distance(mn_atom, atom)

                                if min_distance <= distance <= max_distance:
                                    coord_atoms.add((atom_name, residue.get_id()[1], distance))
                                    distances_info.append((pdb_id, mn_chain.get_id(), mn_residue.get_id()[1], mn_residue.get_resname(),
                                                          chain.get_id(), residue.get_id()[1], residue.get_resname(), atom_name, distance))

                            if not coord_atoms:
                                continue

                            # Nearest atoms first (atom name breaks ties). The previous
                            # sort key was the residue number, which is identical for
                            # all atoms of one residue, so the reported order and the
                            # atoms kept by [:3] depended on set iteration order.
                            sorted_coord_atoms = sorted(coord_atoms, key=lambda x: (x[2], x[0]))[:3]

                            coord_residue_info = (
                                pdb_id,
                                structure.header['idcode'],
                                mn_chain.get_id(),
                                mn_residue.get_id()[1],
                                mn_residue.get_resname(),
                                chain.get_id(),
                                residue.get_id()[1],
                                residue.get_resname(),
                            ) + tuple(coord_atom_info[0] for coord_atom_info in sorted_coord_atoms)

                            coordination_info_for_mn.add(coord_residue_info)

                # Group this MN's rows under the number of coordinating residues found
                coordinated_residues_count = len(coordination_info_for_mn)
                coordination_info.setdefault(coordinated_residues_count, []).extend(coordination_info_for_mn)

    return coordination_info, distances_info


# Folder holding the PDB files analysed by this cell
pdb_directory = "L:/Zn-installer_rawdata/241111_Mn_Final//Final/"

# Accepted (min, max) metal-to-atom distance window for manganese coordination,
# keyed by donor atom type. The keys are the ones looked up by
# calculate_coordination_info ('CYS' and 'MET' are used for the SG and SD atoms).
# Values are in the units of the PDB coordinates (presumably angstroms).
mn_distance_thresholds = {
    # Nitrogen and sulfur donors: up to 2.75
    **dict.fromkeys(
        (
            'HIS_ND1', 'HIS_NE2', 'HIS_N',
            'ASP_N', 'GLU_N', 'ALA_N',
            'CYS', 'MET',
            'ARG_NH1', 'ARG_NH2', 'ARG_N', 'ARG_NE',
            'ASN_ND2', 'ASN_N',
            'GLN_NE2', 'GLN_N',
            'GLY_N',
            'LYS_NZ', 'LYS_N',
            'SER_N', 'THR_N', 'TYR_N',
        ),
        (0, 2.75),
    ),
    # Oxygen donors: up to 2.85
    **dict.fromkeys(
        (
            'HIS_O',
            'GLU_OE', 'GLU_O',
            'ASP_OD', 'ASP_O',
            'ALA_O', 'ARG_O',
            'ASN_OD1', 'ASN_O',
            'GLN_OE1', 'GLN_O',
            'GLY_O', 'LYS_O',
            'SER_OG', 'SER_O',
            'THR_OG1', 'THR_O',
            'TYR_OH', 'TYR_O',
        ),
        (0, 2.85),
    ),
}

# Initialize a dictionary to store coordination information for different coordination numbers
coordination_info_dict = {}
distances_info_list = []

# A single parser instance is reused for every file
parser = PDB.PDBParser(QUIET=True)

# Loop through each PDB file in the specified directory. os.listdir returns
# entries in arbitrary order, so the names are sorted to make the row order of
# the output workbooks reproducible between runs.
for filename in sorted(os.listdir(pdb_directory)):
    if not filename.endswith('.pdb'):
        continue

    pdb_file_path = os.path.join(pdb_directory, filename)
    pdb_id = filename.split('.')[0]  # Extract the PDB ID from the filename

    # Parse the PDB file
    structure = parser.get_structure('protein', pdb_file_path)

    # Calculate coordination information for the current PDB file
    coordination_info, distances_info = calculate_coordination_info(structure, mn_distance_thresholds, allowed_residues, pdb_id)

    # Merge this file's rows into the per-coordination-number lists
    for coordination_number, info_list in coordination_info.items():
        coordination_info_dict.setdefault(coordination_number, []).extend(info_list)

    # Add the distances information to the distances_info_list
    distances_info_list.extend(distances_info)

# Write one Excel workbook per coordination number collected above
for coordination_number, info_list in coordination_info_dict.items():
    # The widest row decides how many "Binding atom" columns are required
    max_columns = max((len(row) for row in info_list), default=0)

    # Eight fixed identifier columns, followed by one column per binding atom
    columns = [
        'Entry ID',
        'PDB ID',
        'Metal Chain ID',
        'Metal Residue number',
        'Metal',
        'Chain ID',
        'Residue number',
        'Residue name',
    ] + [f'Binding atom{i}' for i in range(1, max_columns - 7)]

    # Build the table with that fixed column set and fill missing values with NaN
    df_coordination = pd.DataFrame(info_list, columns=columns)
    df_coordination = df_coordination.fillna(np.nan)

    # The coordination number is part of the output file name
    output_excel_path = f'L:/Zn-installer_rawdata/241111_Mn_Final//Final/new_{coordination_number}.xlsx'
    df_coordination.to_excel(output_excel_path, index=False)

print('Data saved to properly formatted Excel files for all coordination numbers.')

Data saved to properly formatted Excel files for all coordination numbers.


In [None]:
#change the format of excel file
import pandas as pd

# Read the per-residue coordination table (one row per coordinating residue)
coordination_df = pd.read_excel('L:/Zn-installer_rawdata/241111_Mn_Final//Final/new_3.xlsx')

# Drop any single quotes that ended up in the column names
coordination_df.columns = coordination_df.columns.str.replace("'", "")

# One group per metal site: entry, metal chain and metal residue number
grouped = coordination_df.groupby(['Entry ID', 'Metal Chain ID', 'Metal Residue number'])

# Each output row starts with the site key, followed by four fields per
# coordinating residue, laid out side by side
horizontal_data = []

for name, group in grouped:
    row = list(name)
    ligand_fields = zip(
        group['Chain ID'],
        group['Residue number'],
        group['Residue name'],
        group['Binding atom1'],
    )
    for ligand in ligand_fields:
        row.extend(ligand)
    horizontal_data.append(row)

# The longest row decides how many numbered column quartets are needed
max_columns = max(len(row) for row in horizontal_data)
ligand_slots = (max_columns - 3) // 4

columns = ['Entry ID', 'Metal Chain ID', 'Metal Residue number']
for slot in range(1, ligand_slots + 1):
    columns += [
        f'Chain ID{slot}',
        f'Residue_number{slot}',
        f'Residue_name{slot}',
        f'Binding atom{slot}',
    ]

# Assemble the wide table
horizontal_df = pd.DataFrame(horizontal_data, columns=columns)

# Write it to a new workbook
horizontal_df.to_excel('L:/Zn-installer_rawdata/241111_Mn_Final//Final/new_3_format.xlsx', index=False)

In [None]:
#calculate the pie angle
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings
from math import acos, degrees

# Load the horizontally formatted coordination table; the loop below reads its
# 'Entry ID', 'Chain ID1'..'Chain ID3' and 'Residue_number1'..'Residue_number3' columns
excel_file = 'L:/Zn-installer_rawdata/241111_Mn_Final//Final/new_3_format.xlsx'
df = pd.read_excel(excel_file)

# Specify the directory containing PDB files (one '<Entry ID>.pdb' per row is opened below)
pdb_directory = 'L:/Zn-installer_rawdata/241111_Mn_Final//Final/'

def parse_pdb_id(pdb_id):
    """Return ``pdb_id`` with leading and trailing whitespace removed."""
    cleaned_id = pdb_id.strip()
    return cleaned_id

def calculate_distance(coord1, coord2):
    """Return the norm of the difference between two coordinate arrays."""
    displacement = coord1 - coord2
    return np.linalg.norm(displacement)

def calculate_angle(vector1, vector2):
    """
    Calculate the angle between two vectors.

    Args:
        vector1: First vector as a NumPy array.
        vector2: Second vector as a NumPy array of the same length.

    Returns:
        The angle in degrees as a float in [0, 180], or NaN when either
        vector has zero length (the direction is undefined).
    """
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0 or norm2 == 0:
        # Dividing by a zero norm would only produce NaN plus a RuntimeWarning.
        return float("nan")

    unit_vector1 = vector1 / norm1
    unit_vector2 = vector2 / norm2
    dot_product = np.dot(unit_vector1, unit_vector2)
    # Rounding can leave the cosine of (anti)parallel vectors marginally
    # outside [-1, 1], which makes acos raise "math domain error".
    dot_product = np.clip(dot_product, -1.0, 1.0)
    angle = degrees(acos(dot_product))
    return angle

# Create the empty per-residue output columns (CA/CB coordinates and an angle
# slot) for residues 1..3, in a fixed order.
# NOTE(review): the processing loop later in this cell only writes the pairwise
# Angle_<i>_<j> columns; the single-index Angle_<i> columns appear to stay empty.
for i in (1, 2, 3):
    per_residue_columns = (
        f'Calpha_X_{i}',
        f'Calpha_Y_{i}',
        f'Calpha_Z_{i}',
        f'Cbeta_X_{i}',
        f'Cbeta_Y_{i}',
        f'Cbeta_Z_{i}',
        f'Angle_{i}',
    )
    for column in per_residue_columns:
        df[column] = None

# Create the empty pairwise columns for the residue pairs (1,2), (1,3), (2,3)
for i, j in ((0, 1), (0, 2), (1, 2)):
    pair_columns = (
        f'Calpha_Distance_{i+1}_{j+1}',
        f'Cbeta_Distance_{i+1}_{j+1}',
        f'Angle_{i+1}_{j+1}',
    )
    for column in pair_columns:
        df[column] = None

# Calculate distances, angles, and save alpha and beta carbon coordinates
def _atom_coord(residue, atom_name):
    """Return the coordinates of ``atom_name`` in ``residue`` as an array, or None if the atom is absent."""
    try:
        return np.array(residue[atom_name].get_coord())
    except KeyError:
        return None


def _ca_cb_coords(model, chain_id, residue_number):
    """Return ``(CA, CB)`` coordinates of one residue; each item is None when unavailable.

    A missing table cell (NaN), a residue number that is not an integer, or an
    unknown chain/residue gives ``(None, None)``. CA and CB are looked up
    independently, so a residue without CB still reports its CA.
    """
    if pd.isna(chain_id) or pd.isna(residue_number):
        # Rows with fewer than three residues carry empty cells in the
        # higher-numbered columns.
        return None, None
    try:
        residue_number = int(residue_number)
    except ValueError:
        return None, None
    try:
        residue = model[chain_id][residue_number]
    except KeyError:
        return None, None
    return _atom_coord(residue, 'CA'), _atom_coord(residue, 'CB')


parser = PDBParser(QUIET=True)
# Only the most recently parsed entry is kept, so consecutive rows for the same
# entry reuse it without holding every structure in memory.
loaded_pdb_id = None
loaded_model = None

for index, row in df.iterrows():
    pdb_id = parse_pdb_id(row['Entry ID'])

    if pdb_id != loaded_pdb_id:
        pdb_filename = f"{pdb_directory}/{pdb_id}.pdb"
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                structure = parser.get_structure(pdb_id, pdb_filename)
        except FileNotFoundError:
            # Report and leave this entry's geometry empty instead of aborting the run.
            print(f"PDB file not found, skipping {pdb_id}: {pdb_filename}")
            loaded_model = None
        else:
            loaded_model = structure[0]
        loaded_pdb_id = pdb_id

    if loaded_model is None:
        continue
    model = loaded_model

    # Collect (CA, CB) for residues 1..3 and store whichever coordinates exist
    residue_coords = []
    for i in range(3):
        ca_coord, cb_coord = _ca_cb_coords(
            model, row[f'Chain ID{i+1}'], row[f'Residue_number{i+1}']
        )

        if ca_coord is not None:
            df.at[index, f'Calpha_X_{i+1}'] = ca_coord[0]
            df.at[index, f'Calpha_Y_{i+1}'] = ca_coord[1]
            df.at[index, f'Calpha_Z_{i+1}'] = ca_coord[2]

        if cb_coord is not None:
            df.at[index, f'Cbeta_X_{i+1}'] = cb_coord[0]
            df.at[index, f'Cbeta_Y_{i+1}'] = cb_coord[1]
            df.at[index, f'Cbeta_Z_{i+1}'] = cb_coord[2]

        residue_coords.append((ca_coord, cb_coord))

    # Pairwise values for the residue pairs (1,2), (1,3), (2,3)
    for i in range(3):
        for j in range(i + 1, 3):
            ca_coord_i, cb_coord_i = residue_coords[i]
            ca_coord_j, cb_coord_j = residue_coords[j]
            have_ca_pair = ca_coord_i is not None and ca_coord_j is not None
            have_cb_pair = cb_coord_i is not None and cb_coord_j is not None

            if have_ca_pair:
                df.at[index, f'Calpha_Distance_{i+1}_{j+1}'] = calculate_distance(ca_coord_i, ca_coord_j)

            if have_cb_pair:
                df.at[index, f'Cbeta_Distance_{i+1}_{j+1}'] = calculate_distance(cb_coord_i, cb_coord_j)

            if have_ca_pair and have_cb_pair:
                # Angle between the CA(i)->CA(j) vector and the CB(i)->CB(j) vector
                vector_i = ca_coord_j - ca_coord_i
                vector_j = cb_coord_j - cb_coord_i
                df.at[index, f'Angle_{i+1}_{j+1}'] = calculate_angle(vector_i, vector_j)

# Save the updated DataFrame to a new Excel file
df.to_excel('L:/Zn-installer_rawdata/241111_Mn_Final//Final/Geometric_parameters.xlsx', index=False)