In [None]:
# @markdown # Step 1: Install all necessary packages
# @markdown Please make sure to select a runtime with **High-RAM** before running this step.
# Install the pip-distributed dependencies (notebook shell commands).
!pip install biopython
!pip install --upgrade tqdm
!pip install -q condacolab
# condacolab is imported immediately after being installed because the next
# line calls its install() entry point.
# NOTE(review): condacolab.install() is expected to restart the Colab kernel —
# confirm; if so, the remainder of this cell may need to be re-run afterwards.
import condacolab
condacolab.install()

# Install PyMOL using Conda (from the conda-forge channel, non-interactive)
!mamba install -c conda-forge pymol-open-source -y

# Import necessary modules
from Bio.PDB import PDBParser, Selection, NeighborSearch
from Bio.PDB.Polypeptide import is_aa
from tqdm import tqdm
import torch

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
#@markdown # Step 2: Mutate the glycine to alanine
#@markdown # **Please remove all water molecules and ligands before running.**

# Import the necessary modules
import pymol2

#@markdown **Note:** Specify the paths to the input and output PDB files below.

#@markdown ### Enter the path to your input PDB file:
pdb_file_path = "/content/3ljm.pdb"  #@param {type:"string"}

#@markdown ### Enter the path to your output PDB file:
output_file_path = "/content/3tis_run_alanine.pdb"  #@param {type:"string"}

# Run the whole mutation inside a private PyMOL session (closed on exit)
with pymol2.PyMOL() as pymol:
    # Initialize PyMOL
    pymol.cmd.reinitialize()

    # Load the structure file
    pymol.cmd.load(pdb_file_path)

    # get_model(...).atom holds one entry per ATOM, so every glycine appears
    # several times. Collapse to unique (chain, resi) pairs, keeping file
    # order, so that each residue is mutated exactly once.
    glycine_residues = pymol.cmd.get_model("resn GLY").atom
    glycine_sites = list(dict.fromkeys((atom.chain, atom.resi) for atom in glycine_residues))

    # Loop through the unique glycine residues
    for chain, residue_num in glycine_sites:
        # Construct the selection string in the format "resi X and chain Y"
        selection_str = f"resi {residue_num} and chain {chain}"
        # Apply the mutation using the mutagenesis wizard
        pymol.cmd.wizard("mutagenesis")
        pymol.cmd.refresh_wizard()
        pymol.cmd.get_wizard().do_select(selection_str)
        pymol.cmd.get_wizard().set_mode("ALA")
        pymol.cmd.get_wizard().apply()
        # NOTE(review): kept from the original ("delete the original residue to
        # avoid clashes"); cmd.delete takes an object/selection *name* — confirm
        # that this call has the intended effect.
        pymol.cmd.delete(selection_str)

    # Save the mutated structure
    pymol.cmd.save(output_file_path)


In [None]:
# Existing imports and setup
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import itertools
import os
from IPython.display import display, Markdown

# Markdown documentation for file pathways

# @markdown # Step 3A: Run the Metal-Installer

# Import the necessary modules
import pymol2
from IPython.display import display, Markdown
import requests  # Required for downloading files from GitHub

# @markdown **Note:** Specify the paths to the input and output files below.

# @markdown ### Enter the path to your input PDB file:
pdb_file = "/content/3tis_run_alanine.pdb"  # @param {type:"string"}

# @markdown ### Enter the path to your output Excel file:
output_excel_file = "/content/3tis_run_alanine_Cu_3His.xlsx"  # @param {type:"string"}


# @markdown ### Enter the path to your PyMOL script file:
pymol_script_file = "/content/3tis_run_alanine_Cu_3His.pml"  # @param {type:"string"}

# @markdown ### Set the metal type to use:
Metal = 'Cu'  # @param ["Zn", "Mn", "Cu", "Fe"]

# @markdown ### Select the combination type:
# @markdown **Note:** 2His/1Cys is only available to Cu
Combinations = '3His'  # @param ["3His", "2His_1Asp", "2His_1Glu", "2His_1Cys"]

# @markdown ### Choose a threshold for analysis:
Range = '4'  # @param ["1", "2", "3", "4","5"]

# @markdown ### Specify the specific residue number (use 0 for no specific residue):
specific_residue_number = 24  # @param {type:"integer"}

# Build the URL of the threshold workbook from the user's choices and download it.

# Base URL for the thresholds files on GitHub
base_url = "https://raw.githubusercontent.com/SNU-Songlab/Metal-Installer-code/main/Threshold"

# Construct the full URL based on user input
thresholds_url = f"{base_url}/{Metal}/{Combinations}/{Range}.xlsx"

# Define the local path to save the downloaded file
thresholds_file = "/content/thresholds.xlsx"  # This path will be used throughout the script

# Download the file from GitHub. The timeout keeps a stalled connection from
# hanging the cell indefinitely.
response = requests.get(thresholds_url, timeout=60)

# Check if the request was successful and save the file
if response.status_code == 200:
    with open(thresholds_file, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully from {thresholds_url} and saved as {thresholds_file}")
else:
    raise ValueError(f"Failed to download file from {thresholds_url}. Status code: {response.status_code}")

# Load thresholds from the downloaded Excel file
thresholds_df = pd.read_excel(thresholds_file, sheet_name='Sheet1')

# Extract threshold values; rows with a missing Min or Max are skipped
thresholds = {}
for _, row in thresholds_df.iterrows():
    parameter = row['Parameter']
    min_value = row['Min']
    max_value = row['Max']

    if pd.notna(min_value) and pd.notna(max_value):
        thresholds[parameter] = (min_value, max_value)

# Fail early with one explicit message if the workbook lacks a parameter this
# step needs (mirrors the required-key check performed in Step 4B).
required_threshold_keys = [
    'alpha_distance_range', 'beta_distance_range',
    'ratio_threshold_range', 'pie_threshold_range',
]
missing_threshold_keys = [key for key in required_threshold_keys if key not in thresholds]
if missing_threshold_keys:
    raise KeyError(f"Missing key(s) {missing_threshold_keys} in thresholds file {thresholds_url}.")

# Assign threshold values as (min, max) tuples
alpha_distance_range = thresholds['alpha_distance_range']
beta_distance_range = thresholds['beta_distance_range']
ratio_threshold_range = thresholds['ratio_threshold_range']
pie_threshold_range = thresholds['pie_threshold_range']

# Parse the input PDB (parser warnings silenced) and work on its first model
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)
model = structure[0]

# Keep only residues whose id starts with a blank field
# (presumably Biopython's hetero flag, i.e. no waters/ligands — confirm).
residues = [candidate for candidate in model.get_residues() if candidate.get_id()[0] == ' ']

# Step 1: enumerate residue triplets; when a specific residue number is
# requested, keep only the triplets that contain a residue with that number.
all_triplets = itertools.combinations(residues, 3)
if specific_residue_number == 0:
    combinations = list(all_triplets)
else:
    combinations = [
        triplet for triplet in all_triplets
        if any(member.get_id()[1] == specific_residue_number for member in triplet)
    ]

# Distance filter
# A triplet is kept only when all three of its residue pairs have a CA-CA
# distance inside alpha_distance_range AND a CB-CB distance inside
# beta_distance_range (i.e. three accepted pairs).
filtered_data_distances = []

for combination in combinations:
    alpha_distances, beta_distances = [], []

    try:
        for res1, res2 in itertools.combinations(combination, 2):
            # A pair can only be measured when both residues have CA and CB
            if not (res1.has_id('CA') and res2.has_id('CA')):
                continue
            if not (res1.has_id('CB') and res2.has_id('CB')):
                continue

            alpha_distance = np.linalg.norm(res1['CA'].coord - res2['CA'].coord)
            beta_distance = np.linalg.norm(res1['CB'].coord - res2['CB'].coord)

            if (alpha_distance_range[0] <= alpha_distance <= alpha_distance_range[1] and
                    beta_distance_range[0] <= beta_distance <= beta_distance_range[1]):
                alpha_distances.append(alpha_distance)
                beta_distances.append(beta_distance)

        if len(alpha_distances) >= 3 and len(beta_distances) >= 3:
            record = {'PDB_ID': pdb_file, 'Combination': combination}
            for position, member in enumerate(combination, start=1):
                full_id = member.get_full_id()
                record[f'Coord_chain_id_number{position}'] = full_id[2]
                record[f'Coord_residue_number{position}'] = full_id[3][1]
                record[f'Coord_residue_name{position}'] = member.get_resname()
                record[f'Coord_atom_name{position}'] = 'CA'
            # Distances are stored in pair order (1-2, 1-3, 2-3)
            for position in (1, 2, 3):
                record[f'Alpha Distance {position}'] = alpha_distances[position - 1]
                record[f'Beta Distance {position}'] = beta_distances[position - 1]
            filtered_data_distances.append(record)

    except KeyError as e:
        print(f"Error processing combination {combination}: {e}")

# Create DataFrame for distances
column_order = [
    'PDB_ID',
    'Combination',
    'Coord_chain_id_number1', 'Coord_residue_number1', 'Coord_residue_name1', 'Coord_atom_name1',
    'Coord_chain_id_number2', 'Coord_residue_number2', 'Coord_residue_name2', 'Coord_atom_name2',
    'Coord_chain_id_number3', 'Coord_residue_number3', 'Coord_residue_name3', 'Coord_atom_name3',
    'Alpha Distance 1', 'Alpha Distance 2', 'Alpha Distance 3',
    'Beta Distance 1', 'Beta Distance 2', 'Beta Distance 3'
]

# Passing `columns=` fixes the column order AND keeps the expected columns when
# no triplet passed the filter (indexing an empty frame by column_order would
# raise a KeyError).
df_distances = pd.DataFrame(filtered_data_distances, columns=column_order)
if df_distances.empty:
    print("No residue combination passed the distance filter.")

# Ratio filter
# Keep a row only when every one of its three alpha/beta distance ratios lies
# inside ratio_threshold_range (inclusive on both ends).
filtered_data_ratio = []

for idx, row in df_distances.iterrows():
    # all() short-circuits, so later ratios are not computed once one fails
    if all(
        ratio_threshold_range[0]
        <= row[f'Alpha Distance {k}'] / row[f'Beta Distance {k}']
        <= ratio_threshold_range[1]
        for k in (1, 2, 3)
    ):
        filtered_data_ratio.append(row)

# `columns=` keeps the expected layout even when no row passed; indexing an
# empty frame with column_order would raise a KeyError.
df_ratio = pd.DataFrame(filtered_data_ratio, columns=column_order)


# Pie filter
def calculate_pie(vector1, vector2):
    """Return the angle, in degrees, between two vectors.

    Returns NaN when the product of the two vector norms is zero, i.e. when
    at least one vector has zero length and the angle is undefined.
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return np.nan
    cosine = np.dot(vector1, vector2) / norm_product
    # Clamp to [-1, 1] so rounding noise cannot push arccos out of its domain
    return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

def process_row(row):
    """Compute the three pairwise "pie" angles for one residue triplet.

    For each residue pair (1-2, 1-3, 2-3) the angle between the CA->CA vector
    and the CB->CB vector is computed with ``calculate_pie``. A residue
    without a CB atom falls back to its CA atom.

    Args:
        row: a DataFrame row whose ``'Combination'`` entry is an indexable
            triplet of residue objects.

    Returns:
        A list of three angles in degrees. An entry is ``None`` when a needed
        atom is missing; the whole list is ``[None, None, None]`` when the
        input PDB file no longer exists or the row has no ``'Combination'``.
    """
    # The residues stored in the row already carry their atoms, so the PDB
    # file does not need to be parsed again for every row (the previous
    # per-row parse result was never used). The missing-file guard is kept.
    if not os.path.isfile(pdb_file):
        print(f"PDB file not found: {pdb_file}")
        return [None, None, None]

    try:
        residues = row['Combination']
        print(f"Residues loaded: {residues}")

        pies = []
        for i, j in [(0, 1), (0, 2), (1, 2)]:
            try:
                CA1 = residues[i]['CA']
                CA2 = residues[j]['CA']
                CB1 = residues[i]['CB'] if 'CB' in residues[i] else CA1
                CB2 = residues[j]['CB'] if 'CB' in residues[j] else CA2

                vector_CA = CA2.coord - CA1.coord
                vector_CB = CB2.coord - CB1.coord

                pies.append(calculate_pie(vector_CA, vector_CB))
            except KeyError as e:
                print(f"KeyError for residues {residues[i]} and {residues[j]}: {e}")
                pies.append(None)

        return pies
    except KeyError as e:
        print(f"KeyError: {e}")
        return [None, None, None]

# Work on an explicit copy so that adding columns never writes into a slice
df_ratio = df_ratio.copy()
pie_columns = ['Pie_1_2', 'Pie_1_3', 'Pie_2_3']

if df_ratio.empty:
    # apply() on an empty frame does not yield three result columns; build an
    # empty placeholder so the assignment below still works.
    pie_results = pd.DataFrame(index=df_ratio.index, columns=range(3), dtype=float)
else:
    pie_results = df_ratio.apply(process_row, axis=1, result_type='expand')
df_ratio[pie_columns] = pie_results

# Create filter columns based on pie thresholds (strict bounds; missing -> False)
for col in pie_columns:
    df_ratio[f'{col}_Filter'] = df_ratio[col].map(
        lambda value: pie_threshold_range[0] < value < pie_threshold_range[1] if pd.notnull(value) else False
    ).astype(bool)

# A triplet passes only if all three of its angles pass
df_ratio['Pie_Filter'] = df_ratio[[f'{col}_Filter' for col in pie_columns]].all(axis=1)

# .copy() makes the filtered frame independent, so the column added below does
# not trigger SettingWithCopyWarning.
df_final_filter = df_ratio[df_ratio['Pie_Filter']].copy()

# Save all DataFrames into a single Excel file with different tabs
with pd.ExcelWriter(output_excel_file) as writer:
    df_distances.to_excel(writer, sheet_name='Distances', index=False)
    df_ratio.to_excel(writer, sheet_name='Ratio', index=False)
    df_final_filter.to_excel(writer, sheet_name='Pie', index=False)

# Generate PyMOL script file
pymol_script_commands = []
df_final_filter['Combination_Number'] = range(1, len(df_final_filter) + 1)

# The selection macro "/object//chain/resi" needs the PyMOL object name, not
# the file path. The file's base name without extension is used
# (presumably the name PyMOL assigns when the file is loaded — confirm).
pymol_object_name = os.path.splitext(os.path.basename(pdb_file))[0]

for index, row in df_final_filter.iterrows():
    combination = row['Combination']
    chain1, res1 = combination[0].get_full_id()[2], combination[0].get_full_id()[3][1]
    chain2, res2 = combination[1].get_full_id()[2], combination[1].get_full_id()[3][1]
    chain3, res3 = combination[2].get_full_id()[2], combination[2].get_full_id()[3][1]

    selection_name = f"obj{row['Combination_Number']:02d}"
    pymol_script_commands.append(f"select {selection_name}, (chain {chain1} and resi {res1}) or (chain {chain2} and resi {res2}) or (chain {chain3} and resi {res3})")
    pymol_script_commands.append(f"create {selection_name}_residue1, /{pymol_object_name}//{chain1}/{res1}")
    pymol_script_commands.append(f"create {selection_name}_residue2, /{pymol_object_name}//{chain2}/{res2}")
    pymol_script_commands.append(f"create {selection_name}_residue3, /{pymol_object_name}//{chain3}/{res3}")

with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing filtered residue combinations\n\n")
    for command in pymol_script_commands:
        f.write(command + '\n')

print(f"\nResults saved to {output_excel_file}")
print(f"PyMOL script saved to {pymol_script_file}")


In [None]:
# Existing imports and setup
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import itertools
import os
from IPython.display import display, Markdown

# Markdown documentation for file pathways

# @markdown # Step 3B: Run the Metal-Installer

# Import the necessary modules
import pymol2
from IPython.display import display, Markdown
import requests  # Required for downloading files from GitHub

# @markdown **Note:** Specify the paths to the input and output files below.

# @markdown ### Enter the path to your input PDB file:
pdb_file = "/content/1EP0_alanine_dimer.pdb"  # @param {type:"string"}

# @markdown ### Enter the path to your output Excel file:
output_excel_file = "/content/1EP0_alanine_dimer_FE_2His_1asp_130.xlsx"  # @param {type:"string"}

# @markdown ### Enter the path to your PyMOL script file:
pymol_script_file = "/content/OmpF_dimer_alanine_3His_Full.pml"  # @param {type:"string"}

# @markdown ### Specify the specific residue number (use 0 for no specific residue):
specific_residue_number = 0  # @param {type:"integer"}

# @markdown ### Set the thresholds for analysis
# @markdown **Alpha Distance**: Enter the minimum and maximum values.
alpha_distance_min = 5.5  # @param {type:"number"}
alpha_distance_max = 10  # @param {type:"number"}

# @markdown **Beta Distance**: Enter the minimum and maximum values.
beta_distance_min = 6.0  # @param {type:"number"}
beta_distance_max = 9  # @param {type:"number"}

# @markdown **Ratio Threshold**: Enter the minimum and maximum values.
ratio_threshold_min = 0.7  # @param {type:"number"}
ratio_threshold_max = 1.4  # @param {type:"number"}

# @markdown **Pie Threshold**: Enter the minimum and maximum values.
pie_threshold_min = 0  # @param {type:"number"}
pie_threshold_max = 15  # @param {type:"number"}

# Bundle the user inputs into (min, max) tuples used by the filters below
alpha_distance_range = (alpha_distance_min, alpha_distance_max)
beta_distance_range = (beta_distance_min, beta_distance_max)
ratio_threshold_range = (ratio_threshold_min, ratio_threshold_max)
pie_threshold_range = (pie_threshold_min, pie_threshold_max)

# An inverted range would silently reject every candidate, so refuse it up front
for range_label, (range_lower, range_upper) in {
    'Alpha Distance': alpha_distance_range,
    'Beta Distance': beta_distance_range,
    'Ratio Threshold': ratio_threshold_range,
    'Pie Threshold': pie_threshold_range,
}.items():
    if range_lower > range_upper:
        raise ValueError(f"{range_label} minimum ({range_lower}) is greater than its maximum ({range_upper}).")

# Output the ranges once to confirm the values in use
print(f"Alpha Distance Range: {alpha_distance_range}")
print(f"Beta Distance Range: {beta_distance_range}")
print(f"Ratio Threshold Range: {ratio_threshold_range}")
print(f"Pie Threshold Range: {pie_threshold_range}")

# Parse the input PDB (parser warnings silenced) and work on its first model
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)
model = structure[0]

# Keep only residues whose id starts with a blank field
# (presumably Biopython's hetero flag, i.e. no waters/ligands — confirm).
residues = [candidate for candidate in model.get_residues() if candidate.get_id()[0] == ' ']

# Step 1: enumerate residue triplets; when a specific residue number is
# requested, keep only the triplets that contain a residue with that number.
all_triplets = itertools.combinations(residues, 3)
if specific_residue_number == 0:
    combinations = list(all_triplets)
else:
    combinations = [
        triplet for triplet in all_triplets
        if any(member.get_id()[1] == specific_residue_number for member in triplet)
    ]


# Distance filter
# A triplet is kept only when all three of its residue pairs have a CA-CA
# distance inside alpha_distance_range AND a CB-CB distance inside
# beta_distance_range (i.e. three accepted pairs).
filtered_data_distances = []

for combination in combinations:
    alpha_distances, beta_distances = [], []

    try:
        for res1, res2 in itertools.combinations(combination, 2):
            # A pair can only be measured when both residues have CA and CB
            if not (res1.has_id('CA') and res2.has_id('CA')):
                continue
            if not (res1.has_id('CB') and res2.has_id('CB')):
                continue

            alpha_distance = np.linalg.norm(res1['CA'].coord - res2['CA'].coord)
            beta_distance = np.linalg.norm(res1['CB'].coord - res2['CB'].coord)

            if (alpha_distance_range[0] <= alpha_distance <= alpha_distance_range[1] and
                    beta_distance_range[0] <= beta_distance <= beta_distance_range[1]):
                alpha_distances.append(alpha_distance)
                beta_distances.append(beta_distance)

        if len(alpha_distances) >= 3 and len(beta_distances) >= 3:
            record = {'PDB_ID': pdb_file, 'Combination': combination}
            for position, member in enumerate(combination, start=1):
                full_id = member.get_full_id()
                record[f'Coord_chain_id_number{position}'] = full_id[2]
                record[f'Coord_residue_number{position}'] = full_id[3][1]
                record[f'Coord_residue_name{position}'] = member.get_resname()
                record[f'Coord_atom_name{position}'] = 'CA'
            # Distances are stored in pair order (1-2, 1-3, 2-3)
            for position in (1, 2, 3):
                record[f'Alpha Distance {position}'] = alpha_distances[position - 1]
                record[f'Beta Distance {position}'] = beta_distances[position - 1]
            filtered_data_distances.append(record)

    except KeyError as e:
        print(f"Error processing combination {combination}: {e}")

# Create DataFrame for distances
column_order = [
    'PDB_ID',
    'Combination',
    'Coord_chain_id_number1', 'Coord_residue_number1', 'Coord_residue_name1', 'Coord_atom_name1',
    'Coord_chain_id_number2', 'Coord_residue_number2', 'Coord_residue_name2', 'Coord_atom_name2',
    'Coord_chain_id_number3', 'Coord_residue_number3', 'Coord_residue_name3', 'Coord_atom_name3',
    'Alpha Distance 1', 'Alpha Distance 2', 'Alpha Distance 3',
    'Beta Distance 1', 'Beta Distance 2', 'Beta Distance 3'
]

# Passing `columns=` fixes the column order AND keeps the expected columns when
# no triplet passed the filter (indexing an empty frame by column_order would
# raise a KeyError).
df_distances = pd.DataFrame(filtered_data_distances, columns=column_order)
if df_distances.empty:
    print("No residue combination passed the distance filter.")

# Ratio filter
# Keep a row only when every one of its three alpha/beta distance ratios lies
# inside ratio_threshold_range (inclusive on both ends).
filtered_data_ratio = []

for idx, row in df_distances.iterrows():
    # all() short-circuits, so later ratios are not computed once one fails
    if all(
        ratio_threshold_range[0]
        <= row[f'Alpha Distance {k}'] / row[f'Beta Distance {k}']
        <= ratio_threshold_range[1]
        for k in (1, 2, 3)
    ):
        filtered_data_ratio.append(row)

# `columns=` keeps the expected layout even when no row passed; indexing an
# empty frame with column_order would raise a KeyError.
df_ratio = pd.DataFrame(filtered_data_ratio, columns=column_order)

# Pie filter
def calculate_pie(vector1, vector2):
    """Return the angle, in degrees, between two vectors.

    Returns NaN when the product of the two vector norms is zero, i.e. when
    at least one vector has zero length and the angle is undefined.
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return np.nan
    cosine = np.dot(vector1, vector2) / norm_product
    # Clamp to [-1, 1] so rounding noise cannot push arccos out of its domain
    return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

def process_row(row):
    """Compute the three pairwise "pie" angles for one residue triplet.

    For each residue pair (1-2, 1-3, 2-3) the angle between the CA->CA vector
    and the CB->CB vector is computed with ``calculate_pie``. A residue
    without a CB atom falls back to its CA atom.

    Args:
        row: a DataFrame row whose ``'Combination'`` entry is an indexable
            triplet of residue objects.

    Returns:
        A list of three angles in degrees. An entry is ``None`` when a needed
        atom is missing; the whole list is ``[None, None, None]`` when the
        input PDB file no longer exists or the row has no ``'Combination'``.
    """
    # The residues stored in the row already carry their atoms, so the PDB
    # file does not need to be parsed again for every row (the previous
    # per-row parse result was never used). The missing-file guard is kept.
    if not os.path.isfile(pdb_file):
        print(f"PDB file not found: {pdb_file}")
        return [None, None, None]

    try:
        residues = row['Combination']
        print(f"Residues loaded: {residues}")

        pies = []
        for i, j in [(0, 1), (0, 2), (1, 2)]:
            try:
                CA1 = residues[i]['CA']
                CA2 = residues[j]['CA']
                CB1 = residues[i]['CB'] if 'CB' in residues[i] else CA1
                CB2 = residues[j]['CB'] if 'CB' in residues[j] else CA2

                vector_CA = CA2.coord - CA1.coord
                vector_CB = CB2.coord - CB1.coord

                pies.append(calculate_pie(vector_CA, vector_CB))
            except KeyError as e:
                print(f"KeyError for residues {residues[i]} and {residues[j]}: {e}")
                pies.append(None)

        return pies
    except KeyError as e:
        print(f"KeyError: {e}")
        return [None, None, None]

# Work on an explicit copy so that adding columns never writes into a slice
df_ratio = df_ratio.copy()
pie_columns = ['Pie_1_2', 'Pie_1_3', 'Pie_2_3']

if df_ratio.empty:
    # apply() on an empty frame does not yield three result columns; build an
    # empty placeholder so the assignment below still works.
    pie_results = pd.DataFrame(index=df_ratio.index, columns=range(3), dtype=float)
else:
    pie_results = df_ratio.apply(process_row, axis=1, result_type='expand')
df_ratio[pie_columns] = pie_results

# Create filter columns based on pie thresholds (strict bounds; missing -> False)
for col in pie_columns:
    df_ratio[f'{col}_Filter'] = df_ratio[col].map(
        lambda value: pie_threshold_range[0] < value < pie_threshold_range[1] if pd.notnull(value) else False
    ).astype(bool)

# A triplet passes only if all three of its angles pass
df_ratio['Pie_Filter'] = df_ratio[[f'{col}_Filter' for col in pie_columns]].all(axis=1)

# .copy() makes the filtered frame independent, so the column edits below do
# not trigger SettingWithCopyWarning.
df_final_filter = df_ratio[df_ratio['Pie_Filter']].copy()
# Remove '.pdb' from the PDB_ID in the final filtered DataFrame
df_final_filter['PDB_ID'] = df_final_filter['PDB_ID'].str.replace('.pdb', '', regex=False)
# Save all DataFrames into a single Excel file with different tabs
with pd.ExcelWriter(output_excel_file) as writer:
    df_distances.to_excel(writer, sheet_name='Distances', index=False)
    df_ratio.to_excel(writer, sheet_name='Ratio', index=False)
    df_final_filter.to_excel(writer, sheet_name='Pie', index=False)

# Generate PyMOL script file
pymol_script_commands = []
df_final_filter['Combination_Number'] = range(1, len(df_final_filter) + 1)

# The selection macro "/object//chain/resi" needs the PyMOL object name, not
# the file path. The file's base name without extension is used
# (presumably the name PyMOL assigns when the file is loaded — confirm).
pymol_object_name = os.path.splitext(os.path.basename(pdb_file))[0]

for index, row in df_final_filter.iterrows():
    combination = row['Combination']
    chain1, res1 = combination[0].get_full_id()[2], combination[0].get_full_id()[3][1]
    chain2, res2 = combination[1].get_full_id()[2], combination[1].get_full_id()[3][1]
    chain3, res3 = combination[2].get_full_id()[2], combination[2].get_full_id()[3][1]

    selection_name = f"obj{row['Combination_Number']:02d}"
    pymol_script_commands.append(f"select {selection_name}, (chain {chain1} and resi {res1}) or (chain {chain2} and resi {res2}) or (chain {chain3} and resi {res3})")
    pymol_script_commands.append(f"create {selection_name}_residue1, /{pymol_object_name}//{chain1}/{res1}")
    pymol_script_commands.append(f"create {selection_name}_residue2, /{pymol_object_name}//{chain2}/{res2}")
    pymol_script_commands.append(f"create {selection_name}_residue3, /{pymol_object_name}//{chain3}/{res3}")

with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing filtered residue combinations\n\n")
    for command in pymol_script_commands:
        f.write(command + '\n')

print(f"\nResults saved to {output_excel_file}")
print(f"PyMOL script saved to {pymol_script_file}")





In [None]:
from Bio.PDB import PDBParser
import pandas as pd
import os
from IPython.display import display, Markdown

# Markdown documentation for file pathways

# @markdown # Step 4A: Preparation steps for metal-sites expectation (Coordinates extraction)

# @markdown **Note:** Specify the paths to the input and output files below.
# @markdown ### Enter the path to your input excel file (The result of the step 3)
input_file = '/content/3tis_run_alanine_Cu_3His.xlsx' # @param {type:"string"}
# Only the 'Pie' sheet of the workbook is read; the other sheets are ignored here.
df_pie = pd.read_excel(input_file, sheet_name='Pie')

# @markdown ### Enter the path to your input PDB file:
pdb_file = "/content/3tis_run_alanine.pdb"  # @param {type:"string"}
# Parse the structure with parser warnings silenced (QUIET=True).
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)

# Extract the PDB ID by removing the directory and `.pdb` extension.
# Note: str.replace removes every '.pdb' occurrence in the base name, not only
# a trailing extension.
pdb_id = os.path.basename(pdb_file).replace('.pdb', '')

# Function to extract Cα and Cβ coordinates for a given residue
def extract_coordinates(chain, res_id, atom_name):
    """Return the coordinates of one atom, or three ``None`` placeholders.

    Args:
        chain: mapping-like chain indexed by residue id, or ``None`` when the
            chain id looked up by the caller does not exist in the structure.
        res_id: key of the residue inside ``chain``.
        atom_name: key of the atom inside the residue (e.g. ``'CA'``).

    Returns:
        The atom's ``coord`` attribute, or ``[None, None, None]`` when the
        chain, the residue or the atom is missing.
    """
    # A failed chain lookup yields None; indexing it would raise TypeError,
    # which the KeyError handler below does not cover.
    if chain is None:
        return [None, None, None]
    try:
        return chain[res_id][atom_name].coord
    except KeyError:
        return [None, None, None]

# Index the chains of the first model by chain id so each lookup is direct
chains = dict((chain_obj.id, chain_obj) for chain_obj in structure[0])

# One flat list per Excel row: x, y, z of residue 1, then residue 2, then residue 3
ca_coords = []
cb_coords = []

for _, pie_row in df_pie.iterrows():
    # (chain object, residue number) for the three residues of this row
    members = [
        (chains.get(pie_row[f'Coord_chain_id_number{n}']), pie_row[f'Coord_residue_number{n}'])
        for n in (1, 2, 3)
    ]

    # C-alpha coordinates of the three residues, flattened
    ca_coords.append([
        value
        for member_chain, member_number in members
        for value in extract_coordinates(member_chain, member_number, 'CA')
    ])

    # C-beta coordinates of the three residues, flattened
    cb_coords.append([
        value
        for member_chain, member_number in members
        for value in extract_coordinates(member_chain, member_number, 'CB')
    ])

# Column labels follow the same residue-major, axis-minor layout as the rows
ca_columns = [f'CA{n}_{axis}' for n in (1, 2, 3) for axis in 'XYZ']
cb_columns = [f'CB{n}_{axis}' for n in (1, 2, 3) for axis in 'XYZ']

df_ca = pd.DataFrame(ca_coords, columns=ca_columns)
df_cb = pd.DataFrame(cb_coords, columns=cb_columns)

# Append the coordinate columns to the input table (index reset so rows line up)
df_pie = pd.concat([df_pie.reset_index(drop=True), df_ca, df_cb], axis=1)

# Strip `.pdb` from the `PDB_ID` column when that column is present
if 'PDB_ID' in df_pie.columns:
    df_pie['PDB_ID'] = df_pie['PDB_ID'].str.replace('.pdb', '', regex=False)
# @markdown ### Enter the path to your output Excel file:
output_file = '/content/3ttis_coordinates.xlsx'   # @param {type:"string"}
df_pie.to_excel(output_file, index=False)

print(f"Coordinates extracted and saved to {output_file}")




Coordinates extracted and saved to /content/3ttis_coordinates.xlsx


In [None]:
# Final version (dynamic: last one + edge): this is the one that really works all the way through the ratio step (final, for real)
import numpy as np
import pandas as pd
import os
from Bio.PDB import PDBParser
import requests

# Markdown documentation for file pathways

# @markdown # Step 4B: Run the metal-sites expectation

# Define input and output file paths
# @markdown ### Enter the path to your input excel file (The result of the step 4A)
input_coords_file = '/content/3ttis_coordinates_1.xlsx' # @param {type:"string"}

# Load input file
df_alanine = pd.read_excel(input_coords_file)

# Local paths where the downloaded workbooks are stored
prob_map_file = '/content/map.xlsx'
thresholds_file = '/content/threshold.xlsx'

# Download files from GitHub
base_url = "https://raw.githubusercontent.com/SNU-Songlab/Metal-Installer-code/main/probability/"
# @markdown ### Set the metal type to use:
Metal = 'Cu'  # @param ["Zn", "Mn", "Cu", "Fe"]
# @markdown ### Select the combination type:
# @markdown **Note:** 3His:Zn/Cu/Fe & 2His/1Asp:Zn/Fe/Mn & 2His/1Glu: Zn/Fe/Mn & 2His/1Cys: Cu
Combinations = '3His'  # @param ["3His", "2His_1Asp", "2His_1Glu", "2His_1Cys"]

# base_url already ends with '/', so strip it before joining; otherwise the
# URL path contains '//', which is not guaranteed to resolve.
remote_dir = f"{base_url.rstrip('/')}/{Metal}/{Combinations}"
map_url = f"{remote_dir}/map.xlsx"
thresholds_url = f"{remote_dir}/threshold.xlsx"

# Download the probability map and the thresholds workbook. The timeout keeps
# a stalled connection from hanging the cell indefinitely.
for remote_url, local_path in ((map_url, prob_map_file), (thresholds_url, thresholds_file)):
    response = requests.get(remote_url, timeout=60)
    if response.status_code != 200:
        raise ValueError(f"Failed to download file from {remote_url}. Status code: {response.status_code}")
    with open(local_path, 'wb') as file:
        file.write(response.content)

# Load downloaded Excel files
thresholds_df = pd.read_excel(thresholds_file, sheet_name='Sheet1')
df_precomputed_prob_map = pd.read_excel(prob_map_file)

def calculate_ratio(current_point, ca_xyz, cb_xyz):
    """Return, row by row, distance(point, CA) divided by distance(point, CB).

    ``ca_xyz`` and ``cb_xyz`` hold one coordinate row per residue (norms are
    taken along axis 1); the result has one ratio per row.
    """
    offsets_to_alpha = ca_xyz - current_point
    offsets_to_beta = cb_xyz - current_point
    return np.linalg.norm(offsets_to_alpha, axis=1) / np.linalg.norm(offsets_to_beta, axis=1)

# Map each parameter name to its (Min, Max) pair; rows with a missing Min or
# Max are left out.
thresholds = {
    threshold_row['Parameter']: (threshold_row['Min'], threshold_row['Max'])
    for _, threshold_row in thresholds_df.iterrows()
    if pd.notna(threshold_row['Min']) and pd.notna(threshold_row['Max'])
}

# Parameters that must be present in the thresholds file
required_keys = ['ca_distances_calc', 'cb_distances_calc', 'ratio', 'angle']

for key in required_keys:
    if key in thresholds:
        continue
    raise KeyError(f"Missing key '{key}' in thresholds file.")


# Define bin edges for CA-Zn distances, CB-Zn distances, and angles.
# df_precomputed_prob_map was already loaded from prob_map_file earlier in this
# cell, so the workbook is not read a second time here.
ca_bins = np.sort(df_precomputed_prob_map['Calpha_Zn_Dist'].unique())
cb_bins = np.sort(df_precomputed_prob_map['Cbeta_Zn_Dist'].unique())
angle_bins = np.sort(df_precomputed_prob_map['CA-Zn-CB_Angle'].unique())

# Pivot the probability map: rows = CA distance, columns = (CB distance, angle)
pivoted_prob_map = df_precomputed_prob_map.pivot_table(
    index='Calpha_Zn_Dist', columns=['Cbeta_Zn_Dist', 'CA-Zn-CB_Angle'], values='Probability', fill_value=0
)

# pivot_table only creates columns for (CB, angle) pairs that occur in the
# data. Reindex onto the full grid (missing cells -> probability 0) so that the
# reshape below is always valid and axis k lines up with the k-th bin array.
full_column_grid = pd.MultiIndex.from_product(
    [cb_bins, angle_bins], names=['Cbeta_Zn_Dist', 'CA-Zn-CB_Angle']
)
pivoted_prob_map = pivoted_prob_map.reindex(index=ca_bins, columns=full_column_grid, fill_value=0)

# 3D array indexed as [ca_bin, cb_bin, angle_bin]
prob_map_3d = pivoted_prob_map.values.reshape((len(ca_bins), len(cb_bins), len(angle_bins)))

# Function to load a PDB file based on entry ID
def load_pdb_structure(entry_id, pdb_directory):
    """Parse ``<pdb_directory>/<entry_id>.pdb`` and return the resulting structure.

    ``entry_id`` is also used as the identifier passed to the parser.
    """
    return PDBParser().get_structure(
        entry_id,
        os.path.join(pdb_directory, f"{entry_id}.pdb"),
    )

# Function to score Zn positions
def score_zn_predictions(ca_distances, cb_distances, angles, prob_map_3d, ca_bins, cb_bins, angle_bins):
    """Score a candidate as the product of its per-residue map probabilities.

    Each (CA distance, CB distance, angle) triple is binned with
    ``np.digitize(...) - 1`` and looked up in ``prob_map_3d``.

    Returns:
        The product of the looked-up probabilities, or ``None`` as soon as one
        triple falls outside the map or hits a cell whose probability is 0.
    """
    ca_idx = np.digitize(ca_distances, ca_bins) - 1
    cb_idx = np.digitize(cb_distances, cb_bins) - 1
    angle_idx = np.digitize(angles, angle_bins) - 1

    collected = []
    for i_ca, i_cb, i_angle in zip(ca_idx, cb_idx, angle_idx):
        inside_map = (
            0 <= i_ca < prob_map_3d.shape[0]
            and 0 <= i_cb < prob_map_3d.shape[1]
            and 0 <= i_angle < prob_map_3d.shape[2]
        )
        if not inside_map:
            return None
        cell_probability = prob_map_3d[i_ca, i_cb, i_angle]
        if cell_probability == 0:
            return None
        collected.append(cell_probability)

    return np.prod(collected)

# Function to calculate angles between Zn-Cα and Zn-Cβ vectors for each triplet
def calculate_angles(zn_coords, ca_coords_triplet, cb_coords_triplet):
    """
    Return the three Cα–Zn–Cβ angles, in degrees, for a residue triplet.

    For each of the first three rows, the angle is measured at zn_coords
    between the vector to that row's Cα and the vector to that row's Cβ.
    The cosine is clipped to [-1, 1] before arccos to absorb rounding error.

    Returns:
        A list of three angles in degrees.
    """
    def _angle_at_zn(to_ca, to_cb):
        cosine = np.dot(to_ca, to_cb) / (np.linalg.norm(to_ca) * np.linalg.norm(to_cb))
        return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

    return [
        _angle_at_zn(ca_coords_triplet[k] - zn_coords, cb_coords_triplet[k] - zn_coords)
        for k in range(3)
    ]

# Function to filter Zn candidates by distance thresholds
def filter_by_distance_threshold(ca_coords, cb_coords, zn_candidates, ca_xyz, cb_xyz):
    """
    Keep the Zn candidates whose Zn–Cα distances, Zn–Cβ distances and
    Zn–Cα / Zn–Cβ ratios all lie inside the configured ranges.

    The ranges are read from the module-level ``thresholds`` dict (keys
    'ca_distances_calc', 'cb_distances_calc' and 'ratio', each a (min, max) pair).

    Parameters:
        ca_coords: array of Cα coordinates, one row per residue.
        cb_coords: array of Cβ coordinates, one row per residue.
        zn_candidates: iterable of candidate Zn positions.
        ca_xyz, cb_xyz: unused; kept so existing callers keep working. They
            previously fed an angle calculation whose result was never read.

    Returns:
        np.ndarray containing the candidates that pass (empty if none do).
    """
    # Look the bounds up once instead of on every candidate.
    ca_min, ca_max = thresholds['ca_distances_calc']
    cb_min, cb_max = thresholds['cb_distances_calc']
    ratio_min, ratio_max = thresholds['ratio']

    filtered_candidates = []
    for zn_candidate in zn_candidates:
        # Distances from the candidate to every Cα and Cβ
        ca_distances_calc = np.linalg.norm(zn_candidate - ca_coords, axis=1)
        cb_distances_calc = np.linalg.norm(zn_candidate - cb_coords, axis=1)

        # Zn-Cα / Zn-Cβ ratio per residue
        ratio = ca_distances_calc / cb_distances_calc

        # Every residue must satisfy every range for the candidate to survive
        if (np.all((ca_min <= ca_distances_calc) & (ca_distances_calc <= ca_max)) and
            np.all((cb_min <= cb_distances_calc) & (cb_distances_calc <= cb_max)) and
            np.all((ratio_min <= ratio) & (ratio <= ratio_max))):
            filtered_candidates.append(zn_candidate)

    return np.array(filtered_candidates)



def define_excluded_triads(triad_residues, structure):
    """
    Collect (chain id, residue number) pairs for every residue in the structure
    whose sequence number matches one of the triad residue numbers.

    Parameters:
        triad_residues: iterable of 2-tuples; only the second element (the
            residue number) is used, the first is ignored.
        structure: iterable of models -> chains -> residues, where chains have
            an ``id`` attribute and residues provide ``get_id()``.

    Returns:
        A set of (chain.id, residue_number) tuples. Matching is by residue
        number only, so the same number is excluded in every chain and model.
    """
    wanted_numbers = [residue_number for _, residue_number in triad_residues]
    return {
        (chain.id, residue.get_id()[1])
        for model in structure
        for chain in model
        for residue in chain
        if any(residue.get_id()[1] == wanted for wanted in wanted_numbers)
    }

# Function to perform proximity filtering and record nearby amino acids
def find_proximity_amino_acids(structure, zn_candidate, excluded_residues, exclusion_radius=2.5):
    """
    List the residues that have at least one atom closer than
    ``exclusion_radius`` to a Zn candidate.

    Parameters:
        structure: iterable of models -> chains -> residues -> atoms. Chains
            expose ``id``; residues expose ``get_id()`` and ``get_resname()``;
            atoms expose ``coord`` and ``get_name()``.
        zn_candidate: candidate Zn position.
        excluded_residues: container of (chain id, residue number) pairs that
            are skipped entirely.
        exclusion_radius: distance cut-off; the comparison is strict (<).

    Returns:
        A list of dicts with keys 'Chain_ID', 'Residue_Number', 'Residue_Name',
        'Atom_Name' and 'Distance_to_Zn'. Each residue contributes at most one
        entry: the first of its atoms (in iteration order) inside the radius.
    """
    nearby_amino_acids = []

    for model in structure:
        for chain in model:
            for residue in chain:
                residue_number = residue.get_id()[1]

                # Skip if the residue is in the excluded set
                if (chain.id, residue_number) in excluded_residues:
                    continue

                for atom in residue:
                    distance_to_atom = np.linalg.norm(zn_candidate - atom.coord)
                    if distance_to_atom < exclusion_radius:
                        nearby_amino_acids.append({
                            'Chain_ID': chain.id,
                            'Residue_Number': residue_number,
                            'Residue_Name': residue.get_resname(),
                            'Atom_Name': atom.get_name(),
                            'Distance_to_Zn': distance_to_atom
                        })
                        break  # Log only once per residue within radius
    return nearby_amino_acids
# Function for proximity filtering with explicit exclusion of defined triads
def proximity_filter(structure, zn_candidate, excluded_residues, exclusion_radius=2.5):
    """
    Check that no atom of a non-excluded residue lies within
    ``exclusion_radius`` of the Zn candidate.

    Residues whose (chain id, residue number) pair is in ``excluded_residues``
    are ignored. Every other residue is scanned atom by atom.

    Returns:
        False as soon as one atom of a non-excluded residue is strictly closer
        than ``exclusion_radius``; True if no such atom exists.
    """
    for model in structure:
        for chain in model:
            for residue in chain:
                # Excluded residues are allowed to sit next to the Zn.
                if (chain.id, residue.get_id()[1]) in excluded_residues:
                    continue

                # Any other residue this close invalidates the candidate.
                if any(
                    np.linalg.norm(zn_candidate - atom.coord) < exclusion_radius
                    for atom in residue
                ):
                    return False

    return True

# Main function to estimate Zn candidates with precise boundary handling from Excel
def estimate_zn_iterative(ca_coords, cb_coords, prob_map_3d, ca_bins, cb_bins, angle_bins, pdb_directory, grid_resolution=0.2):
    """
    Grid-search a Zn position for every residue triad.

    For each row, a box is built around the three Cα/Cβ pairs, scanned on a
    regular grid (cell corner and cell centre), and every grid point is checked
    against the distance, angle, ratio and probability-map criteria. The first
    surviving point (in scan order) that also passes ``proximity_filter`` is
    reported for that row.

    Module-level names read by this function: ``thresholds_df`` (columns
    'Parameter', 'Min', 'Max'), ``df_alanine`` (columns 'PDB_ID' and
    'Coord_residue_number1..3', read by position so they stay aligned with
    ``ca_coords.iloc[i]``), and the helpers ``load_pdb_structure``,
    ``define_excluded_triads``, ``calculate_angles``, ``calculate_ratio``,
    ``score_zn_predictions``, ``filter_by_distance_threshold`` and
    ``proximity_filter``.
    NOTE(review): ``calculate_ratio`` is not defined in this part of the
    notebook; it is presumably the Zn-Cα / Zn-Cβ distance ratio — confirm.

    Parameters:
        ca_coords, cb_coords: DataFrames with nine columns per row, reshaped
            to a 3x3 array (three residues x XYZ).
        prob_map_3d, ca_bins, cb_bins, angle_bins: probability map and its bin
            edges, forwarded to ``score_zn_predictions``.
        pdb_directory: forwarded to ``load_pdb_structure``.
        grid_resolution: grid step; also used as padding of the search box.

    Returns:
        (zn_coords_list, best_scores, angle_list, proximity_data). The first
        three have one entry per input row; rows without a valid position get
        "no metal", 0 and [None, None, None].

    Raises:
        KeyError: if a required threshold is missing from ``thresholds_df``.
        ValueError: if ``grid_resolution`` is not positive.
    """
    if grid_resolution <= 0:
        raise ValueError("grid_resolution must be positive")

    # Extract thresholds from the DataFrame, skipping rows with a missing bound
    thresholds = {}
    for _, row in thresholds_df.iterrows():
        min_value = row['Min']
        max_value = row['Max']
        if pd.notna(min_value) and pd.notna(max_value):
            thresholds[row['Parameter']] = (min_value, max_value)

    required_keys = ['ca_distances_calc', 'cb_distances_calc', 'ratio', 'angle']

    for key in required_keys:
        if key not in thresholds:
            raise KeyError(f"Missing key '{key}' in thresholds file.")

    ca_min, ca_max = thresholds['ca_distances_calc']
    cb_min, cb_max = thresholds['cb_distances_calc']
    ratio_min, ratio_max = thresholds['ratio']
    angle_min, angle_max = thresholds['angle']

    zn_coords_list = []
    best_scores = []
    angle_list = []
    # NOTE(review): nothing below appends to proximity_data, so it is always
    # returned empty and the proximity Excel sheet written by the caller is
    # empty too. Confirm whether find_proximity_amino_acids was meant to be
    # called for the accepted candidate (and with which radius).
    proximity_data = []

    half_step = grid_resolution / 2

    for i in range(len(ca_coords)):
        # Positional access keeps the metadata aligned with ca_coords.iloc[i]
        # even if df_alanine does not have a default 0..n-1 index.
        entry_id = df_alanine['PDB_ID'].iloc[i]
        structure = load_pdb_structure(entry_id, pdb_directory)

        # Extract and reshape Cα and Cβ coordinates (row = residue, column = XYZ)
        ca_xyz = ca_coords.iloc[i].values.reshape(3, 3)
        cb_xyz = cb_coords.iloc[i].values.reshape(3, 3)

        # Define triad residues (chain IDs are ignored by define_excluded_triads)
        triad_residues = [
            (None, df_alanine['Coord_residue_number1'].iloc[i]),
            (None, df_alanine['Coord_residue_number2'].iloc[i]),
            (None, df_alanine['Coord_residue_number3'].iloc[i]),
        ]

        # Excluded residues: every chain's residue carrying one of those numbers
        excluded_residues = define_excluded_triads(triad_residues, structure)

        # Intersect the per-residue outer boxes to get the shared search region
        shared_x_min = shared_y_min = shared_z_min = -np.inf
        shared_x_max = shared_y_max = shared_z_max = np.inf

        for j in range(3):
            # x and z are padded with the Cα maximum distance, y with the Cβ
            # maximum distance (kept as in the original search box).
            x_min_outer = min(ca_xyz[j, 0], cb_xyz[j, 0]) - ca_max
            x_max_outer = max(ca_xyz[j, 0], cb_xyz[j, 0]) + ca_max

            y_min_outer = min(ca_xyz[j, 1], cb_xyz[j, 1]) - cb_max
            y_max_outer = max(ca_xyz[j, 1], cb_xyz[j, 1]) + cb_max

            z_min_outer = min(ca_xyz[j, 2], cb_xyz[j, 2]) - ca_max
            z_max_outer = max(ca_xyz[j, 2], cb_xyz[j, 2]) + ca_max

            # Update shared region with intersection (one grid step of slack)
            shared_x_min = max(shared_x_min, x_min_outer - grid_resolution)
            shared_x_max = min(shared_x_max, x_max_outer + grid_resolution)
            shared_y_min = max(shared_y_min, y_min_outer - grid_resolution)
            shared_y_max = min(shared_y_max, y_max_outer + grid_resolution)
            shared_z_min = max(shared_z_min, z_min_outer - grid_resolution)
            shared_z_max = min(shared_z_max, z_max_outer + grid_resolution)

        # Ensure valid search space exists. Written as "not (a < b ...)" so that
        # NaN bounds (from NaN coordinates) are rejected here as well.
        has_shared_region = (shared_x_min < shared_x_max and
                             shared_y_min < shared_y_max and
                             shared_z_min < shared_z_max)
        if not has_shared_region:
            print(f"No shared search space for Entry {i}: {entry_id}")
            zn_coords_list.append("no metal")
            best_scores.append(0)
            angle_list.append([None, None, None])
            continue

        # Debug print for shared search region
        print(f"Shared Region for Entry {i}: {entry_id}")
        print(f"x_min: {shared_x_min}, x_max: {shared_x_max}")
        print(f"y_min: {shared_y_min}, y_max: {shared_y_max}")
        print(f"z_min: {shared_z_min}, z_max: {shared_z_max}")

        # Scan the shared region; counters record how many points survive each stage
        zn_candidates = []
        total_grid_points = 0
        distance_valid_points = 0
        angle_valid_points = 0
        ratio_valid_points = 0
        probability_valid_points = 0

        # The axis ticks do not depend on the loop variables, so build them once.
        x_ticks = np.arange(shared_x_min, shared_x_max + 1e-8, grid_resolution)
        y_ticks = np.arange(shared_y_min, shared_y_max + 1e-8, grid_resolution)
        z_ticks = np.arange(shared_z_min, shared_z_max + 1e-8, grid_resolution)

        for x in x_ticks:
            for y in y_ticks:
                for z in z_ticks:
                    total_grid_points += 1

                    # Check both the corner and the centre of the grid cell
                    corner_point = np.array([x, y, z])
                    center_point = corner_point + half_step

                    for point in (corner_point, center_point):
                        # Distance check
                        distances_ca = np.linalg.norm(ca_xyz - point, axis=1)
                        distances_cb = np.linalg.norm(cb_xyz - point, axis=1)
                        if not (np.all((ca_min <= distances_ca) & (distances_ca <= ca_max)) and
                                np.all((cb_min <= distances_cb) & (distances_cb <= cb_max))):
                            continue
                        distance_valid_points += 1

                        # Angle check
                        angles = calculate_angles(point, ca_xyz, cb_xyz)
                        if not all(angle_min <= angle <= angle_max for angle in angles):
                            continue
                        angle_valid_points += 1

                        # Ratio check
                        ratios = calculate_ratio(point, ca_xyz, cb_xyz)
                        if not np.all((ratio_min <= ratios) & (ratios <= ratio_max)):
                            continue
                        ratio_valid_points += 1

                        # Probability-map check
                        score = score_zn_predictions(
                            distances_ca,
                            distances_cb,
                            angles,
                            prob_map_3d, ca_bins, cb_bins, angle_bins)

                        if score is not None and score > 0:
                            probability_valid_points += 1
                            # Store the point that actually passed (corner OR
                            # centre), not the cell corner in both cases.
                            zn_candidates.append(point)

        # Print filtering statistics
        print(f"Total grid points searched: {total_grid_points}")
        print(f"Points passing distance criteria: {distance_valid_points}")
        print(f"Points passing angle criteria: {angle_valid_points}")
        print(f"Points passing ratio criteria: {ratio_valid_points}")
        print(f"Points passing probability criteria: {probability_valid_points}")

        zn_candidates = np.array(zn_candidates)

        # Filter Zn candidates by distance and proximity
        distance_filtered_candidates = filter_by_distance_threshold(ca_xyz, cb_xyz, zn_candidates, ca_xyz, cb_xyz)

        # NOTE(review): the first candidate (in scan order) that passes every
        # check is kept, not the highest-scoring one, despite the name
        # "best_scores". Confirm that this is the intended selection rule.
        for zn_candidate in distance_filtered_candidates:
            angles = calculate_angles(zn_candidate, ca_xyz, cb_xyz)
            if any(angle > angle_max or angle < angle_min for angle in angles):
                continue

            score = score_zn_predictions(
                np.linalg.norm(zn_candidate - ca_xyz, axis=1),
                np.linalg.norm(zn_candidate - cb_xyz, axis=1),
                angles,
                prob_map_3d, ca_bins, cb_bins, angle_bins
            )

            if score is not None and score > 0:
                if proximity_filter(structure, zn_candidate, excluded_residues):
                    zn_coords_list.append(zn_candidate)
                    best_scores.append(score)
                    angle_list.append(angles)
                    break

        # Nothing was accepted for this row: record a placeholder so the
        # output lists stay aligned with the input rows.
        if len(zn_coords_list) <= i:
            zn_coords_list.append("no metal")
            best_scores.append(0)
            angle_list.append([None, None, None])

    return zn_coords_list, best_scores, angle_list, proximity_data


# Define coordinates for Zn estimation and specify PDB directory
# Column order is residue 1..3, each as X, Y, Z (nine columns per frame).
ca_coords = df_alanine[[f'CA{slot}_{axis}' for slot in (1, 2, 3) for axis in 'XYZ']]
cb_coords = df_alanine[[f'CB{slot}_{axis}' for slot in (1, 2, 3) for axis in 'XYZ']]
pdb_directory = '/content/3p43_alanine.pdb'  # Replace with actual path to PDB files

# Run Zn estimation with grid generation, scoring, and iterative proximity filtering
estimated_zn_coords_grid, zn_scores, angles_list, proximity_data = estimate_zn_iterative(
    ca_coords, cb_coords, prob_map_3d, ca_bins, cb_bins, angle_bins, pdb_directory, grid_resolution=0.2
)

# Attach the results to df_alanine. Rows without a Zn position carry a string
# placeholder in estimated_zn_coords_grid; those become None in the X/Y/Z columns.
df_alanine['Zn_X_Grid'], df_alanine['Zn_Y_Grid'], df_alanine['Zn_Z_Grid'] = (
    [None if isinstance(coords, str) else coords[axis_position] for coords in estimated_zn_coords_grid]
    for axis_position in range(3)
)
df_alanine['Zn_Score'] = zn_scores
# Transpose the per-row angle triples into three per-angle columns
df_alanine['Angle_1'], df_alanine['Angle_2'], df_alanine['Angle_3'] = zip(*angles_list)

# Remove rows where Zn_Score is 0
df_alanine = df_alanine.loc[df_alanine['Zn_Score'] != 0]

# @markdown ### Save the Filtered DataFrame to an Excel File
output_file_path = '/content/3ttis_coordinates_1_result.xlsx'  # @param {type:"string"}
df_alanine.to_excel(output_file_path, index=False)
# @markdown **Filtered Zn coordinates, scores, and angles saved.**
print(f"Filtered Zn coordinates, scores, and angles saved to '{output_file_path}'")

# @markdown ### Save the Proximity Information to an Excel File
output_proximity_file = '/content/test_proximity3.xlsx' # @param {type:"string"}
df_proximity = pd.DataFrame(proximity_data)
df_proximity.to_excel(output_proximity_file, index=False)
# @markdown **Proximity amino acid details saved.**
print(f"Proximity amino acid details saved to '{output_proximity_file}'")

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
from Bio.PDB import PDBParser
import requests

# Markdown documentation for file pathways

# @markdown # Step 5: Analyze the result (Apply to the PDB file)

# Load input file
input_file_path = "/content/3ttis_coordinates_1_result.xlsx" # @param {type:"string"}
df_new = pd.read_excel(input_file_path)

# Number the combinations 1..N; the number names the PyMOL selections (obj01, obj02, ...)
df_new['Combination_Number'] = range(1, len(df_new) + 1)

# Generate the PyMOL script for both valid and invalid Zn binding forms
pymol_script_commands = []
for index, row in df_new.iterrows():
    selection_name = f"obj{row['Combination_Number']:02d}"

    # (chain, residue number) of the three coordinating residues
    triad = [
        (row[f'Coord_chain_id_number{slot}'], row[f'Coord_residue_number{slot}'])
        for slot in (1, 2, 3)
    ]

    # One selection covering all three residues
    residue_clauses = " or ".join(f"(chain {chain} and resi {resi})" for chain, resi in triad)
    pymol_script_commands.append(f"select {selection_name}, {residue_clauses}")

    # One object per residue, addressed inside the object named after the PDB ID
    pymol_script_commands.extend(
        f"create {selection_name}_residue{slot}, /{row['PDB_ID']}//{chain}/{resi}"
        for slot, (chain, resi) in enumerate(triad, start=1)
    )

    # Retrieve Zn coordinates
    zn_x, zn_y, zn_z = row['Zn_X_Grid'], row['Zn_Y_Grid'], row['Zn_Z_Grid']

    if pd.isna(zn_x) or pd.isna(zn_y) or pd.isna(zn_z):
        # Zn coordinates are missing, mark this combination as non-binding
        pymol_script_commands.append(f"# {selection_name} does not bind Zn")
    else:
        # Zn coordinates are present, add the Zn pseudoatom and show it as a sphere.
        # NOTE(review): "elem=Metal" is not an element symbol; confirm PyMOL accepts
        # it, or switch to the intended element.
        zn_name = f"{selection_name}_Metal"
        pymol_script_commands.append(f"pseudoatom {zn_name}, pos=[{zn_x}, {zn_y}, {zn_z}], elem=Metal, name={zn_name}")
        pymol_script_commands.append(f"show sphere, {zn_name}")

# Save the commands into a PyMOL script
pymol_script_file = "/content/3tis_final.pml" # @param {type:"string"}
with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing both Zn-binding and non-binding residue combinations\n\n")
    f.writelines(f"{command}\n" for command in pymol_script_commands)

print(f"PyMOL script saved to {pymol_script_file}")

PyMOL script saved to /content/3tis_final.pml


In [None]:
import pandas as pd

# Markdown documentation for file pathways

# @markdown # Step 5: Analyze the result (Sort the result based on the metal-ligating ligands)


# Load the provided Excel file
file_path = '/content/3ttis_coordinates_1_result.xlsx' # @param {type:"string"}

# Load the data from the first sheet. Using ExcelFile as a context manager
# closes the workbook handle once the sheet has been parsed, instead of leaving
# it open for the rest of the session.
with pd.ExcelFile(file_path) as excel_data:
    df = excel_data.parse('Sheet1')

# Define function to count specific residues in the 'Combination' column
def count_residues(row):
    """
    Count how many times the residue names HIS, CYS, GLU and ASP occur in
    ``row['Combination']``.

    Occurrences are counted as substrings, summed over the four names.

    Returns:
        The total number of occurrences as an int.
    """
    total = 0
    for residue_name in ('HIS', 'CYS', 'GLU', 'ASP'):
        total += row['Combination'].count(residue_name)
    return total

# Apply the function to each row and store the result in a new column 'Residue_Count'
df['Residue_Count'] = df.apply(count_residues, axis=1)

# Split the data based on the count of residues and save to separate sheets in a new Excel file
output_file_path = '/content/3ttis_coordinates_1_result_residue_count.xlsx'  # @param {type:"string"}
with pd.ExcelWriter(output_file_path) as writer:
    # sort=False keeps the sheets in order of first appearance of each count
    for count, df_filtered in df.groupby('Residue_Count', sort=False):
        df_filtered.to_excel(writer, sheet_name=f'Residue_Count_{count}', index=False)

print("File saved at:", output_file_path)

File saved at: /content/3ttis_coordinates_1_result_residue_count.xlsx
