In [None]:
# @markdown # Step 1: Install all necessary packages
# Install all necessary packages
!pip install biopython
!pip install --upgrade tqdm
!apt-get install -y pymol

# Import necessary modules
from Bio.PDB import PDBParser, Selection, NeighborSearch
from Bio.PDB.Polypeptide import is_aa
from tqdm import tqdm
import torch

In [None]:
#@markdown # Step 2: Mutate the glycine to alanine

# Import the necessary modules
import pymol2

#@markdown **Note:** Specify the paths to the input and output PDB files below.

#@markdown ### Enter the path to your input PDB file:
pdb_file_path = "/content/1c2a.pdb"  #@param {type:"string"}

#@markdown ### Enter the path to your output PDB file:
output_file_path = "/content/1c2a_alanine.pdb"  #@param {type:"string"}

# Mutate every glycine in the input structure to alanine with PyMOL's
# mutagenesis wizard and write the result to output_file_path.
with pymol2.PyMOL() as pymol:
    # Start from a clean session, then load the structure file
    pymol.cmd.reinitialize()
    pymol.cmd.load(pdb_file_path)

    # get_model() returns one entry per ATOM, so every glycine appears several
    # times in this list.
    glycine_residues = pymol.cmd.get_model("resn GLY").atom

    # Collapse the atoms to one "resi X and chain Y" selection per residue.
    # dict.fromkeys drops duplicates while keeping first-seen order, so each
    # residue goes through the wizard exactly once instead of once per atom.
    glycine_selections = list(dict.fromkeys(
        f"resi {atom.resi} and chain {atom.chain}" for atom in glycine_residues
    ))

    for selection_str in glycine_selections:
        # Apply the mutation using the mutagenesis wizard
        pymol.cmd.wizard("mutagenesis")
        pymol.cmd.refresh_wizard()
        pymol.cmd.get_wizard().do_select(selection_str)
        pymol.cmd.get_wizard().set_mode("ALA")
        pymol.cmd.get_wizard().apply()
        # NOTE(review): kept from the original ("delete the original residue to
        # avoid clashes"). cmd.delete() is documented as taking object/selection
        # NAMES - confirm it has the intended effect for a selection expression.
        pymol.cmd.delete(selection_str)

    # Save the mutated structure
    pymol.cmd.save(output_file_path)


In [None]:
# Existing imports and setup
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import itertools
import os
from IPython.display import display, Markdown

# Markdown documentation for file pathways

# @markdown # Step 3A: Run the Metal-Installer

# Import the necessary modules
import pymol2
from IPython.display import display, Markdown
import requests  # Required for downloading files from GitHub

# User-editable form parameters for Step 3A. The trailing "# @param" comments
# are left exactly as they were.

# @markdown **Note:** Specify the paths to the input and output files below.

# @markdown ### Enter the path to your input PDB file:
pdb_file = "/content/1c2a_alanine.pdb"  # @param {type:"string"}

# @markdown ### Enter the path to your output Excel file:
output_excel_file = "/content/1c2a_alanine_94.xlsx"  # @param {type:"string"}

# @markdown ### Enter the path to your PyMOL script file:
pymol_script_file = "/content/1c2a_alanine_94.pml"  # @param {type:"string"}

# Metal, Combinations and Range are interpolated into the thresholds download
# URL further down in this cell, as {Metal}/{Combinations}/{Range}.xlsx.

# @markdown ### Set the metal type to use:
Metal = 'Mn'  # @param ["Zn", "Mn", "Cu"]

# @markdown ### Select the combination type:
# @markdown **Note:** The 2His_1Cys combination is only available for Cu.
Combinations = '2His_1Asp'  # @param ["3His", "2His_1Asp", "2His_1Glu", "2His_1Cys"]

# @markdown ### Choose a threshold for analysis:
Range = '4'  # @param ["1", "2", "3", "4","5"]

# 0 disables the residue filter; any other value keeps only residue triplets
# that contain a residue with this sequence number (the chain is not checked).
# @markdown ### Specify the specific residue number (use 0 for no specific residue):
specific_residue_number = 0  # @param {type:"integer"}

# Download the thresholds workbook that matches the selected metal,
# combination and range, then read the four (min, max) ranges used by the
# distance, ratio and pie filters.

# Base URL for the thresholds files on GitHub
base_url = "https://raw.githubusercontent.com/SNU-Songlab/Metal-Installer-code/main/Threshold"

# Construct the full URL based on user input
thresholds_url = f"{base_url}/{Metal}/{Combinations}/{Range}.xlsx"

# Define the local path to save the downloaded file
thresholds_file = "/content/thresholds.xlsx"  # This path will be used throughout the script

# Download the file from GitHub. Without a timeout a stalled connection would
# block the cell indefinitely.
response = requests.get(thresholds_url, timeout=30)

# Fail loudly on anything but a successful download, otherwise save the file
if response.status_code != 200:
    raise ValueError(f"Failed to download file from {thresholds_url}. Status code: {response.status_code}")

with open(thresholds_file, 'wb') as file:
    file.write(response.content)
print(f"File downloaded successfully from {thresholds_url} and saved as {thresholds_file}")

# Load thresholds from the downloaded Excel file
thresholds_df = pd.read_excel(thresholds_file, sheet_name='Sheet1')

# Extract threshold values; rows with a blank Min or Max are skipped
thresholds = {}
for _, row in thresholds_df.iterrows():
    parameter = row['Parameter']
    min_value = row['Min']
    max_value = row['Max']

    if pd.notna(min_value) and pd.notna(max_value):
        thresholds[parameter] = (min_value, max_value)

# Report every missing parameter at once instead of failing on the first
# lookup with a bare key name. KeyError is kept as the exception type.
required_parameters = (
    'alpha_distance_range',
    'beta_distance_range',
    'ratio_threshold_range',
    'pie_threshold_range',
)
missing_parameters = [name for name in required_parameters if name not in thresholds]
if missing_parameters:
    raise KeyError(
        f"Thresholds file {thresholds_url} has no usable Min/Max values for: "
        f"{', '.join(missing_parameters)}"
    )

# Assign threshold values
alpha_distance_range = thresholds['alpha_distance_range']
beta_distance_range = thresholds['beta_distance_range']
ratio_threshold_range = thresholds['ratio_threshold_range']
pie_threshold_range = thresholds['pie_threshold_range']

# Parse the input structure and work on its first model only
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)
model = structure[0]

# Keep residues whose id starts with a blank field (first element of get_id())
residues = list(filter(lambda res: res.get_id()[0] == ' ', model.get_residues()))

# Step 1: enumerate residue triplets; when a residue number is given, keep
# only the triplets that contain a residue with that number
all_triplets = itertools.combinations(residues, 3)
if specific_residue_number == 0:
    combinations = list(all_triplets)
else:
    combinations = [
        triplet for triplet in all_triplets
        if specific_residue_number in (member.get_id()[1] for member in triplet)
    ]

# Distance filter: keep a triplet only when all three of its residue pairs
# have CA-CA and CB-CB distances inside the configured ranges.
filtered_data_distances = []

for idx, combination in enumerate(combinations):
    alpha_distances = []
    beta_distances = []

    try:
        for res_a, res_b in itertools.combinations(combination, 2):
            # A pair lacking a CA or a CB atom contributes nothing, which
            # leaves the triplet short of three accepted pairs.
            if not (res_a.has_id('CA') and res_b.has_id('CA')):
                continue
            alpha_distance = np.linalg.norm(res_a['CA'].coord - res_b['CA'].coord)

            if not (res_a.has_id('CB') and res_b.has_id('CB')):
                continue
            beta_distance = np.linalg.norm(res_a['CB'].coord - res_b['CB'].coord)

            if not (alpha_distance_range[0] <= alpha_distance <= alpha_distance_range[1]):
                continue
            if not (beta_distance_range[0] <= beta_distance <= beta_distance_range[1]):
                continue

            alpha_distances.append(alpha_distance)
            beta_distances.append(beta_distance)

        if len(alpha_distances) < 3 or len(beta_distances) < 3:
            continue

        # One record per accepted triplet: identifiers for each of the three
        # residues followed by the three alpha and three beta distances.
        record = {'PDB_ID': pdb_file, 'Combination': combination}
        for position, member in enumerate(combination, start=1):
            full_id = member.get_full_id()
            record[f'Coord_chain_id_number{position}'] = full_id[2]
            record[f'Coord_residue_number{position}'] = full_id[3][1]
            record[f'Coord_residue_name{position}'] = member.get_resname()
            record[f'Coord_atom_name{position}'] = 'CA'
        for position in range(3):
            record[f'Alpha Distance {position + 1}'] = alpha_distances[position]
        for position in range(3):
            record[f'Beta Distance {position + 1}'] = beta_distances[position]
        filtered_data_distances.append(record)

    except KeyError as e:
        print(f"Error processing combination {combination}: {e}")

# Create DataFrame for distances
column_order = [
    'PDB_ID',
    'Combination',
    'Coord_chain_id_number1', 'Coord_residue_number1', 'Coord_residue_name1', 'Coord_atom_name1',
    'Coord_chain_id_number2', 'Coord_residue_number2', 'Coord_residue_name2', 'Coord_atom_name2',
    'Coord_chain_id_number3', 'Coord_residue_number3', 'Coord_residue_name3', 'Coord_atom_name3',
    'Alpha Distance 1', 'Alpha Distance 2', 'Alpha Distance 3',
    'Beta Distance 1', 'Beta Distance 2', 'Beta Distance 3'
]

# Passing columns= fixes the column order and, unlike selecting the columns
# afterwards, also works when no triplet passed the distance filter (an empty
# list would otherwise give a frame without columns and raise KeyError).
df_distances = pd.DataFrame(filtered_data_distances, columns=column_order)

# Ratio filter: keep a row only when alpha/beta lies inside the ratio range
# for all three residue pairs.
ratio_min, ratio_max = ratio_threshold_range
filtered_data_ratio = []

for idx, row in df_distances.iterrows():
    alpha_distances = [row['Alpha Distance 1'], row['Alpha Distance 2'], row['Alpha Distance 3']]
    beta_distances = [row['Beta Distance 1'], row['Beta Distance 2'], row['Beta Distance 3']]

    # A zero beta distance cannot form a ratio; treat it as failing the filter
    # rather than dividing by zero.
    ratios_in_range = all(
        beta != 0 and ratio_min <= alpha / beta <= ratio_max
        for alpha, beta in zip(alpha_distances, beta_distances)
    )
    if ratios_in_range:
        filtered_data_ratio.append(row)

# Same empty-safe construction as above; row labels come from the kept rows.
df_ratio = pd.DataFrame(filtered_data_ratio, columns=column_order)


# Pie filter
def calculate_pie(vector1, vector2):
    """Return the angle between two vectors in degrees.

    Returns NaN when either vector has zero length. The cosine is clipped to
    [-1, 1] before arccos so rounding error cannot push it out of domain.
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return np.nan
    cos_theta = np.clip(np.dot(vector1, vector2) / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(cos_theta))

def process_row(row):
    """Compute the three "pie" angles for one residue triplet.

    For the residue pairs (1, 2), (1, 3) and (2, 3) of row['Combination'],
    the angle (in degrees, via calculate_pie) between the CA->CA vector and
    the CB->CB vector is computed. A residue without a CB atom falls back to
    its CA atom.

    Args:
        row: a mapping / Series whose 'Combination' entry holds three residue
            objects supporting residue['CA'] and 'CB' in residue.

    Returns:
        A list of three angles in pair order (1-2, 1-3, 2-3). An entry is None
        when an atom lookup raised KeyError; the whole list is
        [None, None, None] when the input PDB file is missing or the row has
        no 'Combination' entry.
    """
    # Kept from the original so a missing input file still yields an all-None
    # row. The residues in the row already carry their coordinates, so the
    # file itself is no longer re-parsed here: the original parsed it once per
    # row and never used the result.
    if not os.path.isfile(pdb_file):
        print(f"PDB file not found: {pdb_file}")
        return [None, None, None]

    try:
        residues = row['Combination']
    except KeyError as e:
        print(f"KeyError: {e}")
        return [None, None, None]
    print(f"Residues loaded: {residues}")

    pies = []
    for i, j in ((0, 1), (0, 2), (1, 2)):
        try:
            CA1 = residues[i]['CA']
            CA2 = residues[j]['CA']
            CB1 = residues[i]['CB'] if 'CB' in residues[i] else CA1
            CB2 = residues[j]['CB'] if 'CB' in residues[j] else CA2

            vector_CA = CA2.coord - CA1.coord
            vector_CB = CB2.coord - CB1.coord

            pies.append(calculate_pie(vector_CA, vector_CB))
        except KeyError as e:
            print(f"KeyError for residues {residues[i]} and {residues[j]}: {e}")
            pies.append(None)

    return pies

# Pie filter: compute the three pie angles per row, flag which ones fall
# strictly inside pie_threshold_range, and keep the rows where all three do.
pie_columns = ['Pie_1_2', 'Pie_1_3', 'Pie_2_3']

# Build the result frame explicitly so it always has exactly the three pie
# columns. apply(..., result_type='expand') on an empty frame returns the
# original columns instead, which cannot be assigned to three new columns.
pie_results = pd.DataFrame(
    [process_row(row) for _, row in df_ratio.iterrows()],
    index=df_ratio.index,
    columns=pie_columns,
)
df_ratio[pie_columns] = pie_results

# Create filter columns based on pie thresholds (both bounds exclusive).
# Missing angles (None/NaN) become NaN and compare as False.
pie_min, pie_max = pie_threshold_range
for col in pie_columns:
    angles = pd.to_numeric(df_ratio[col], errors='coerce')
    df_ratio[f'{col}_Filter'] = (angles > pie_min) & (angles < pie_max)

df_ratio['Pie_Filter'] = df_ratio[[f'{col}_Filter' for col in pie_columns]].all(axis=1)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' chained-assignment warning.
df_final_filter = df_ratio[df_ratio['Pie_Filter']].copy()

# Save all DataFrames into a single Excel file with different tabs
with pd.ExcelWriter(output_excel_file) as writer:
    df_distances.to_excel(writer, sheet_name='Distances', index=False)
    df_ratio.to_excel(writer, sheet_name='Ratio', index=False)
    df_final_filter.to_excel(writer, sheet_name='Pie', index=False)

# Generate PyMOL script file
# A PyMOL selection macro has the form /object/segi/chain/resi, so it needs the
# object name, not the file path. PyMOL names a loaded object after the file
# name without its extension; putting the full path into the macro (as the
# original did) adds extra "/" separators and produces an invalid selection.
pymol_object_name = os.path.splitext(os.path.basename(pdb_file))[0]

pymol_script_commands = []
# assign() returns a new frame, so the numbering column is never written into
# a slice of df_ratio.
df_final_filter = df_final_filter.assign(Combination_Number=range(1, len(df_final_filter) + 1))

for index, row in df_final_filter.iterrows():
    combination = row['Combination']
    selection_name = f"obj{row['Combination_Number']:02d}"

    # (chain id, residue number) for each of the three residues
    locations = [
        (residue.get_full_id()[2], residue.get_full_id()[3][1])
        for residue in combination
    ]

    # One selection covering the whole triplet, then one object per residue
    selection_expr = " or ".join(f"(chain {chain} and resi {resi})" for chain, resi in locations)
    pymol_script_commands.append(f"select {selection_name}, {selection_expr}")
    for position, (chain, resi) in enumerate(locations, start=1):
        pymol_script_commands.append(
            f"create {selection_name}_residue{position}, /{pymol_object_name}//{chain}/{resi}"
        )

with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing filtered residue combinations\n\n")
    for command in pymol_script_commands:
        f.write(command + '\n')

print(f"\nResults saved to {output_excel_file}")
print(f"PyMOL script saved to {pymol_script_file}")


In [None]:
# Existing imports and setup
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import itertools
import os
from IPython.display import display, Markdown

# Markdown documentation for file pathways

# @markdown # Step 3B: Run the Metal-Installer

# Import the necessary modules
import pymol2
from IPython.display import display, Markdown
import requests  # Required for downloading files from GitHub

# User-editable form parameters for Step 3B. The trailing "# @param" comments
# are left exactly as they were.

# @markdown **Note:** Specify the paths to the input and output files below.

# @markdown ### Enter the path to your input PDB file:
pdb_file = "/content/1c2a_alanine.pdb"  # @param {type:"string"}

# NOTE(review): the default below says "1ca2" while the input file is named
# "1c2a" - confirm which spelling is intended.
# @markdown ### Enter the path to your output Excel file:
output_excel_file = "/content/1ca2_model_alanine.xlsx"  # @param {type:"string"}

# NOTE(review): the default below names a different structure ("OmpF_dimer")
# than the input file - looks like a leftover default; confirm.
# @markdown ### Enter the path to your PyMOL script file:
pymol_script_file = "/content/OmpF_dimer_alanine_83_2_output_tml.pml"  # @param {type:"string"}

# 0 disables the residue filter; any other value keeps only residue triplets
# that contain a residue with this sequence number (the chain is not checked).
# @markdown ### Specify the specific residue number (use 0 for no specific residue):
specific_residue_number = 96  # @param {type:"integer"}

# @markdown ### Set the thresholds for analysis
# @markdown **Alpha Distance**: Enter the minimum and maximum values.
alpha_distance_min = 4.0  # @param {type:"number"}
alpha_distance_max = 10.4  # @param {type:"number"}

# @markdown **Beta Distance**: Enter the minimum and maximum values.
beta_distance_min = 5.0  # @param {type:"number"}
beta_distance_max = 9.3  # @param {type:"number"}

# @markdown **Ratio Threshold**: Enter the minimum and maximum values.
ratio_threshold_min = 0.7  # @param {type:"number"}
ratio_threshold_max = 1.4  # @param {type:"number"}

# @markdown **Pie Threshold**: Enter the minimum and maximum values.
pie_threshold_min = 0  # @param {type:"number"}
pie_threshold_max = 20  # @param {type:"number"}

# Set thresholds based on user inputs: one (min, max) tuple per filter.
# The original built and printed these tuples twice; once is enough.
alpha_distance_range = (alpha_distance_min, alpha_distance_max)
beta_distance_range = (beta_distance_min, beta_distance_max)
ratio_threshold_range = (ratio_threshold_min, ratio_threshold_max)
pie_threshold_range = (pie_threshold_min, pie_threshold_max)

# An inverted range can never match anything, so report it here instead of
# letting the filters silently return no results.
for range_name, (range_min, range_max) in (
    ('Alpha Distance', alpha_distance_range),
    ('Beta Distance', beta_distance_range),
    ('Ratio Threshold', ratio_threshold_range),
    ('Pie Threshold', pie_threshold_range),
):
    if range_min > range_max:
        raise ValueError(f"{range_name} minimum ({range_min}) is greater than its maximum ({range_max})")

# Output the ranges to confirm
print(f"Alpha Distance Range: {alpha_distance_range}")
print(f"Beta Distance Range: {beta_distance_range}")
print(f"Ratio Threshold Range: {ratio_threshold_range}")
print(f"Pie Threshold Range: {pie_threshold_range}")

# Parse the input structure and work on its first model only
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)
model = structure[0]

# Keep residues whose id starts with a blank field (first element of get_id())
residues = list(filter(lambda res: res.get_id()[0] == ' ', model.get_residues()))

# Step 1: enumerate residue triplets; when a residue number is given, keep
# only the triplets that contain a residue with that number
all_triplets = itertools.combinations(residues, 3)
if specific_residue_number == 0:
    combinations = list(all_triplets)
else:
    combinations = [
        triplet for triplet in all_triplets
        if specific_residue_number in (member.get_id()[1] for member in triplet)
    ]


# Distance filter: keep a triplet only when all three of its residue pairs
# have CA-CA and CB-CB distances inside the configured ranges.
filtered_data_distances = []

for idx, combination in enumerate(combinations):
    alpha_distances = []
    beta_distances = []

    try:
        for res_a, res_b in itertools.combinations(combination, 2):
            # A pair lacking a CA or a CB atom contributes nothing, which
            # leaves the triplet short of three accepted pairs.
            if not (res_a.has_id('CA') and res_b.has_id('CA')):
                continue
            alpha_distance = np.linalg.norm(res_a['CA'].coord - res_b['CA'].coord)

            if not (res_a.has_id('CB') and res_b.has_id('CB')):
                continue
            beta_distance = np.linalg.norm(res_a['CB'].coord - res_b['CB'].coord)

            if not (alpha_distance_range[0] <= alpha_distance <= alpha_distance_range[1]):
                continue
            if not (beta_distance_range[0] <= beta_distance <= beta_distance_range[1]):
                continue

            alpha_distances.append(alpha_distance)
            beta_distances.append(beta_distance)

        if len(alpha_distances) < 3 or len(beta_distances) < 3:
            continue

        # One record per accepted triplet: identifiers for each of the three
        # residues followed by the three alpha and three beta distances.
        record = {'PDB_ID': pdb_file, 'Combination': combination}
        for position, member in enumerate(combination, start=1):
            full_id = member.get_full_id()
            record[f'Coord_chain_id_number{position}'] = full_id[2]
            record[f'Coord_residue_number{position}'] = full_id[3][1]
            record[f'Coord_residue_name{position}'] = member.get_resname()
            record[f'Coord_atom_name{position}'] = 'CA'
        for position in range(3):
            record[f'Alpha Distance {position + 1}'] = alpha_distances[position]
        for position in range(3):
            record[f'Beta Distance {position + 1}'] = beta_distances[position]
        filtered_data_distances.append(record)

    except KeyError as e:
        print(f"Error processing combination {combination}: {e}")

# Create DataFrame for distances
column_order = [
    'PDB_ID',
    'Combination',
    'Coord_chain_id_number1', 'Coord_residue_number1', 'Coord_residue_name1', 'Coord_atom_name1',
    'Coord_chain_id_number2', 'Coord_residue_number2', 'Coord_residue_name2', 'Coord_atom_name2',
    'Coord_chain_id_number3', 'Coord_residue_number3', 'Coord_residue_name3', 'Coord_atom_name3',
    'Alpha Distance 1', 'Alpha Distance 2', 'Alpha Distance 3',
    'Beta Distance 1', 'Beta Distance 2', 'Beta Distance 3'
]

# Passing columns= fixes the column order and, unlike selecting the columns
# afterwards, also works when no triplet passed the distance filter (an empty
# list would otherwise give a frame without columns and raise KeyError).
df_distances = pd.DataFrame(filtered_data_distances, columns=column_order)

# Ratio filter: keep a row only when alpha/beta lies inside the ratio range
# for all three residue pairs.
ratio_min, ratio_max = ratio_threshold_range
filtered_data_ratio = []

for idx, row in df_distances.iterrows():
    alpha_distances = [row['Alpha Distance 1'], row['Alpha Distance 2'], row['Alpha Distance 3']]
    beta_distances = [row['Beta Distance 1'], row['Beta Distance 2'], row['Beta Distance 3']]

    # A zero beta distance cannot form a ratio; treat it as failing the filter
    # rather than dividing by zero.
    ratios_in_range = all(
        beta != 0 and ratio_min <= alpha / beta <= ratio_max
        for alpha, beta in zip(alpha_distances, beta_distances)
    )
    if ratios_in_range:
        filtered_data_ratio.append(row)

# Same empty-safe construction as above; row labels come from the kept rows.
df_ratio = pd.DataFrame(filtered_data_ratio, columns=column_order)

# Pie filter
def calculate_pie(vector1, vector2):
    """Return the angle between two vectors in degrees.

    Returns NaN when either vector has zero length. The cosine is clipped to
    [-1, 1] before arccos so rounding error cannot push it out of domain.
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return np.nan
    cos_theta = np.clip(np.dot(vector1, vector2) / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(cos_theta))

def process_row(row):
    """Compute the three "pie" angles for one residue triplet.

    For the residue pairs (1, 2), (1, 3) and (2, 3) of row['Combination'],
    the angle (in degrees, via calculate_pie) between the CA->CA vector and
    the CB->CB vector is computed. A residue without a CB atom falls back to
    its CA atom.

    Args:
        row: a mapping / Series whose 'Combination' entry holds three residue
            objects supporting residue['CA'] and 'CB' in residue.

    Returns:
        A list of three angles in pair order (1-2, 1-3, 2-3). An entry is None
        when an atom lookup raised KeyError; the whole list is
        [None, None, None] when the input PDB file is missing or the row has
        no 'Combination' entry.
    """
    # Kept from the original so a missing input file still yields an all-None
    # row. The residues in the row already carry their coordinates, so the
    # file itself is no longer re-parsed here: the original parsed it once per
    # row and never used the result.
    if not os.path.isfile(pdb_file):
        print(f"PDB file not found: {pdb_file}")
        return [None, None, None]

    try:
        residues = row['Combination']
    except KeyError as e:
        print(f"KeyError: {e}")
        return [None, None, None]
    print(f"Residues loaded: {residues}")

    pies = []
    for i, j in ((0, 1), (0, 2), (1, 2)):
        try:
            CA1 = residues[i]['CA']
            CA2 = residues[j]['CA']
            CB1 = residues[i]['CB'] if 'CB' in residues[i] else CA1
            CB2 = residues[j]['CB'] if 'CB' in residues[j] else CA2

            vector_CA = CA2.coord - CA1.coord
            vector_CB = CB2.coord - CB1.coord

            pies.append(calculate_pie(vector_CA, vector_CB))
        except KeyError as e:
            print(f"KeyError for residues {residues[i]} and {residues[j]}: {e}")
            pies.append(None)

    return pies

# Pie filter: compute the three pie angles per row, flag which ones fall
# strictly inside pie_threshold_range, and keep the rows where all three do.
pie_columns = ['Pie_1_2', 'Pie_1_3', 'Pie_2_3']

# Build the result frame explicitly so it always has exactly the three pie
# columns. apply(..., result_type='expand') on an empty frame returns the
# original columns instead, which cannot be assigned to three new columns.
pie_results = pd.DataFrame(
    [process_row(row) for _, row in df_ratio.iterrows()],
    index=df_ratio.index,
    columns=pie_columns,
)
df_ratio[pie_columns] = pie_results

# Create filter columns based on pie thresholds (both bounds exclusive).
# Missing angles (None/NaN) become NaN and compare as False.
pie_min, pie_max = pie_threshold_range
for col in pie_columns:
    angles = pd.to_numeric(df_ratio[col], errors='coerce')
    df_ratio[f'{col}_Filter'] = (angles > pie_min) & (angles < pie_max)

df_ratio['Pie_Filter'] = df_ratio[[f'{col}_Filter' for col in pie_columns]].all(axis=1)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' chained-assignment warning.
df_final_filter = df_ratio[df_ratio['Pie_Filter']].copy()

# Save all DataFrames into a single Excel file with different tabs
with pd.ExcelWriter(output_excel_file) as writer:
    df_distances.to_excel(writer, sheet_name='Distances', index=False)
    df_ratio.to_excel(writer, sheet_name='Ratio', index=False)
    df_final_filter.to_excel(writer, sheet_name='Pie', index=False)

# Generate PyMOL script file
# A PyMOL selection macro has the form /object/segi/chain/resi, so it needs the
# object name, not the file path. PyMOL names a loaded object after the file
# name without its extension; putting the full path into the macro (as the
# original did) adds extra "/" separators and produces an invalid selection.
pymol_object_name = os.path.splitext(os.path.basename(pdb_file))[0]

pymol_script_commands = []
# assign() returns a new frame, so the numbering column is never written into
# a slice of df_ratio.
df_final_filter = df_final_filter.assign(Combination_Number=range(1, len(df_final_filter) + 1))

for index, row in df_final_filter.iterrows():
    combination = row['Combination']
    selection_name = f"obj{row['Combination_Number']:02d}"

    # (chain id, residue number) for each of the three residues
    locations = [
        (residue.get_full_id()[2], residue.get_full_id()[3][1])
        for residue in combination
    ]

    # One selection covering the whole triplet, then one object per residue
    selection_expr = " or ".join(f"(chain {chain} and resi {resi})" for chain, resi in locations)
    pymol_script_commands.append(f"select {selection_name}, {selection_expr}")
    for position, (chain, resi) in enumerate(locations, start=1):
        pymol_script_commands.append(
            f"create {selection_name}_residue{position}, /{pymol_object_name}//{chain}/{resi}"
        )

with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing filtered residue combinations\n\n")
    for command in pymol_script_commands:
        f.write(command + '\n')

print(f"\nResults saved to {output_excel_file}")
print(f"PyMOL script saved to {pymol_script_file}")



