In [3]:
# Install all necessary packages
!pip install biopython
!pip install --upgrade tqdm
!apt-get install -y pymol

# Import necessary modules
from Bio.PDB import PDBParser, Selection, NeighborSearch
from Bio.PDB.Polypeptide import is_aa
from tqdm import tqdm
import torch

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apbs apbs-data freeglut3 libapbs3 libevdev2 libglu1-mesa libgudev-1.0-0 libinput-bin libinput10
  libmaloc1 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5 libqt5designer5 libqt5gui5 libqt5help5
  libqt5network5 libqt5opengl5 libqt5printsupport5 libqt5sql5 libqt5sql5-sqlite libqt5svg5
  libqt5test5 libqt5widgets5 libqt5xml5 libwacom-bin libwacom-common libwacom9 libxcb-icccm4
  libxcb-image0 libxcb-keysyms1 libxcb-render-util

In [None]:
# Mutate every glycine of the input structure to alanine with PyMOL's
# mutagenesis wizard and save the result as a new PDB file.
import pymol2
pdb_file_path = "/content/3iq5.pdb"
output_file_path = "/content/3iq5_alanine.pdb"

# Create an instance of the PyMOL session (stopped automatically on exit)
with pymol2.PyMOL() as pymol:
    cmd = pymol.cmd

    # Initialize PyMOL
    cmd.reinitialize()

    # Load the structure file
    cmd.load(pdb_file_path)

    # get_model() returns one entry per ATOM, so collapse the atoms to one
    # (chain, residue number) pair per glycine; dict.fromkeys keeps file order.
    glycine_sites = list(dict.fromkeys(
        (atom.chain, atom.resi) for atom in cmd.get_model("resn GLY").atom
    ))

    # Open the wizard once and reuse it for every site
    cmd.wizard("mutagenesis")
    cmd.refresh_wizard()
    wizard = cmd.get_wizard()

    for chain, residue_num in glycine_sites:
        # Construct the selection string in the format "resi X and chain Y"
        selection_str = f"resi {residue_num} and chain {chain}"
        # Choose the target residue type, pick the site, then commit the mutation.
        wizard.set_mode("ALA")
        wizard.do_select(selection_str)
        wizard.apply()
        # No explicit clean-up of the old residue: the original `cmd.delete(...)`
        # call only accepts object/selection NAMES, so it never removed atoms.
        # NOTE(review): apply() is expected to swap the residue in place - confirm.

    # Close the wizard so it does not linger in the session
    cmd.set_wizard()

    # Save the mutated structure
    cmd.save(output_file_path)

In [7]:
#final_all_result_file
import pandas as pd
import numpy as np
from Bio.PDB import PDBParser
import itertools
import os

# Configuration and thresholds
pdb_file = "/content/3iq5_alanine.pdb"  # Path to the alanine mutated pdb file
output_excel_file = "/content/3iq5_alanine_100.xlsx"  # Output for all filtered results in one file
pymol_script_file = "/content/3iq5_alanine_100_output_tml.pml"  # PyMOL script file for visualizing candidates for mutation

# Thresholds: every range is a (lower, upper) pair
alpha_distance_range = (3.8, 10.4)  # Threshold for alpha (CA-CA) distances
beta_distance_range = (5.0, 9.3)  # Threshold for beta (CB-CB) distances
ratio_threshold_range = (0.7, 1.4)  # Threshold for alpha/beta distance ratio (comma was missing: SyntaxError)
theta_threshold_range = (1.5, 110)  # Unified theta threshold range
pie_threshold_range = (0, 15)  # Unified pie threshold range
specific_residue_number = 100  # Specific residue number to target in combinations

# Parse the structure once and collect the residues of its first model whose
# hetero-flag (first field of the residue id) is blank.
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_file)
model = structure[0]
residues = []
for candidate in model.get_residues():
    hetero_flag = candidate.get_id()[0]
    if hetero_flag == ' ':
        residues.append(candidate)

# Function to calculate angle between residue vectors (theta)
def calculate_theta(residue1, residue2):
    """Return the angle, in degrees, between the CA->CB vectors of two residues.

    Each argument must support ``has_id(name)`` and ``residue[name].coord``.
    A residue without a ``CB`` atom contributes a zero vector, in which case
    the angle is undefined and ``np.nan`` is returned.
    """
    side_chain_vectors = []
    for residue in (residue1, residue2):
        if residue.has_id('CB'):
            side_chain_vectors.append(residue['CB'].coord - residue['CA'].coord)
        else:
            side_chain_vectors.append(np.zeros(3))
    first_vector, second_vector = side_chain_vectors

    dot_product = np.dot(first_vector, second_vector)
    norm_product = np.linalg.norm(first_vector) * np.linalg.norm(second_vector)
    if norm_product == 0:
        return np.nan

    # Clamp to [-1, 1] so rounding noise cannot push arccos out of its domain
    clamped_cosine = np.clip(dot_product / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(clamped_cosine))

# Step 1: enumerate every residue triple and keep those in which at least one
# member carries the targeted residue number
combinations = itertools.combinations(residues, 3)
selected_combinations = []
for triple in combinations:
    member_numbers = [member.get_id()[1] for member in triple]
    if specific_residue_number in member_numbers:
        selected_combinations.append(triple)

# Distance filter: keep a triple only when all three of its residue pairs fall
# inside both the CA-CA and the CB-CB distance windows
filtered_data_distances = []

for combination in selected_combinations:
    alpha_distances = []
    beta_distances = []

    try:
        for first, second in itertools.combinations(combination, 2):
            # A pair can only be measured when both residues have CA and CB atoms
            if not (first.has_id('CA') and second.has_id('CA')):
                continue
            if not (first.has_id('CB') and second.has_id('CB')):
                continue

            alpha_distance = np.linalg.norm(first['CA'].coord - second['CA'].coord)
            beta_distance = np.linalg.norm(first['CB'].coord - second['CB'].coord)

            alpha_in_range = alpha_distance_range[0] <= alpha_distance <= alpha_distance_range[1]
            beta_in_range = beta_distance_range[0] <= beta_distance <= beta_distance_range[1]
            if alpha_in_range and beta_in_range:
                alpha_distances.append(alpha_distance)
                beta_distances.append(beta_distance)

        # Fewer than three accepted pairs means at least one pair was rejected
        if len(alpha_distances) < 3 or len(beta_distances) < 3:
            continue

        record = {'PDB_ID': pdb_file, 'Combination': combination}
        for position, member in enumerate(combination, start=1):
            full_id = member.get_full_id()
            record[f'Coord_chain_id_number{position}'] = full_id[2]
            record[f'Coord_residue_number{position}'] = full_id[3][1]
            record[f'Coord_residue_name{position}'] = member.get_resname()
            record[f'Coord_atom_name{position}'] = 'CA'
        for position, distance in enumerate(alpha_distances[:3], start=1):
            record[f'Alpha Distance {position}'] = distance
        for position, distance in enumerate(beta_distances[:3], start=1):
            record[f'Beta Distance {position}'] = distance
        filtered_data_distances.append(record)

    except KeyError as e:
        print(f"Error processing combination {combination}: {e}")

# Create DataFrame for distances
column_order = [
    'PDB_ID',
    'Combination',
    'Coord_chain_id_number1', 'Coord_residue_number1', 'Coord_residue_name1', 'Coord_atom_name1',
    'Coord_chain_id_number2', 'Coord_residue_number2', 'Coord_residue_name2', 'Coord_atom_name2',
    'Coord_chain_id_number3', 'Coord_residue_number3', 'Coord_residue_name3', 'Coord_atom_name3',
    'Alpha Distance 1', 'Alpha Distance 2', 'Alpha Distance 3',
    'Beta Distance 1', 'Beta Distance 2', 'Beta Distance 3'
]

# Passing `columns=` orders the columns and, when no combination survived the
# distance filter, still yields an empty frame with every expected column.
# Selecting `df[column_order]` on a column-less empty frame raised KeyError.
df_distances = pd.DataFrame(filtered_data_distances, columns=column_order)

# Ratio filter: keep rows whose three alpha/beta distance ratios all lie inside
# ratio_threshold_range (bounds inclusive)
filtered_data_ratio = []
ratio_min, ratio_max = ratio_threshold_range

for _, row in df_distances.iterrows():
    pair_ratios = [
        row[f'Alpha Distance {pair}'] / row[f'Beta Distance {pair}']
        for pair in (1, 2, 3)
    ]
    if all(ratio_min <= ratio <= ratio_max for ratio in pair_ratios):
        filtered_data_ratio.append(row)

# `columns=` keeps the frame well-formed (empty but with all columns) when no
# row passes; indexing a column-less empty frame with column_order raised KeyError.
df_ratio = pd.DataFrame(filtered_data_ratio, columns=column_order)

# Theta filter: keep rows whose three pairwise CA->CB angles all lie inside
# theta_threshold_range (bounds inclusive), and record those angles
filtered_data_theta = []
theta_columns = ['Theta_1_2', 'Theta_1_3', 'Theta_2_3']
theta_min, theta_max = theta_threshold_range

for _, row in df_ratio.iterrows():
    combination = row['Combination']
    thetas = (
        calculate_theta(combination[0], combination[1]),
        calculate_theta(combination[0], combination[2]),
        calculate_theta(combination[1], combination[2]),
    )

    # A NaN angle (residue without CB) fails every comparison and is dropped
    if all(theta_min <= theta <= theta_max for theta in thetas):
        # Annotate a copy rather than the object handed out by iterrows()
        annotated_row = row.copy()
        for column, theta in zip(theta_columns, thetas):
            annotated_row[column] = theta
        filtered_data_theta.append(annotated_row)

# `columns=` keeps the frame well-formed (empty but with all columns) when no
# row passes; indexing a column-less empty frame raised KeyError.
df_theta = pd.DataFrame(filtered_data_theta, columns=column_order + theta_columns)

# Pie filter
def calculate_pie(vector1, vector2):
    """Return the angle, in degrees, between two vectors.

    Returns ``np.nan`` when either vector has zero length, because the angle
    is undefined in that case.
    """
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return np.nan

    # Clamp to [-1, 1] so rounding noise cannot push arccos out of its domain
    clamped_cosine = np.clip(dot_product / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(clamped_cosine))

def process_row(row):
    """Compute the three pairwise "pie" angles for one candidate row.

    For each residue pair (1-2, 1-3, 2-3) of ``row['Combination']`` the angle
    between the CA->CA vector and the CB->CB vector is computed with
    ``calculate_pie``. A residue without a ``CB`` atom falls back to its ``CA``.

    Parameters
    ----------
    row : mapping with a ``'Combination'`` entry holding three residues that
        support ``residue[name]`` and ``name in residue``.

    Returns
    -------
    list
        Three angles in degrees, in pair order (1-2, 1-3, 2-3). An entry is
        ``None`` when a required atom is missing; all three are ``None`` when
        the PDB file is absent or the row has no ``'Combination'`` entry.
    """
    # Kept for backward compatibility: rows are not processed when the source
    # file has disappeared. The file is no longer re-parsed here, because the
    # residues already travel with the row and the re-parsed model was unused.
    pdb_file_path = pdb_file
    if not os.path.isfile(pdb_file_path):
        print(f"PDB file not found: {pdb_file_path}")
        return [None, None, None]

    try:
        residues = row['Combination']
    except KeyError as e:
        print(f"KeyError: {e}")
        return [None, None, None]
    print(f"Residues loaded: {residues}")

    pies = []
    for i, j in [(0, 1), (0, 2), (1, 2)]:
        try:
            CA1 = residues[i]['CA']
            CA2 = residues[j]['CA']
            CB1 = residues[i]['CB'] if 'CB' in residues[i] else CA1
            CB2 = residues[j]['CB'] if 'CB' in residues[j] else CA2

            vector_CA = CA2.coord - CA1.coord
            vector_CB = CB2.coord - CB1.coord

            pies.append(calculate_pie(vector_CA, vector_CB))
        except KeyError as e:
            print(f"KeyError for residues {residues[i]} and {residues[j]}: {e}")
            pies.append(None)

    return pies

# Compute the three pie angles per row. The frame is built explicitly instead
# of using apply(..., result_type='expand'), which returns a copy of the input
# frame when df_theta is empty and makes the three-column assignment fail.
pie_columns = ['Pie_1_2', 'Pie_1_3', 'Pie_2_3']
pie_rows = [
    [np.nan if pie is None else pie for pie in process_row(row)]
    for _, row in df_theta.iterrows()
]
pie_results = pd.DataFrame(pie_rows, index=df_theta.index, columns=pie_columns).astype(float)
df_theta[pie_columns] = pie_results

# Create filter columns based on pie thresholds (both bounds exclusive).
# Missing angles are NaN, and NaN fails both comparisons, so they map to False.
pie_min, pie_max = pie_threshold_range
for col in pie_columns:
    df_theta[f'{col}_Filter'] = (df_theta[col] > pie_min) & (df_theta[col] < pie_max)

df_theta['Pie_Filter'] = df_theta[[f'{col}_Filter' for col in pie_columns]].all(axis=1)

# Independent copy, so columns can be added later without touching df_theta
df_final_filter = df_theta[df_theta['Pie_Filter']].copy()

# Write each filtering stage to its own sheet of a single Excel workbook
stage_sheets = {
    'Distances': df_distances,
    'Ratio': df_ratio,
    'Theta': df_theta,
    'Pie': df_final_filter,
}
with pd.ExcelWriter(output_excel_file) as writer:
    for stage_name, stage_frame in stage_sheets.items():
        stage_frame.to_excel(writer, sheet_name=stage_name, index=False)

# Generate PyMOL script file
pymol_script_commands = []
# assign() returns a new frame, so no column is written onto a filtered slice
df_final_filter = df_final_filter.assign(
    Combination_Number=range(1, len(df_final_filter) + 1)
)

# Selection macros have the form /object/segi/chain/resi. The object is named
# after the file WITHOUT directory or extension (PyMOL's default on load);
# embedding the full path, as before, produced an invalid macro.
# The script expects the structure to be loaded under this object name.
pymol_object_name = os.path.splitext(os.path.basename(pdb_file))[0]

for _, row in df_final_filter.iterrows():
    selection_name = f"obj{row['Combination_Number']:02d}"
    # (chain id, residue number) of each of the three residues
    sites = [
        (residue.get_full_id()[2], residue.get_full_id()[3][1])
        for residue in row['Combination']
    ]

    combined_selection = " or ".join(
        f"(chain {chain_id} and resi {res_number})" for chain_id, res_number in sites
    )
    pymol_script_commands.append(f"select {selection_name}, {combined_selection}")
    for position, (chain_id, res_number) in enumerate(sites, start=1):
        pymol_script_commands.append(
            f"create {selection_name}_residue{position}, /{pymol_object_name}//{chain_id}/{res_number}"
        )

with open(pymol_script_file, 'w') as f:
    f.write("# PyMOL script for visualizing filtered residue combinations\n\n")
    for command in pymol_script_commands:
        f.write(command + '\n')

print(f"\nResults saved to {output_excel_file}")
print(f"PyMOL script saved to {pymol_script_file}")

Residues loaded: (<Residue CYS het=  resseq=96 icode= >, <Residue THR het=  resseq=97 icode= >, <Residue ALA het=  resseq=100 icode= >)
Residues loaded: (<Residue CYS het=  resseq=96 icode= >, <Residue ASN het=  resseq=99 icode= >, <Residue ALA het=  resseq=100 icode= >)
Residues loaded: (<Residue THR het=  resseq=97 icode= >, <Residue ALA het=  resseq=100 icode= >, <Residue CYS het=  resseq=101 icode= >)
Residues loaded: (<Residue ASN het=  resseq=99 icode= >, <Residue ALA het=  resseq=100 icode= >, <Residue GLN het=  resseq=103 icode= >)
Residues loaded: (<Residue ALA het=  resseq=100 icode= >, <Residue CYS het=  resseq=101 icode= >, <Residue LYS het=  resseq=104 icode= >)

Results saved to /content/1CA2_alanine_96_analysis_results.xlsx
PyMOL script saved to /content/1CA2_alanine_96_output_tml.pml
