### load AF model evaluation metrics ###

In [2]:
import os 
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

In [3]:
def parse_AF_eval(plddt_file, confidence_file):
    """
    Summarise one predicted model from its two JSON output files.

    :param plddt_file: Path to the JSON file holding the 'atom_plddts' and 'pae' entries
    :param confidence_file: Path to the JSON file holding the 'iptm' and 'ptm' entries
    :return: dict with keys 'mean_plddt', 'All_mean_pae', 'iptm' and 'ptm';
             None (after printing a message) when either file does not exist
    """
    inputs_present = os.path.exists(plddt_file) and os.path.exists(confidence_file)
    if not inputs_present:
        print(f"File not found: {plddt_file} or {confidence_file}")
        return

    with open(plddt_file, 'r') as handle:
        full_data = json.load(handle)
    with open(confidence_file, 'r') as handle:
        confidences = json.load(handle)

    # Per-atom pLDDT and the PAE entries are averaged over all values;
    # ipTM / pTM are copied through unchanged.
    eval_dict = {}
    eval_dict['mean_plddt'] = np.mean(full_data['atom_plddts'])
    eval_dict['All_mean_pae'] = np.mean(full_data['pae'])
    eval_dict['iptm'] = confidences['iptm']
    eval_dict['ptm'] = confidences['ptm']
    return eval_dict

In [27]:
import glob

# Collect the evaluation metrics of every ranked model (0-4) in each job
# folder whose name starts with `protein_name`.
AF_struct_dir = '/Users/liyao/Downloads/folds_2025_09_05_03_36'
protein_name = 'egfr_bindermut'
model_evals = {}

dirs = glob.glob(f'{AF_struct_dir}/{protein_name}*')
print(len(dirs))
for folder in dirs:
    design = os.path.basename(folder)
    # Folder names look like "<binder>_<repeat>": everything before the last
    # underscore is the binder, the remainder is the repeat label.
    binder, _sep, repeat = design.rpartition('_')
    file_stem = f'{folder}/fold_{design}'
    for rank in range(5):
        model_evals[(binder, repeat, rank)] = parse_AF_eval(
            f'{file_stem}_full_data_{rank}.json',
            f'{file_stem}_summary_confidences_{rank}.json',
        )
print(model_evals)

15
{('egfr_bindermut10', '1', 0): {'mean_plddt': 91.42284340372152, 'All_mean_pae': 6.0619088021981185, 'iptm': 0.94, 'ptm': 0.9}, ('egfr_bindermut10', '1', 1): {'mean_plddt': 91.18326782354171, 'All_mean_pae': 6.211734512948891, 'iptm': 0.93, 'ptm': 0.89}, ('egfr_bindermut10', '1', 2): {'mean_plddt': 91.15999581852394, 'All_mean_pae': 6.167820451864908, 'iptm': 0.93, 'ptm': 0.9}, ('egfr_bindermut10', '1', 3): {'mean_plddt': 91.17816224127117, 'All_mean_pae': 6.212008227188071, 'iptm': 0.93, 'ptm': 0.9}, ('egfr_bindermut10', '1', 4): {'mean_plddt': 91.25411248170604, 'All_mean_pae': 6.210671177785543, 'iptm': 0.93, 'ptm': 0.89}, ('egfr_bindermut06', '2', 0): {'mean_plddt': 90.58287533931927, 'All_mean_pae': 6.374147138477865, 'iptm': 0.93, 'ptm': 0.89}, ('egfr_bindermut06', '2', 1): {'mean_plddt': 90.63515138859887, 'All_mean_pae': 6.395198968186628, 'iptm': 0.93, 'ptm': 0.88}, ('egfr_bindermut06', '2', 2): {'mean_plddt': 90.54224055126332, 'All_mean_pae': 6.4268752183540885, 'iptm': 0

In [31]:
# One row per (design, repeat, rank) model, one column per metric.
model_evals_df = (
    pd.DataFrame(model_evals)
    .T
    .reset_index()
)
model_evals_df.columns = ['design','repeat', 'rank', 'mean_plddt', 'All_mean_pae', 'iptm', 'ptm']
#display(model_evals_df.loc[model_evals_df['design'].str.contains('hsa_gamut05')].sort_values('mean_plddt', ascending=False)[:10])

# For every design keep the single model with the highest mean pLDDT.
best_row_labels = model_evals_df.groupby('design')['mean_plddt'].idxmax()
top_1_per_design = model_evals_df.loc[best_row_labels]
selected_pair = list(zip(
    top_1_per_design['design'],
    top_1_per_design['repeat'],
    top_1_per_design['rank'],
))
display(top_1_per_design.sort_values('mean_plddt', ascending=False))

# Path of the mmCIF structure belonging to each selected model.
select_models = [
    f'{AF_struct_dir}/{design}_{repeat}/fold_{design}_{repeat}_model_{rank}.cif'
    for design, repeat, rank in selected_pair
]

Unnamed: 0,design,repeat,rank,mean_plddt,All_mean_pae,iptm,ptm
0,egfr_bindermut10,1,0,91.422843,6.061909,0.94,0.9
52,egfr_bindermut04,2,2,91.299874,6.323947,0.93,0.89
45,egfr_bindermut06,1,0,90.894057,6.304005,0.93,0.89
25,egfr_bindermut02,2,0,90.488673,6.523144,0.94,0.88
71,egfr_bindermut09,2,1,90.173375,6.641846,0.93,0.88


In [23]:
import shutil
from Bio.PDB import MMCIFParser, PDBIO

# Convert every selected mmCIF model into a PDB file inside `output_dir`.
output_dir = '/Users/liyao/Desktop/Tsuda_Lab/Projects/Andrejs_TrajMPNN/data/AF_predictions/selected'
# Make sure the destination exists so the first save cannot fail on a fresh machine.
os.makedirs(output_dir, exist_ok=True)

# The parser and the writer hold no per-file configuration here, so one
# instance of each is reused for all models.
parser = MMCIFParser(QUIET=True)
io = PDBIO()
for model in select_models:
    structure = parser.get_structure('model', model)
    # Swap only the file extension; a '.cif' elsewhere in the name is left alone.
    model_stem = os.path.splitext(os.path.basename(model))[0]
    pdb_filename = os.path.join(output_dir, model_stem + '.pdb')
    io.set_structure(structure)
    io.save(pdb_filename)

### cluster AF models based on RMSD ###

In [20]:
from Bio.PDB import PDBParser, MMCIFParser
import numpy as np

def extract_chain_coordinates(structure_file, chain_id, atom_name=["CA"]):
    """
    Extracts the coordinates of specified atoms in specified chains from a PDB or mmCIF file.

    Only the first model of the structure (index 0) is read. Chains are visited in the
    order given by ``chain_id`` and atoms in the order the chain yields them.

    :param structure_file: Path to the PDB or mmCIF file (.pdb, .cif or .mmcif)
    :param chain_id: List of chain IDs to extract (or a single chain ID as a string)
    :param atom_name: List of atom names to filter (e.g., ["CA", "CB"]), or a single
                      name as a string. Defaults to ["CA"].
    :return: A NumPy array of shape (n_atoms, 3) containing the atomic coordinates;
             shape (0, 3) when no atom matches
    :raises ValueError: If the file extension is not .pdb, .cif or .mmcif
    """
    # Determine parser based on file extension
    if structure_file.endswith('.pdb'):
        parser = PDBParser(QUIET=True)
    elif structure_file.endswith(('.cif', '.mmcif')):
        parser = MMCIFParser(QUIET=True)
    else:
        raise ValueError("Unsupported file format. Please provide a .pdb or .cif/.mmcif file.")

    structure = parser.get_structure('structure', structure_file)

    # Accept a single string as well as a collection for both filters
    chain_ids = [chain_id] if isinstance(chain_id, str) else chain_id
    wanted_atoms = {atom_name} if isinstance(atom_name, str) else set(atom_name)

    # A chain ID that is absent from the model propagates whatever lookup
    # error the structure object raises.
    first_model = structure[0]
    coordinates = [
        atom.coord
        for current_chain in chain_ids
        for atom in first_model[current_chain].get_atoms()
        if atom.get_name() in wanted_atoms
    ]

    # reshape keeps the documented (n_atoms, 3) layout even when nothing matched;
    # a bare np.array([]) would have shape (0,) and break axis=1 reductions.
    return np.array(coordinates).reshape(-1, 3)

def calculate_RMSD(coord1, coord2):
    """
    Root-mean-square deviation between two equally sized coordinate sets.

    The coordinates are compared row by row exactly as given; no
    superposition or alignment is applied before the comparison.

    :param coord1: NumPy array of shape (n_atoms, 3) with the first coordinate set
    :param coord2: NumPy array of shape (n_atoms, 3) with the second coordinate set
    :return: The RMSD between the two coordinate sets
    :raises ValueError: If the two sets contain a different number of atoms
    """
    n_first, n_second = coord1.shape[0], coord2.shape[0]
    if n_first != n_second:
        raise ValueError("The number of atoms in the two coordinate sets is different.")

    # Squared distance of every atom pair, then the root of their mean
    squared_distances = np.square(coord1 - coord2).sum(axis=1)
    return np.sqrt(squared_distances.mean())

In [24]:
# Pairwise RMSD between the five ranked models of every seed, using the
# default atom selection of extract_chain_coordinates on chains A and B.
# Keys of RMSD_dict are (seed, model_j, model_k) with j < k.
# NOTE(review): `seeds` is not assigned anywhere in the visible notebook and
# `protien_name` is first assigned in a later cell, so this cell relies on
# kernel state and will not run on a fresh kernel - confirm their values.
RMSD_dict = {}
chain_ids = ['A', 'B']
n_models = 5
for i in seeds:
    seed_dir = f'{AF_struct_dir}/{protien_name}_seed{i}'
    # Parse every model file once per seed instead of once per compared pair.
    model_coords = [
        extract_chain_coordinates(f'{seed_dir}/fold_{protien_name}_seed{i}_model_{m}.cif', chain_ids)
        for m in range(n_models)
    ]
    for j in range(n_models - 1):
        for k in range(j + 1, n_models):
            RMSD_dict[(i, j, k)] = calculate_RMSD(model_coords[j], model_coords[k])
print(RMSD_dict)


{(1, 0, 1): 27.454308, (1, 0, 2): 23.082153, (1, 0, 3): 20.98222, (1, 0, 4): 22.817755, (1, 1, 2): 27.783411, (1, 1, 3): 28.234781, (1, 1, 4): 28.79486, (1, 2, 3): 10.040766, (1, 2, 4): 14.338803, (1, 3, 4): 16.893402, (2, 0, 1): 28.479342, (2, 0, 2): 27.088152, (2, 0, 3): 21.378466, (2, 0, 4): 24.406746, (2, 1, 2): 12.121987, (2, 1, 3): 19.387264, (2, 1, 4): 23.792034, (2, 2, 3): 14.864815, (2, 2, 4): 20.279856, (2, 3, 4): 27.091763, (3, 0, 1): 15.288367, (3, 0, 2): 28.163675, (3, 0, 3): 24.094091, (3, 0, 4): 23.007807, (3, 1, 2): 23.993904, (3, 1, 3): 28.047935, (3, 1, 4): 22.999348, (3, 2, 3): 23.56113, (3, 2, 4): 26.504272, (3, 3, 4): 27.112698, (4, 0, 1): 24.134193, (4, 0, 2): 26.506433, (4, 0, 3): 24.022974, (4, 0, 4): 26.956429, (4, 1, 2): 23.725182, (4, 1, 3): 25.331045, (4, 1, 4): 21.758139, (4, 2, 3): 27.843176, (4, 2, 4): 20.373838, (4, 3, 4): 27.216896, (5, 0, 1): 21.154747, (5, 0, 2): 25.772144, (5, 0, 3): 23.021336, (5, 0, 4): 24.64439, (5, 1, 2): 19.861696, (5, 1, 3): 6.

In [28]:
from DockQ.DockQ import load_PDB, run_on_all_native_interfaces
# Compare predicted models with DockQ, pairing chain A with A and B with B.
# NOTE(review): `seeds` is not assigned anywhere in the visible notebook and
# `protien_name` is first assigned in a later cell - this cell depends on
# kernel state; confirm their values before re-running.
mapping = {"A": "A", "B": "B"}
# Five model files (model_0 .. model_4) per seed.
pdb_files = [f'{AF_struct_dir}/{protien_name}_seed{i}/fold_{protien_name}_seed{i}_model_{j}.cif' for i in seeds for j in range(0, 5)]
print(len(pdb_files))

for i in range(0, len(pdb_files)):
    for j in range(i+1, len(pdb_files)):
        pdb1 = load_PDB(pdb_files[i])
        pdb2 = load_PDB(pdb_files[j])
        result = run_on_all_native_interfaces(pdb1, pdb2, chain_map=mapping)
        print(result)
        # The break leaves the inner loop after the first partner, so each file
        # is only compared with the next one in the list (i, i+1).
        # NOTE(review): looks like a debugging leftover - confirm whether all
        # pairs were meant to be scored.
        break
# The triple-quoted string below is disabled code kept as a string literal,
# not a comment. Being the last expression of the cell, the notebook echoes
# it as the cell output.
'''for i in seeds:
    for j in range(0, 4):
        native = load_PDB(f'{AF_struct_dir}/{protien_name}_seed{i}/fold_{protien_name}_seed{i}_model_{j}.cif')
        for k in range(j+1, 5):
            model = load_PDB(f'{AF_struct_dir}/{protien_name}_seed{i}/fold_{protien_name}_seed{i}_model_{k}.cif')
            result = run_on_all_native_interfaces(model, native, chain_map=mapping)
            RMSD_dict[(i,j,k)] = result[0].get('Lrms', np.nan)
            break'''

50


"for i in seeds:\n    for j in range(0, 4):\n        native = load_PDB(f'{AF_struct_dir}/{protien_name}_seed{i}/fold_{protien_name}_seed{i}_model_{j}.cif')\n        for k in range(j+1, 5):\n            model = load_PDB(f'{AF_struct_dir}/{protien_name}_seed{i}/fold_{protien_name}_seed{i}_model_{k}.cif')\n            result = run_on_all_native_interfaces(model, native, chain_map=mapping)\n            RMSD_dict[(i,j,k)] = result[0].get('Lrms', np.nan)\n            break"

### HDOCK: rigid body global docking ###

In [40]:
import subprocess

def run_HDOCK(hdock_path, receptor, ligand, out_file):
    """
    Run HDOCK on one receptor/ligand pair and wait for it to finish.

    Executes ``<hdock_path> <receptor> <ligand> -out <out_file>``. The command is
    passed as an argument list without going through a shell, so paths containing
    spaces or shell metacharacters reach the program unchanged. As a consequence
    ``hdock_path`` must be the executable alone, without extra flags.

    :param hdock_path: Path to the HDOCK executable
    :param receptor: Path to the receptor structure file
    :param ligand: Path to the ligand structure file
    :param out_file: Path passed to HDOCK after ``-out``
    :return: None. A failure to start or a non-zero exit code is reported by
             printing a message; no exception is raised for either.
    """
    command = [str(hdock_path), str(receptor), str(ligand), "-out", str(out_file)]
    try:
        completed = subprocess.run(command)
    except OSError as err:
        # Without a shell a missing or non-executable binary raises instead of
        # returning an exit code; keep the function non-raising.
        print(f"Could not start HDOCK ({hdock_path}): {err}")
        return
    if completed.returncode != 0:
        print(f"HDOCK exited with code {completed.returncode} for {receptor} / {ligand}")

In [None]:
import os
# Dock every predicted ligand model against each receptor structure with HDOCK.
# All relative paths below are resolved inside the HDOCK installation directory.
os.chdir('/home/lwang/models/HDOCKlite-v1.1')
protien_name = 'vh16_vl106'
pdb_dir = f'./pdb'
if not os.path.exists('./output'):
    os.mkdir('./output')

receptors = ['Wuhan_spike_clean.pdb', 'Omicron_spike_clean.pdb', 'Delta_spike_clean.pdb']
# Ligands are the files in the pdb folder whose name starts with "fold_<protien_name>".
ligands = [file for file in os.listdir(pdb_dir) if file.startswith(f"fold_{protien_name}")]

for receptor in receptors:
    # Output name: text before the first "_" of the receptor file name plus
    # the ligand file name up to its first ".".
    receptor_tag = receptor.partition("_")[0]
    for ligand in ligands:
        ligand_tag = ligand.partition(".")[0]
        out_file = f'./output/{receptor_tag}_{ligand_tag}.out'
        run_HDOCK('./hdock', f'{pdb_dir}/{receptor}', f'{pdb_dir}/{ligand}', out_file)
