In [1]:
import os
import glob
import json
import shutil
import numpy as np
import pandas as pd
from pathlib import Path
from pyrosetta import * 
init()
from pyrosetta.rosetta.protocols import *
from Bio.PDB.Polypeptide import aa1, aa3
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.PDB import PDBParser

PyRosetta-4 2022 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python39.Release 2022.04+release.0c9f888efdecfed435625915cc2c8cddbb485ee9 2022-01-26T19:42:42] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python39.Release r309 2022.04+release.0c9f888 0c9f888efdecfed435625915cc2c8cddbb485ee9 http://www.pyrosetta.org 2022-01-26T19:42:42
core.init: command: PyRosetta -ex1 -ex2aro -database /home/tsatler/anaconda3/envs/pyro/lib/python3.9/site-packages/pyrosetta/database
basic.random.init_random_generator: 'RNG device' seed mode, using '/dev/urandom', seed=-1402484020 seed_offset=0 real_seed=-1402484020 thread_index=0
basic.random.init_random_generator: RandomGenerator:init: Normal mode, seed=-1402484020 RG_type=mt19937


# Analysis functions

In [2]:
def calculate_scores(scores_path, binder_len=None, is_binder_second=False):
    """
    Summarises the confidence metrics stored in a prediction scores JSON file.

    Parameters:
        scores_path: Path to a JSON file with a 'plddt' entry (one value per residue)
            and a 'pae' entry (2D matrix indexed by residue on both axes).
        binder_len (int): Number of residues belonging to the binder. When omitted
            (or 0) only the global means are returned.
        is_binder_second (bool): Set to True when the binder follows the target in
            the predicted sequence. Default is False (binder comes first).

    Returns:
        A dict with the mean 'plddt' and mean 'pae'. When binder_len is given it
        also contains 'binder_plddt', 'target_plddt', 'pae_binder', 'pae_target'
        and 'pae_int_tot' (mean of the two off-diagonal binder/target PAE blocks).
    """
    scores = json.loads(Path(scores_path).read_text())

    plddt = np.asarray(scores['plddt'])
    pae = np.asarray(scores['pae'])

    results = {'plddt': np.mean(plddt), 'pae': np.mean(pae)}
    if not binder_len:
        return results

    # Index of the first residue of the second chain. When the binder is the
    # second chain it occupies the LAST binder_len residues, so the boundary is
    # total_len - binder_len rather than binder_len itself.
    if is_binder_second:
        boundary = max(len(plddt) - binder_len, 0)
    else:
        boundary = binder_len

    first_chain = slice(None, boundary)
    second_chain = slice(boundary, None)
    if is_binder_second:
        binder, target = second_chain, first_chain
    else:
        binder, target = first_chain, second_chain

    # Interface PAE is averaged over both off-diagonal blocks of the matrix
    pae_int1 = np.mean(pae[binder, target])
    pae_int2 = np.mean(pae[target, binder])

    results.update({
        'binder_plddt': np.mean(plddt[binder]),
        'target_plddt': np.mean(plddt[target]),
        'pae_binder': np.mean(pae[binder, binder]),
        'pae_target': np.mean(pae[target, target]),
        'pae_int_tot': (pae_int1 + pae_int2) / 2
    })

    return results

def calculate_residue_difference(seq1, seq2):
    """
    Returns the percentage of positions at which two sequences differ.

    Parameters:
        seq1: First sequence.
        seq2: Second sequence; must have the same length as seq1.

    Returns:
        Number of mismatching positions divided by the sequence length, times 100.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    length = len(seq1)
    if length != len(seq2):
        raise ValueError("Sequences must have the same length.")

    # Walk both sequences in lockstep and tally the mismatching positions
    mismatches = 0
    for left, right in zip(seq1, seq2):
        mismatches = mismatches + (left != right)

    return (mismatches / length) * 100

def align_structures(pdb1, pdb2):
    """
    Superimposes the structure in pdb2 onto the structure in pdb1 using CEAligner.

    Parameters:
        pdb1: PDB file of the reference structure (kept fixed).
        pdb2: PDB file of the structure that is aligned onto the reference.

    Returns:
        The `rms` value reported by the aligner after the alignment.
    """
    # Import the classes explicitly rather than relying on the `Bio.PDB` package
    # exposing its `cealign` submodule as an attribute.
    from Bio.PDB import PDBParser
    from Bio.PDB.cealign import CEAligner

    pdb_parser = PDBParser(QUIET=True)
    # Get the structures
    ref_structure = pdb_parser.get_structure("ref", pdb1)
    sample_structure = pdb_parser.get_structure("sample", pdb2)

    aligner = CEAligner()
    aligner.set_reference(ref_structure)
    aligner.align(sample_structure)

    return aligner.rms

def get_sasa(pose):
    """
    Calculates the total and the hydrophobic SASA of a pose.

    Parameters:
        pose: The pose handed to calc_per_res_hydrophobic_sasa.

    Returns:
        A tuple (total_sasa, hydrophobic_sasa), each the sum of the per-residue values.
    """
    probe_radius = 1.4  # last argument of calc_per_res_hydrophobic_sasa

    # Output vectors that the Rosetta call fills in, one entry per residue
    new_vector = pyrosetta.rosetta.utility.vector1_double
    per_residue_total = new_vector()
    per_residue_hydrophobic = new_vector()

    rosetta.core.scoring.calc_per_res_hydrophobic_sasa(
        pose, per_residue_total, per_residue_hydrophobic, probe_radius
    )

    total_sasa = sum(per_residue_total)
    hydrophobic_sasa = sum(per_residue_hydrophobic)
    return total_sasa, hydrophobic_sasa

def calculate_charge(chain, ph=7.4):
    """
    Calculates the charge of the protein chain at a specific pH.

    Residues whose name is not listed in `aa3` are reported on stdout and left
    out of the sequence used for the calculation.

    Parameters:
        chain (Bio.PDB.Chain.Chain): The protein chain to calculate the charge of.
        ph (float): The pH value to calculate the charge at. Default is 7.4.

    Returns:
        The charge of the protein at the specified pH.
    """
    # Translate every standard residue to its one-letter code
    one_letter_codes = []
    for res in chain.get_residues():
        three_letter = res.get_resname()
        if three_letter not in aa3:
            print(f"Skipping residue {three_letter} because it is not a standard amino acid.")
            continue
        one_letter_codes.append(aa1[aa3.index(three_letter)])

    chain_sequence = ''.join(one_letter_codes)

    # ProteinAnalysis does the actual charge calculation on the assembled sequence
    return ProteinAnalysis(chain_sequence).charge_at_pH(ph)

def calculate_sap_score(pose, chain="B"):
    """
    Calculates the SAP score for one chain of a pose.

    Parameters:
        pose: The pose to analyse. It is cloned first; the mover is applied to the clone.
        chain (str): Chain identifier passed as `chain_order` to SwitchChainOrder. Default is "B".

    Returns:
        The value computed by SapScoreMetric on the chain-selected clone.
    """
    # Build a SwitchChainOrder mover that keeps only the requested chain
    chain_selector_xml = f'<SwitchChainOrder name="so" chain_order="{chain}"/>'
    chain_selector = XmlObjects.static_get_mover(chain_selector_xml)

    # Work on a copy so the mover is never applied to the caller's pose
    single_chain_pose = pose.clone()
    chain_selector.apply(single_chain_pose)

    # Evaluate the SAP metric on the reduced pose
    sap_metric = XmlObjects.static_get_simple_metric('<SapScoreMetric name="sap_metric"/>')
    return sap_metric.calculate(single_chain_pose)

# Run analysis
If a target sequence is provided, pLDDT and PAE are also calculated separately for the target and the binder. The binder is assumed to be the first sequence; if it comes second, pass `is_binder_second=True` to the `calculate_scores` function.

In [3]:

# Set inputs
af2_input_folder="output/7urv_FMC63_D_cdrs/tmp/af2_in"
af2_output_folder="output/7urv_FMC63_D_cdrs/af2"
af_ids=pd.read_csv(f"{af2_output_folder}/af2_sequences.csv")
chain_id="A" #chain to analyze
ref_seq="DIQMTQTTSSLSASLGDRVTISCRASQDISKYLNWYQQKPDGTVKLLIYHTSRLHSGVPSRFSGSGSGTDYSLTISNLEQEDIATYFCQQGNTLPYTFGGGTKLEITGGGGSGGGGSGGGGSEVKLQESGPGLVAPSQSLSVTCTVSGVSLPDYGVSWIRQPPRKGLEWLGVIWGSETTYYNSALKSRLTIIKDNSKSQVFLKMNSLQTDDTAIYYCAKHYYYGGSYAMDYWGQGTSVTVSS"
ref_pdb="examples/7urv_FMC63.pdb"

target_seq=""

# Column layout of the metrics table. The per-chain AF2 columns are only
# present when a target sequence is given.
complex_columns = ['binder_plddt', 'target_plddt', 'pae_binder', 'pae_target', 'pae_int_tot']
score_columns = (
    ['id', 'plddt', 'pae']
    + (complex_columns if target_seq else [])
    + ['diff%', 'sasa', 'hydro_sasa', 'charge', 'sap', 'rmsd', 'pdb', 'seq']
)

# Start from an empty table so `scores_df` exists even if no sequence is processed
scores_df = pd.DataFrame(columns=score_columns)
records = []       # one dict per successfully analysed sequence
seen_ids = set()   # ids that already have a row in `records`
parser = PDBParser()  # reused for every structure

# Iterate over dataframe of sequences
for _, row in af_ids.iterrows():
    seq_id = row['id']
    seq = row['seq']

    # Skip ids that were already analysed
    if seq_id in seen_ids:
        print(f"{seq_id} already present, skipping...")
        continue

    # Find json and pdb files. Check for matches BEFORE indexing: indexing an
    # empty glob result raises IndexError and the message below would never print.
    # Sorted so the pick is deterministic when several files match.
    json_matches = sorted(glob.glob(f"{af2_input_folder}/*/{seq_id}*rank_001*.json"))
    pdb_matches = sorted(glob.glob(f"{af2_input_folder}/*/{seq_id}*rank_001*.pdb"))
    if not json_matches or not pdb_matches:
        print(f"Could not find pdb or json file for id: {seq_id}")
        continue
    json_file = json_matches[0]
    pdb_file = pdb_matches[0]

    # Copy best pdb file to output folder
    new_pdb_path = f"{af2_output_folder}/{seq_id}.pdb"
    shutil.copy(pdb_file, new_pdb_path)

    # Load PDB structure and create pose
    pose = pose_from_file(pdb_file)
    structure = parser.get_structure('protein', pdb_file)
    chain = structure[0][chain_id]

    # Calculate metrics.
    # calculate_scores only returns the per-chain keys when binder_len is set.
    # NOTE(review): assumes row['seq'] holds only the binder sequence - confirm.
    binder_len = len(seq) if target_seq else None
    af2_scores = calculate_scores(json_file, binder_len=binder_len)

    record = {'id': seq_id, 'plddt': af2_scores['plddt'], 'pae': af2_scores['pae']}
    if target_seq:
        for key in complex_columns:
            record[key] = af2_scores[key]

    diff = calculate_residue_difference(ref_seq, seq)
    sasa, hyd_sasa = get_sasa(pose)
    charge = calculate_charge(chain)
    sap = calculate_sap_score(pose, chain=chain_id)  # same chain as the charge calculation
    rmsd = align_structures(ref_pdb, pdb_file)

    record.update({
        'diff%': diff,
        'sasa': sasa,
        'hydro_sasa': hyd_sasa,
        'charge': charge,
        'sap': sap,
        'rmsd': rmsd,
        'pdb': new_pdb_path,
        'seq': seq,
    })
    records.append(record)
    seen_ids.add(seq_id)

    # Rebuild the table from the collected records (instead of concatenating
    # onto an empty frame) and save after every sequence, so partial results
    # survive an interrupted run.
    scores_df = pd.DataFrame(records, columns=score_columns)
    scores_df.to_csv(f'{af2_output_folder}/metrics.csv', index=False)


core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 983 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 1.12031 seconds.
core.import_pose.import_pose: File 'output/7urv_FMC63_D_cdrs/tmp/af2_in/AF2out_10/0_1_7urv_FMC63_D_cdrs_unrelaxed_rank_001_alphafold2_ptm_model_4_seed_000.pdb' automatically determined to be of type PDB
core.conformation.Conformation: Found disulfide between residues 23 88
core.conformation.Conformation: current variant for 23 CYS
core.conformation.Conformation: current variant for 88 CYS
core.conformation.Conformation: current variant for 23 CYD
core.conformation.Conformation: current variant for 88 CYD
protocols.rosetta_scripts.RosettaScriptsParser: Validating input script...
protocols.rosetta_scripts.RosettaScriptsSchemaValidator: Generating XML Schema for rosetta_scripts...
protocols.rosetta_scripts.RosettaScriptsSchemaValidator: ...done
protocols.rosetta_scripts.RosettaScriptsSchemaValidator