## Calculate binding scores and correlate with ΔG

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score
import math
import statistics

from constants import *

In [None]:
# Load the input tables and initialise the shared state used by later cells.
# PROBS_CSV / SABDAB_CSV come from the star-import of `constants`.
bind_scores = dict()  # filled later: PDB code -> binding score
bind_prob_threshold = 50.0  # residues must exceed this paratope probability to count
sabdab_df = pd.read_csv(SABDAB_CSV)
probs_df = pd.read_csv(PROBS_CSV)
# Distinct PDB codes, in order of first appearance in the probabilities table
pdb_list = probs_df['pdb'].unique().tolist()

In [None]:
# Reformat the SAbDab dataframe: keep only structures that also appear in the
# probabilities table and make sure ΔG is numeric.
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignment below does not trigger SettingWithCopyWarning.
sabdab_df = sabdab_df[sabdab_df['pdb'].isin(pdb_list)].copy()
sabdab_df['delta_g'] = pd.to_numeric(sabdab_df['delta_g'])

In [None]:
# Collapse the probabilities table to one row per residue by averaging
# 'paratope_probability' over rows that share the same residue key.
residue_key = ['pdb', 'chain_id', 'res_seq_num', 'residue']
mean_prob_per_residue = probs_df.groupby(residue_key)['paratope_probability'].mean()
# Turn the grouped index levels back into ordinary columns
probs_df = mean_prob_per_residue.reset_index()

In [None]:
# Calculate binding score from binding probabilities and add to the SAbDab dataframe.
# For each PDB the score is ln(mean(p) * n), where p are the residue-level
# paratope probabilities above `bind_prob_threshold` and n is how many there are.
# The threshold filter does not depend on the PDB, so apply it once up front.
above_threshold_df = probs_df[probs_df['paratope_probability'] > bind_prob_threshold]
for pdb_code in pdb_list:
    pdb_df = above_threshold_df[above_threshold_df['pdb'] == pdb_code]
    bind_probs = pdb_df['paratope_probability'].tolist()
    try:
        binding_score = math.log(statistics.mean(bind_probs) * len(bind_probs))
    except (statistics.StatisticsError, ValueError):
        # StatisticsError: no residue passed the threshold (mean of empty data).
        # ValueError: log of a non-positive value (possible if the threshold is <= 0).
        # Either way the structure gets a score of 0; other errors now propagate.
        binding_score = 0
    bind_scores[pdb_code] = binding_score

# Attach the per-PDB score to every SAbDab row with that PDB code
sabdab_df['binding_score'] = sabdab_df['pdb'].map(bind_scores)

In [None]:
# Line plot of the residue-level paratope probabilities (row index on the x-axis)
paratope_probs = probs_df['paratope_probability'].astype(float)
paratope_probs.plot()

In [None]:
# Line plot of the ΔG column of the filtered SAbDab table (row index on the x-axis)
delta_g_values = sabdab_df['delta_g']
delta_g_values.plot()

In [None]:
# Correlate binding scores with ΔG and show them as a scatter plot.
plt.rc('axes', labelsize=20)
plt.rcParams.update({'font.size': 15})
# R-squared is taken as the squared Pearson correlation between the two columns.
# sklearn's r2_score(y_true, y_pred) is not used here: it measures how well
# y_pred *predicts* y_true on the same scale, so feeding it a raw binding score
# yields a value that is not a correlation (and can be arbitrarily negative).
# Series.corr also skips rows where either value is missing instead of raising.
r2 = sabdab_df['delta_g'].corr(sabdab_df['binding_score']) ** 2
sabdab_df.plot(x='delta_g', y='binding_score', kind='scatter', figsize=(20,10), fontsize=15, title=f"R-squared = {r2}")