In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
# Fetch the per-edge results: public benchmark, public reruns and the private partner sets.
normal_edge_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/combined_pymbar3_edge_data.csv")
rerun_edge_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/reruns/rerun_pymbar3_edge_data.csv")
private_edge_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/private_processed_results/combined_pymbar3_edge_data.csv")

# Private edges: keep everything that is not explicitly flagged as failed.
private_not_failed = private_edge_data["failed"].ne(True)
private_edge_data = private_edge_data[private_not_failed]

# Public edges: drop failed edges and every pfkfb3 edge, then append the rerun table
# (presumably the reruns supersede the original pfkfb3 results - TODO confirm).
public_keep = normal_edge_data["system name"].ne("pfkfb3") & normal_edge_data["failed"].ne(True)
normal_edge_data = pd.concat([normal_edge_data[public_keep], rerun_edge_data], ignore_index=True)
normal_edge_data

In [None]:
private_edge_data

In [None]:
# Load the public, rerun and private per-ligand DG data. This cell only loads and
# filters; the all-to-all pairwise differences are calculated in a later cell.
public_dg_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/combined_pymbar3_calculated_dg_data.csv")
rerun_dg_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/reruns/rerun_pymbar3_calculated_dg_data.csv")
private_dg_data = pd.read_csv("https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/private_processed_results/combined_pymbar3_calculated_dg_data.csv")
# Drop the original pfkfb3 rows before appending the rerun table (same filter as used for the edge data).
public_dg_data = public_dg_data[(public_dg_data["system name"] != "pfkfb3") ]
public_dg_data = pd.concat([public_dg_data, rerun_dg_data], ignore_index=True)

In [None]:
# calculate all pairwise differences for the public data and private data
def calc_pairwise_differences(df, public=True):
    """
    Build the table of ligand-pair DDG values for every target in ``df``.

    Each unordered pair of ligands within a target appears exactly once
    (ligand 1 is always the one listed first in ``df``), and the differences
    are reported as ligand 2 minus ligand 1.

    Parameters:
    - df: DataFrame with one row per ligand and the columns 'ligand name',
      'Exp DG (kcal/mol)' and 'DG (kcal/mol)', plus the grouping columns
      selected by ``public``. If a ligand name is repeated within a target,
      its first row is used.
    - public: True to group by 'system group' / 'system name', False to group
      by 'partner_id' / 'dataset_name'.

    Returns:
    - DataFrame with the columns 'System', 'Class', 'Ligand 1', 'Ligand 2',
      'Exp DDG (kcal/mol)' and 'OpenFE DDG (kcal/mol)'. The columns are
      present even when no pair could be formed.
    """
    if public:
        group_name, system_name = "system group", "system name"
    else:
        group_name, system_name = "partner_id", "dataset_name"
    columns = [
        "System",
        "Class",
        "Ligand 1",
        "Ligand 2",
        "Exp DDG (kcal/mol)",
        "OpenFE DDG (kcal/mol)",
    ]
    pairwise_diffs = []
    for system in df[group_name].unique():
        system_df = df[df[group_name] == system]
        for target in system_df[system_name].unique():
            target_df = system_df[system_df[system_name] == target]
            # look every ligand up once instead of re-filtering the frame for each pair
            ligand_rows = target_df.drop_duplicates(subset="ligand name", keep="first")
            names = ligand_rows["ligand name"].tolist()
            exp_dg = ligand_rows["Exp DG (kcal/mol)"].tolist()
            openfe_dg = ligand_rows["DG (kcal/mol)"].tolist()
            for first in range(len(names)):
                # start after `first`: skips self-comparisons and the mirrored (B, A) pairs
                for second in range(first + 1, len(names)):
                    pairwise_diffs.append({
                        "System": target,
                        "Class": system,
                        "Ligand 1": names[first],
                        "Ligand 2": names[second],
                        "Exp DDG (kcal/mol)": exp_dg[second] - exp_dg[first],
                        "OpenFE DDG (kcal/mol)": openfe_dg[second] - openfe_dg[first],
                    })
    return pd.DataFrame(pairwise_diffs, columns=columns)

In [None]:
public_pairwise_diffs = calc_pairwise_differences(public_dg_data)
private_pairwise_diffs = calc_pairwise_differences(private_dg_data, public=False)

In [None]:
def calculate_system_pairwise_rmse(df, n_bootstraps=1000, random_state=None):
    """
    Per-system pairwise RMSE and MUE with bootstrapped 95% intervals, plus a
    ligand-weighted summary row.

    Parameters:
    - df: DataFrame of pairwise differences with the columns 'Class', 'System',
      'Ligand 1', 'Ligand 2', 'Exp DDG (kcal/mol)' and 'OpenFE DDG (kcal/mol)'.
    - n_bootstraps: number of bootstrap resamples used for every interval
      (must be at least 1).
    - random_state: seed (or anything accepted by ``np.random.default_rng``)
      for reproducible intervals; None draws fresh entropy on every call.

    Returns:
    - DataFrame with one row per (Class, System), sorted by class and RMSE,
      followed by a final row with Class 'Overall' and System 'Weighted\\nRMSE'
      holding the RMSE and MUE weighted by the number of ligands per system.
      The intervals of that row come from resampling whole systems.
      Pairs whose error is NaN are ignored.
    """
    columns = [
        "System", "RMSE (kcal/mol)", "RMSE lower", "RMSE upper", "Class",
        "n ligands", "MUE", "MUE lower", "MUE upper",
    ]
    rng = np.random.default_rng(random_state)

    def _bounds(samples):
        """2.5th and 97.5th percentiles, or NaN when nothing could be resampled."""
        if samples.size == 0 or np.isnan(samples).all():
            return np.nan, np.nan
        lower, upper = np.nanpercentile(samples, [2.5, 97.5])
        return lower, upper

    rmse_data = []
    for system in df["Class"].unique():
        system_df = df[df["Class"] == system]
        for target in system_df["System"].unique():
            target_df = system_df[system_df["System"] == target]
            errors = (target_df["OpenFE DDG (kcal/mol)"] - target_df["Exp DDG (kcal/mol)"]).to_numpy(dtype=float)
            errors = errors[~np.isnan(errors)]

            bootstrapped_rmse = np.full(n_bootstraps, np.nan)
            bootstrapped_mue = np.full(n_bootstraps, np.nan)
            if errors.size:
                rmse = np.sqrt(np.mean(errors ** 2))
                mue = np.mean(np.abs(errors))
                # bootstrap the pairs of this system
                for i in range(n_bootstraps):
                    sample = errors[rng.integers(0, errors.size, size=errors.size)]
                    bootstrapped_rmse[i] = np.sqrt(np.mean(sample ** 2))
                    bootstrapped_mue[i] = np.mean(np.abs(sample))
            else:
                rmse = mue = np.nan
            rmse_lower, rmse_upper = _bounds(bootstrapped_rmse)
            mue_lower, mue_upper = _bounds(bootstrapped_mue)
            rmse_data.append({
                "System": target,
                "RMSE (kcal/mol)": rmse,
                "RMSE lower": rmse_lower,
                "RMSE upper": rmse_upper,
                "Class": system,
                # a ligand can appear in either column, so count the union of both
                "n ligands": len(set(target_df["Ligand 1"]) | set(target_df["Ligand 2"])),
                "MUE": mue,
                "MUE lower": mue_lower,
                "MUE upper": mue_upper,
            })
    if not rmse_data:
        return pd.DataFrame(columns=columns)

    # now calculate the weighted RMSE / MUE and add them to the end of the dataframe
    rmse_df = pd.DataFrame(rmse_data, columns=columns)
    rmse_values = rmse_df["RMSE (kcal/mol)"].to_numpy(dtype=float)
    mue_values = rmse_df["MUE"].to_numpy(dtype=float)
    weights = rmse_df["n ligands"].to_numpy(dtype=float)

    def _weighted(rows):
        """Ligand-weighted RMSE and MUE over the systems selected by ``rows``."""
        total = weights[rows].sum()
        w_rmse = np.sqrt(np.nansum(rmse_values[rows] ** 2 * weights[rows]) / total)
        w_mue = np.nansum(mue_values[rows] * weights[rows]) / total
        return w_rmse, w_mue

    n_systems = len(rmse_df)
    weighted_rmse, weighted_mue = _weighted(np.arange(n_systems))
    # bootstrap the weighted statistics by resampling whole systems
    bootstrapped_weighted = np.array(
        [_weighted(rng.integers(0, n_systems, size=n_systems)) for _ in range(n_bootstraps)],
        dtype=float,
    ).reshape(-1, 2)
    lower_bound_weighted, upper_bound_weighted = _bounds(bootstrapped_weighted[:, 0])
    mue_lower_weighted, mue_upper_weighted = _bounds(bootstrapped_weighted[:, 1])

    # sort the per-system rows by class and RMSE, then append the summary row
    rmse_df = rmse_df.sort_values(by=["Class", "RMSE (kcal/mol)"])
    row_data = {
        "System": "Weighted\nRMSE",
        "RMSE (kcal/mol)": weighted_rmse,
        "RMSE lower": lower_bound_weighted,
        "RMSE upper": upper_bound_weighted,
        "Class": "Overall",
        "n ligands": int(rmse_df["n ligands"].sum()),
        "MUE": weighted_mue,
        "MUE lower": mue_lower_weighted,
        "MUE upper": mue_upper_weighted,
    }
    return pd.concat([rmse_df, pd.DataFrame([row_data])], ignore_index=True)

In [None]:
system_pairwise_rmse_private = calculate_system_pairwise_rmse(private_pairwise_diffs)

In [None]:
system_pairwise_rmse_private

In [None]:
# plot the pairwise RMSEs for the private datasets, grouped by class with the weighted overall RMSE last
rmse_table = system_pairwise_rmse_private
x = np.arange(len(rmse_table))

# Set up the figure
fig, ax = plt.subplots(figsize=(16, 8))
bar_width = 0.6

# Plot bars with asymmetric error bars (distance from the RMSE to the bootstrapped bounds)
rmse_err_openfe = [
    rmse_table['RMSE (kcal/mol)'] - rmse_table['RMSE lower'],
    rmse_table['RMSE upper'] - rmse_table['RMSE (kcal/mol)'],
]
ax.bar(x, rmse_table['RMSE (kcal/mol)'], yerr=rmse_err_openfe, width=bar_width, label='OpenFE',
       color='#009384', capsize=3)

# Set labels and ticks
ax.set_xticks(x)
names = rmse_table['System'].str.replace("_", " ")
ax.set_xticklabels(names, rotation=90, fontsize=8)
ax.set_ylabel(r"Pairwise $\Delta\Delta$G$_{calc}$ RMSE (kcal/mol)", fontsize=12)

# End position (exclusive) of every class's run of bars, in plotting order.
# The "Overall" row is left out of the running count so that no class needs a
# manual correction; this relies on the rows being grouped by class, with the
# "Overall" row last.
per_class_counts = rmse_table[rmse_table['Class'] != "Overall"].groupby('Class', sort=False).size()
class_bounds = per_class_counts.cumsum().to_dict()
unique_classes = list(class_bounds) + ["Overall"]
# one past the end: puts the "Overall" label half a bar to the right of its bar
class_bounds["Overall"] = len(rmse_table) + 1

class_conversion = {"bayer_macrocycles": "Bayer\nMacrocycles", "charge_annihilation_set": "Charge\nAnnihilation", "fragments": "Fragments", "jacs_set": "JACS", "janssen_bace": "Janssen", "merck": "Merck", "miscellaneous_set": "Misc", "scaffold_hopping_set": "Scaffold\nHopping", "water_set": "Water", "mcs_docking_set": "MCS\nDocking"}
class_start = 0
for cls in unique_classes:
    cls_name = class_conversion.get(cls, cls)
    end = class_bounds[cls]
    # dashed separator on the left of every class except the first one
    if class_start > 0:
        ax.axvline(class_start - 0.5, linestyle='--', linewidth=2)
    center = (class_start + end - 1) / 2
    ax.text(center, ax.get_ylim()[1] + 0.1, cls_name, ha='center', va='bottom', fontsize=10, weight='bold')
    class_start = end

plt.tight_layout()
plt.xlim(-0.5, len(rmse_table) - 0.5)
plt.savefig("per_system_pairwise_rmse_private.png", dpi=300, bbox_inches='tight')

In [None]:
# plot the ecdf of the absolute pairwise DDG errors for openfe on the private data
fig, ax = plt.subplots(figsize=(8, 6))
# calculate the abs errors
private_error = np.abs(private_pairwise_diffs["OpenFE DDG (kcal/mol)"] - private_pairwise_diffs["Exp DDG (kcal/mol)"])
sns.ecdfplot(private_error, label="Private", ax=ax, color='#009384')
# annotate the fraction of pairs with an error below 1, 2 and 3 kcal/mol
for threshold in (1, 2, 3):
    fraction_below = np.sum(private_error < threshold) / len(private_error)
    # vertical guide from the x axis up to the curve
    # (ymax is an axes fraction, so this assumes the y axis spans 0 to 1)
    ax.axvline(x=threshold, ymax=fraction_below, color='k', linestyle='--', linewidth=2)
    # horizontal guide from the y axis across to the curve
    ax.plot([0, threshold], [fraction_below, fraction_below], color='#009384', linestyle='--', linewidth=2)
    # percentage label just above the horizontal guide
    ax.text(0.1, fraction_below + 0.02, f"{fraction_below:.2%}", color='#009384', fontsize=13, zorder=10)
# set the axis labels
ax.set_xlabel(r"Pairwise $|\Delta\Delta$G$_{calc}-\Delta\Delta$G$_{exp}|$ (kcal/mol)", fontdict={"fontsize": 15})
ax.set_ylabel("Cumulative Probability", fontdict={"fontsize": 15})
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=15)
plt.xlim(left=0)
plt.tight_layout()
plt.savefig("ecdf_pairwise_differences_private.png", dpi=300, bbox_inches='tight')

In [None]:
# show the ten largest absolute pairwise errors without re-ordering private_error in place
private_error.sort_values(ascending=False).head(10)

In [None]:
# calculate the probability of the correct sign for the pairwise differences for the private data
def bin_sign_correctness(data, bin_size=1.0, n_bootstraps=1000, random_state=None):
    """
    Bin the absolute experimental DDG values and estimate, per bin, the
    probability that the calculated DDG has the same sign as experiment.

    Parameters:
    - data: DataFrame with 'Exp DDG (kcal/mol)' and 'OpenFE DDG (kcal/mol)'
      columns. It is not modified. Rows with a missing experimental value are
      ignored.
    - bin_size: Width of the |Exp DDG| bins; bins are half-open [left, right).
    - n_bootstraps: Number of bootstrap resamples per bin.
    - random_state: Seed (or anything accepted by np.random.default_rng) for
      reproducible intervals; None draws fresh entropy on every call.

    Returns:
    - DataFrame with one row per non-empty bin, ordered by bin: 'bin' (left
      edge), 'OpenFE Probability' (mean of the bootstrapped probabilities) and
      'OpenFE Probability Lower' / 'OpenFE Probability Upper' (2.5th / 97.5th
      percentiles).
    """
    columns = [
        'bin',
        'OpenFE Probability',
        'OpenFE Probability Lower',
        'OpenFE Probability Upper',
    ]
    rng = np.random.default_rng(random_state)
    exp_ddg = data['Exp DDG (kcal/mol)'].to_numpy(dtype=float)
    calc_ddg = data['OpenFE DDG (kcal/mol)'].to_numpy(dtype=float)
    abs_exp_ddg = np.abs(exp_ddg)
    has_exp = ~np.isnan(abs_exp_ddg)
    if not has_exp.any():
        return pd.DataFrame(columns=columns)

    # Create bins; the last edge must lie strictly above the largest value so that,
    # with half-open bins, the maximum still falls inside a bin
    max_value = abs_exp_ddg[has_exp].max()
    n_bins = int(np.floor(max_value / bin_size)) + 1
    if n_bins * bin_size <= max_value:
        n_bins += 1
    edges = np.arange(n_bins + 1) * bin_size

    # Bin the data (integer bin index per row, NaN where there is no experimental value)
    bin_index = pd.cut(pd.Series(abs_exp_ddg), bins=edges, right=False, labels=False).to_numpy(dtype=float)
    sign_matches = np.sign(calc_ddg) == np.sign(exp_ddg)

    # Calculate probabilities
    results = []
    for index in np.unique(bin_index[~np.isnan(bin_index)]):
        in_bin = bin_index == index
        total = int(in_bin.sum())
        n_correct = int(sign_matches[in_bin].sum())
        # Resampling `total` rows with replacement and counting the correct ones is a
        # Binomial(total, n_correct / total) draw, so sample the count directly
        # instead of materialising every resampled subset.
        openfe_probs = rng.binomial(total, n_correct / total, size=n_bootstraps) / total
        results.append({
            'bin': float(edges[int(index)]),
            'OpenFE Probability': np.mean(openfe_probs),
            'OpenFE Probability Lower': np.percentile(openfe_probs, 2.5),
            'OpenFE Probability Upper': np.percentile(openfe_probs, 97.5),
        })
    return pd.DataFrame(results, columns=columns)
# Calculate the binned probabilities
binned_probabilities = bin_sign_correctness(private_pairwise_diffs, bin_size=0.5)

In [None]:

# Plot the binned sign-correctness probabilities as a bar plot with bootstrapped error bars.
# NOTE(review): the 0.5 kcal/mol bin width used when binned_probabilities was built is
# hard-coded below (the 0.5 step, the "*2" factors and the tick positions); all of them
# must change together if the bin size changes.
fig, ax = plt.subplots(figsize=(10, 6))
binned = binned_probabilities.copy(deep=True)
# add NaN padding rows for bins that have no data, so every 0.5-wide bin from 0 up to the largest one is present
all_bins = binned['bin'].unique()
# sort the bins
sorted_bins = np.sort(all_bins)
new_bins = np.arange(0, sorted_bins.max() + 0.5, 0.5)
data_to_add = []
for b in new_bins:
    if b not in sorted_bins:
        data_to_add.append({
            'bin': b,
            'OpenFE Probability': np.nan,
            'OpenFE Probability Lower': np.nan,
            'OpenFE Probability Upper': np.nan
        })
# NOTE: this rebinds binned_probabilities to the padded table
binned_probabilities = pd.concat([binned_probabilities, pd.DataFrame(data_to_add)], ignore_index=True)
# presumably seaborn draws one bar per distinct bin at positions 0, 1, 2, ... in sorted bin order - TODO confirm
sns.barplot(data=binned_probabilities, x='bin', y='OpenFE Probability', ax=ax, color='#009384', width=1.0)
ax.set_xlabel(r"|$\Delta\Delta$G$_{exp}$| (kcal/mol)", fontsize=14)
ax.set_ylabel(r"Probability Correct pairwise $\Delta\Delta$G$_{calc}$ Sign", fontsize=14)
# tick positions 1.5, 3.5, ..., 19.5 are labelled 1..10 below; with unit-wide bars centred on
# bin * 2 these positions would correspond to 1, 2, ... kcal/mol (assumes at most ten labels are needed)
x_ticks = np.arange(1.5, 20.5, 2)
# add error bars for the probabilities; bin * 2 turns a bin's left edge into its position for 0.5-wide bins
ax.errorbar(binned_probabilities['bin'] *2, binned_probabilities['OpenFE Probability'],
            yerr=[binned_probabilities['OpenFE Probability'] - binned_probabilities['OpenFE Probability Lower'],
                  binned_probabilities['OpenFE Probability Upper'] - binned_probabilities['OpenFE Probability']],
            fmt='none', color='black', capsize=5)
# add scatter points for the probabilities
ax.scatter(binned_probabilities['bin'] *2, binned_probabilities['OpenFE Probability'], color='#009384', edgecolor='black', s=50)
# debugging output: the requested tick positions and the ticks currently on the axis
print(x_ticks)
print(ax.get_xticks())
ax.set_xticks(x_ticks)
ax.set_xticklabels([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], fontsize=12)
# plt.xlim((-0.5,14.5))
# plt.legend(fontsize=12)
plt.tight_layout()
# the y axis starts at 0.4, so bars do not start from zero
plt.ylim(bottom=0.4)
plt.xlim(left=-0.5)
# save the figure
plt.savefig("binned_probabilities_private.png", dpi=300, bbox_inches='tight')

In [None]:
# plot the inter repeat range in the DDG predictions as an ecdf, for all edges and for
# the edges whose smallest overlap value is at least 0.03 in every leg and repeat
fig, ax = plt.subplots(figsize=(8, 6))
for dataset, label in [(normal_edge_data, "Public"), (private_edge_data, "Private")]:
    all_ranges = []
    good_overlap = []
    for _, row in dataset.iterrows():
        data = []
        # get the overlaps for the solvent and complex repeats
        overlaps = [row[f"solvent_repeat_{i}_smallest_overlap"] for i in range(3)]
        for i in range(3):
            overlaps.append(row[f"complex_repeat_{i}_smallest_overlap"])

        # calculate the ddg for each repeat (complex leg minus solvent leg)
        for i in range(3):
            ddg = row[f"complex_repeat_{i}_DG (kcal/mol)"] - row[f"solvent_repeat_{i}_DG (kcal/mol)"]
            data.append(ddg)

        # NOTE(review): Python's max()/min() give order-dependent results if a repeat is NaN - confirm no NaNs reach here
        ddg_range = max(data) - min(data)
        # append the range to the all_ranges list
        all_ranges.append(ddg_range)
        # if the overlaps are all at least 0.03, append the same range to the good_overlap list
        # NOTE(review): the test is ">=" while the printed text and legend label below say "> 0.03"
        if min(overlaps) >= 0.03:
            good_overlap.append(max(data) - min(data))

    # workout the percentage of edges with a repeat range less than 1 kcal/mol
    all_ranges = np.array(all_ranges)
    good_overlap = np.array(good_overlap)
    below_1 = np.sum(all_ranges < 1) / len(all_ranges)
    # NOTE(review): this divides by zero if no edge passes the overlap filter
    below_1_overlap = np.sum(good_overlap < 1) / len(good_overlap)
    print(f"{label} - Percentage of edges with repeat range < 1 kcal/mol: {below_1:.2%}")
    print(f"{label} - Percentage of edges with repeat range < 1 kcal/mol and overlap > 0.03: {below_1_overlap:.2%}")
    colour = sns.color_palette()[0] if label == "Public" else sns.color_palette()[1]
    # remove the outlier values from all_ranges before plotting; the percentages above still include them
    all_ranges = [x for x in all_ranges if x < 300]  # filter out values greater than 300
    sns.ecdfplot(all_ranges, ax=ax, label=f"{label}-all", linewidth=2, color=colour)
    # plot with the same colour but dash line for good overlap
    good_overlap = [x for x in good_overlap if x < 300]  # filter out values greater than 300
    sns.ecdfplot(good_overlap, ax=ax, label=f"{label}-overlap > 0.03", linestyle='--', linewidth=2, color=colour)
ax.set_xlabel(r"|$\Delta\Delta$G$_{calc}$| repeat range (kcal/mol)", fontsize=12)
ax.set_ylabel("Cumulative Probability", fontsize=12)
# set the axis ticks fontsize
ax.tick_params(axis='both', which='major', labelsize=12)
plt.legend(fontsize=12)
plt.savefig("ddg_repeat_range_ecdf.png", dpi=300, bbox_inches='tight')

In [None]:
# add the inter repeat range and the smallest overlap to the public and private edge data
def add_inter_repeat_range_and_overlap(edge_data, n_repeats=3):
    """
    Return a copy of ``edge_data`` with the per-edge spread between repeats
    and the worst overlap value added as columns.

    Parameters:
    - edge_data: DataFrame with, for every repeat i in 0..n_repeats-1, the
      columns 'complex_repeat_{i}_DG (kcal/mol)',
      'solvent_repeat_{i}_DG (kcal/mol)', 'complex_repeat_{i}_smallest_overlap'
      and 'solvent_repeat_{i}_smallest_overlap'. It is not modified, so it is
      safe to pass a filtered slice of another frame.
    - n_repeats: Number of repeats per edge.

    Returns:
    - Copy of ``edge_data`` (same index) with two extra columns:
      'inter_repeat_range', the max minus min over repeats of
      (complex DG - solvent DG), and 'smallest_overlap', the minimum overlap
      over both legs and all repeats. Both are NaN if any contributing value
      is NaN.
    """
    repeats = range(n_repeats)
    # one column per repeat: DDG = complex leg - solvent leg
    ddg = np.column_stack([
        edge_data[f"complex_repeat_{i}_DG (kcal/mol)"].to_numpy(dtype=float)
        - edge_data[f"solvent_repeat_{i}_DG (kcal/mol)"].to_numpy(dtype=float)
        for i in repeats
    ])
    # one column per (leg, repeat) overlap value
    overlaps = np.column_stack([
        edge_data[f"{leg}_repeat_{i}_smallest_overlap"].to_numpy(dtype=float)
        for leg in ("solvent", "complex")
        for i in repeats
    ])
    # work on a copy so the caller's frame (possibly a slice) is never written to
    result = edge_data.copy()
    result["inter_repeat_range"] = ddg.max(axis=1) - ddg.min(axis=1)
    result["smallest_overlap"] = overlaps.min(axis=1)
    return result
# Add the inter repeat range and smallest overlap to the edge data
normal_edge_data = add_inter_repeat_range_and_overlap(normal_edge_data)
private_edge_data = add_inter_repeat_range_and_overlap(private_edge_data)
# Sort the edge data by the inter repeat range
# NOTE(review): the public table is sorted largest-first (ascending=False) while the private
# table uses the default sort order - confirm this asymmetry is intended.
normal_edge_data.sort_values(by="inter_repeat_range", inplace=True, ascending=False)
private_edge_data.sort_values(by="inter_repeat_range", inplace=True)

In [None]:
normal_edge_data.iloc[9]

In [None]:
# for each repeat and the average calculate the RMSE MUE and Ktau for the DG values and the DDG edgewise and pairwise errors 
from collections import defaultdict
from cinnabar import FEMap
from cinnabar.stats import bootstrap_statistic
from openff.units import unit
def calculate_dgs(edge_df, dg_df, public=True) -> pd.DataFrame:
    """
    Estimate per-ligand DG values separately for each of the three repeats.

    For every target, the relative results of one repeat are loaded into a
    cinnabar FEMap, converted to absolute values and shifted by the mean
    experimental DG of that target. The repeat-averaged and experimental values
    are copied from ``dg_df``.

    Parameters:
    - edge_df: Edge table with 'ligand_A', 'ligand_B' and, for i in 0..2,
      '{complex,solvent}_repeat_{i}_DG (kcal/mol)' and
      '{complex,solvent}_repeat_{i}_dDG (kcal/mol)', plus the grouping columns
      selected by ``public``.
    - dg_df: Per-ligand table with 'ligand name', 'Exp DG (kcal/mol)',
      'Exp dDG (kcal/mol)', 'DG (kcal/mol)', 'uncertainty (kcal/mol)' and the
      same grouping columns.
    - public: True to group by 'system group' / 'system name', False to group
      by 'partner_id' / 'dataset_name'.

    Returns:
    - DataFrame with one row per ligand and target. The grouping is always
      written to 'system group' / 'system name', whatever ``public`` is.
      Ligands without an experimental entry keep their per-repeat values and
      get NaN in the experimental and average columns.
    """
    if public:
        group_name = "system group"
        system_name = "system name"
    else:
        group_name = "partner_id"
        system_name = "dataset_name"
    # fallback used when the combined uncertainty is tiny or not finite
    default_uncertainty = 0.1 * unit.kilocalorie_per_mole
    calculated_dgs = []
    for system in edge_df[group_name].unique():
        # get the edges for this system
        system_df = edge_df[edge_df[group_name] == system].copy(deep=True).reset_index(drop=True)
        for target in system_df[system_name].unique():
            print(f"Calculating DGs for {target} in {system}")
            # get the edges and the experimental data for this target
            target_df = system_df[(system_df[system_name] == target)].copy(deep=True).reset_index(drop=True)
            exp_target_data = dg_df[(dg_df[system_name] == target) & (dg_df[group_name] == system)].copy(deep=True).reset_index(drop=True)
            # offset added to the calculated absolute values so they sit on the experimental scale
            # (presumably cinnabar returns them centred on zero - TODO confirm)
            exp_shift = exp_target_data["Exp DG (kcal/mol)"].mean()
            target_data = {}
            for i in range(3):
                # build the network of relative results for this repeat
                fe_map = FEMap()
                for _, row in target_df.iterrows():
                    complex_dg = row[f"complex_repeat_{i}_DG (kcal/mol)"]
                    complex_error = row[f"complex_repeat_{i}_dDG (kcal/mol)"]
                    solvent_dg = row[f"solvent_repeat_{i}_DG (kcal/mol)"]
                    solvent_error = row[f"solvent_repeat_{i}_dDG (kcal/mol)"]
                    uncertainty = (complex_error**2 + solvent_error**2)**0.5 * unit.kilocalorie_per_mole
                    if uncertainty < 0.01 * unit.kilocalorie_per_mole:
                        uncertainty = default_uncertainty
                    fe_map.add_relative_calculation(
                        value=(complex_dg - solvent_dg) * unit.kilocalorie_per_mole,
                        uncertainty=uncertainty if np.isfinite(uncertainty) else default_uncertainty,
                        labelA=row["ligand_A"],
                        labelB=row["ligand_B"],
                    )
                # calculate the absolute DG values
                fe_map.generate_absolute_values()
                abs_df = fe_map.get_absolute_dataframe()
                # write them to the target data
                for _, abs_row in abs_df.iterrows():
                    label = abs_row["label"]
                    if label not in target_data:
                        entry = {"system group": system, "system name": target, "ligand name": label}
                        # add the exp data and the average calculated dg
                        matches = exp_target_data[exp_target_data["ligand name"] == label]
                        if matches.empty:
                            print(label, "not found in experimental data")
                            entry["exp DG (kcal/mol)"] = np.nan
                            entry["exp dDG (kcal/mol)"] = np.nan
                            entry["average DG (kcal/mol)"] = np.nan
                            entry["average dDG (kcal/mol)"] = np.nan
                        else:
                            avg_data = matches.iloc[0]
                            entry["exp DG (kcal/mol)"] = avg_data["Exp DG (kcal/mol)"]
                            entry["exp dDG (kcal/mol)"] = avg_data["Exp dDG (kcal/mol)"]
                            entry["average DG (kcal/mol)"] = avg_data["DG (kcal/mol)"]
                            entry["average dDG (kcal/mol)"] = avg_data["uncertainty (kcal/mol)"]
                        target_data[label] = entry
                    # always record this repeat, even for ligands without experimental data, so
                    # that every repeat column is filled consistently
                    target_data[label][f"repeat_{i}_DG (kcal/mol)"] = abs_row["DG (kcal/mol)"] + exp_shift
                    target_data[label][f"repeat_{i}_dDG (kcal/mol)"] = abs_row["uncertainty (kcal/mol)"]
            calculated_dgs.extend(list(target_data.values()))
    # create a new dataframe with one row per ligand
    calculated_dgs = pd.DataFrame(calculated_dgs)
    return calculated_dgs

                

In [None]:
public_dg_data

In [None]:
per_repeat_dgs_public = calculate_dgs(normal_edge_data, public_dg_data, public=True)

In [None]:
# calculate the all-to-all pairwise differences for each repeat and the average in a new df
def calculate_pairwise_diffs(dg_df, public=True) -> pd.DataFrame:
    """
    Build the all-to-all ligand-pair DDG table for the average and each repeat.

    Every ordered pair of distinct ligands within a target is emitted, so both
    (A, B) and (B, A) appear; differences are ligand 2 minus ligand 1 and the
    uncertainties are combined in quadrature.

    Parameters:
    - dg_df: DataFrame with one row per ligand and the columns 'ligand name',
      'exp DG (kcal/mol)', 'average DG (kcal/mol)', 'average dDG (kcal/mol)'
      and, for i in 0..2, 'repeat_{i}_DG (kcal/mol)' and
      'repeat_{i}_dDG (kcal/mol)', plus the grouping columns selected by
      ``public``. If a ligand name is repeated within a target, its first row
      is used.
    - public: True to group by 'system group' / 'system name', False to group
      by 'partner_id' / 'dataset_name'.

    Returns:
    - DataFrame with 'system group', 'system name', 'Ligand 1', 'Ligand 2',
      'Exp DDG (kcal/mol)', 'average DDG (kcal/mol)',
      'average DDG uncertainty (kcal/mol)' and, per repeat,
      'repeat_{i} DDG (kcal/mol)' and 'repeat_{i} DDG uncertainty (kcal/mol)'.
    """
    if public:
        group_name = "system group"
        system_name = "system name"
    else:
        group_name = "partner_id"
        system_name = "dataset_name"
    pairwise_diffs = []
    for system in dg_df[group_name].unique():
        system_df = dg_df[dg_df[group_name] == system]
        for target in system_df[system_name].unique():
            target_df = system_df[system_df[system_name] == target]
            # look every ligand up once instead of re-filtering the frame for each pair
            ligand_rows = target_df.drop_duplicates(subset="ligand name", keep="first").to_dict("records")
            for first, row1 in enumerate(ligand_rows):
                for second, row2 in enumerate(ligand_rows):
                    if first == second:  # skip self-comparisons
                        continue
                    new_row = {
                        "system group": system,
                        "system name": target,
                        "Ligand 1": row1["ligand name"],
                        "Ligand 2": row2["ligand name"],
                        "Exp DDG (kcal/mol)": row2["exp DG (kcal/mol)"] - row1["exp DG (kcal/mol)"],
                        "average DDG (kcal/mol)": row2["average DG (kcal/mol)"] - row1["average DG (kcal/mol)"],
                        "average DDG uncertainty (kcal/mol)": (row1["average dDG (kcal/mol)"]**2 + row2["average dDG (kcal/mol)"]**2)**0.5,
                    }
                    # the repeat counter has its own name: reusing the ligand index here
                    # would corrupt the self-comparison check above
                    for repeat in range(3):
                        dg_key = f"repeat_{repeat}_DG (kcal/mol)"
                        error_key = f"repeat_{repeat}_dDG (kcal/mol)"
                        new_row[f"repeat_{repeat} DDG (kcal/mol)"] = row2[dg_key] - row1[dg_key]
                        new_row[f"repeat_{repeat} DDG uncertainty (kcal/mol)"] = (row1[error_key]**2 + row2[error_key]**2)**0.5
                    pairwise_diffs.append(new_row)
    return pd.DataFrame(pairwise_diffs)


In [None]:
public_pariwise_diffs = calculate_pairwise_diffs(per_repeat_dgs_public, public=True)

In [None]:
public_pariwise_diffs

In [None]:
# do the same for the private data
per_repeat_dgs_private = calculate_dgs(private_edge_data, private_dg_data, public=False)
# public=True is deliberate: calculate_dgs writes its output under the "system group" /
# "system name" keys even when it is given private input.
# NOTE: this rebinds private_pairwise_diffs, replacing the table built earlier with
# calc_pairwise_differences, which has different columns.
private_pairwise_diffs = calculate_pairwise_diffs(per_repeat_dgs_private, public=True)
private_pairwise_diffs

In [None]:
# ECDF of the absolute pairwise error for every repeat and for the repeat average;
# colour encodes the repeat / average, line style encodes public (solid) vs private (dashed)
fig, ax = plt.subplots(figsize=(8, 6))
colors = sns.color_palette("rocket", 4)
for dataset, label in [(public_pariwise_diffs, "Public"), (private_pairwise_diffs, "Private")]:
    line_style = '-' if label == "Public" else '--'
    exp_ddg = dataset["Exp DDG (kcal/mol)"]
    for i in range(3):
        # absolute error of this repeat against experiment
        abs_error = np.abs(dataset[f"repeat_{i} DDG (kcal/mol)"] - exp_ddg)
        sns.ecdfplot(abs_error, ax=ax, label=f"Repeat {i+1} {label}", linewidth=2, color=colors[i], linestyle=line_style)
    # absolute error of the repeat-averaged estimate
    average_errors = np.abs(dataset["average DDG (kcal/mol)"] - exp_ddg)
    sns.ecdfplot(average_errors, ax=ax, label=f"Average {label}", linewidth=2, linestyle=line_style, color=colors[3])
ax.set_xlabel(r"Pairwise |$\Delta\Delta$G$_{calc} - \Delta\Delta$G$_{exp}$| (kcal/mol)", fontsize=14)
ax.set_ylabel("Cumulative Probability", fontsize=14)

plt.legend(fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# zoom in on the 0-5 kcal/mol region
plt.xlim((0, 5))
plt.savefig("pairwise_ddg_ecdf_zoom.png", dpi=300, bbox_inches='tight')

In [None]:
# calculate RMSE and MUE statistics for the pairwise errors and create a summary table
def _bootstrap_rmse_mue(calc_ddg, exp_ddg, n_bootstraps=1000):
    """
    Bootstrap the RMSE and MUE between two equally long arrays.

    Returns a dict with the mean of the bootstrapped values ('RMSE', 'MUE') and
    their 2.5th / 97.5th percentiles ('... lower', '... upper').
    """
    mue, rmse = np.zeros(n_bootstraps), np.zeros(n_bootstraps)
    for b in range(n_bootstraps):
        subset = np.random.choice(len(exp_ddg), size=len(exp_ddg), replace=True)
        mue[b] = np.mean(np.abs(calc_ddg[subset] - exp_ddg[subset]))
        rmse[b] = np.sqrt(np.mean((calc_ddg[subset] - exp_ddg[subset])**2))
    return {
        "RMSE": np.mean(rmse),
        "RMSE lower": np.percentile(rmse, 2.5),
        "RMSE upper": np.percentile(rmse, 97.5),
        "MUE": np.mean(mue),
        "MUE lower": np.percentile(mue, 2.5),
        "MUE upper": np.percentile(mue, 97.5),
    }


# (value of the "repeat" column, name used in the progress message, column with the predictions)
estimates = [(i + 1, f"repeat {i+1}", f"repeat_{i} DDG (kcal/mol)") for i in range(3)]
estimates.append(("average", "average", "average DDG (kcal/mol)"))

pair_wise_stats = []
for dataset, label in [(public_pariwise_diffs, "Public"), (private_pairwise_diffs, "Private")]:
    for repeat_id, description, column in estimates:
        print(f"Calculating statistics for {description} in {label}")
        # only use pairs that have both a prediction and an experimental value
        usable = dataset[column].notna() & dataset["Exp DDG (kcal/mol)"].notna()
        temp_ds = dataset[usable]
        stats_row = {"repeat": repeat_id, "dataset": label}
        stats_row.update(_bootstrap_rmse_mue(
            temp_ds[column].values,
            temp_ds["Exp DDG (kcal/mol)"].values,
        ))
        pair_wise_stats.append(stats_row)
pair_wise_stats = pd.DataFrame(pair_wise_stats)
pair_wise_stats

In [None]:
# get a list of the public and private datasets which follow the Hahn benchmarking convention:
# at least 16 ligands and an experimental range of at least 3 kcal/mol
def _hahn_compliant_targets(dg_table):
    """Return the (system group, system name) pairs in ``dg_table`` with at least
    16 rows and an 'exp DG (kcal/mol)' range of at least 3."""
    selected = []
    for system in dg_table["system group"].unique():
        system_df = dg_table[dg_table["system group"] == system].copy(deep=True).reset_index(drop=True)
        for target in system_df["system name"].unique():
            target_df = system_df[(system_df["system name"] == target)].copy(deep=True).reset_index(drop=True)
            exp_values = target_df["exp DG (kcal/mol)"]
            if len(target_df) >= 16 and exp_values.max() - exp_values.min() >= 3:
                selected.append((system, target))
    return selected


public_hahn_datasets = _hahn_compliant_targets(per_repeat_dgs_public)
private_hahn_datasets = _hahn_compliant_targets(per_repeat_dgs_private)
print("Public Hahn datasets:", public_hahn_datasets)
print("Private Hahn datasets:", private_hahn_datasets)

In [None]:
# number of public and private datasets that passed the Hahn convention filter
len(public_hahn_datasets), len(private_hahn_datasets)

In [None]:
# Kendall tau between calculated and experimental DG for every Hahn-convention dataset,
# bootstrapped for each of the three repeats and then for the repeat average
ktau_records = []
dataset_specs = [
    (per_repeat_dgs_public, "Public", public_hahn_datasets),
    (per_repeat_dgs_private, "Private", private_hahn_datasets),
]
for dataset, label, valid_systems in dataset_specs:
    for system, target in valid_systems:
        # keep only this target's ligands that have an experimental value
        in_target = (dataset["system group"] == system) & (dataset["system name"] == target)
        has_exp = dataset["exp DG (kcal/mol)"].notna()
        target_df = dataset[in_target & has_exp].copy(deep=True).reset_index(drop=True)

        # repeats are stored 0-indexed in the column names but reported 1-indexed; the average comes last
        column_by_repeat = {index + 1: f"repeat_{index}_DG (kcal/mol)" for index in range(3)}
        column_by_repeat["average"] = "average DG (kcal/mol)"

        for repeat_label, dg_column in column_by_repeat.items():
            predicted_dg = target_df[dg_column].values
            experimental_dg = target_df["exp DG (kcal/mol)"].values
            s = stats.bootstrap_statistic(
                y_pred=predicted_dg, y_true=experimental_dg, statistic="KTAU", nbootstrap=1000, ci=0.95
            )
            ktau_records.append({
                "system group": system,
                "system name": target,
                "repeat": repeat_label,
                "dataset": label,
                "Kendall Tau": s["mle"],
                "Kendall Tau lower": s["low"],
                "Kendall Tau upper": s["high"],
                "N_ligs": len(target_df),
            })
ktau_data = pd.DataFrame(ktau_records)
ktau_data
        

In [None]:
# Ligand-count-weighted Kendall tau over the targets of the public and private datasets,
# for each repeat and for the repeat average
public_ktau = ktau_data[ktau_data["dataset"] == "Public"].copy(deep=True).reset_index(drop=True)
private_ktau = ktau_data[ktau_data["dataset"] == "Private"].copy(deep=True).reset_index(drop=True)


def _weighted_tau_summary(per_target_tau, label, repeat_label):
    """Weighted Kendall tau for one dataset/repeat with a bootstrap (over targets) 95% interval."""
    rows = per_target_tau[per_target_tau["repeat"] == repeat_label].copy(deep=True).reset_index(drop=True)

    def weighted_mean(frame):
        # each target's tau is weighted by its number of ligands
        return (frame["Kendall Tau"] * frame["N_ligs"]).sum() / frame["N_ligs"].sum()

    point_estimate = weighted_mean(rows)
    # resample the targets with replacement to get the lower and upper bounds
    resampled = [weighted_mean(rows.sample(n=len(rows), replace=True)) for _ in range(1000)]
    return {
        "dataset": label,
        "repeat": repeat_label,
        "weighted Kendall Tau": point_estimate,
        "weighted Kendall Tau lower": np.percentile(resampled, 2.5),
        "weighted Kendall Tau upper": np.percentile(resampled, 97.5),
    }


# the three repeats first (public then private for each), followed by the average
tau_table = pd.DataFrame([
    _weighted_tau_summary(per_target_tau, label, repeat_label)
    for repeat_label in (1, 2, 3, "average")
    for per_target_tau, label in ((public_ktau, "Public"), (private_ktau, "Private"))
])


In [None]:
# weighted Kendall tau with its 2.5/97.5 percentile bootstrap bounds, per dataset and repeat
tau_table

In [None]:
# Paired Wilcoxon signed-rank test on the per-target Kendall tau values of the public datasets:
# repeat average vs repeat 3
from scipy.stats import wilcoxon  # imported here because the notebook's other import of wilcoxon is in a later cell

average_tau = public_ktau[public_ktau["repeat"] == "average"]["Kendall Tau"].values
repeat_3_tau = public_ktau[public_ktau["repeat"] == 3]["Kendall Tau"].values
stat, p = wilcoxon(average_tau, repeat_3_tau)
print(f"Wilcoxon test for public average vs repeat 3: statistic={stat}, p-value={p}")

In [None]:
# inspect the per-ligand DG rows of the public jacs_set / ptp1b target
per_repeat_dgs_public[(per_repeat_dgs_public["system group"] == "jacs_set") & (per_repeat_dgs_public["system name"] == "ptp1b")]

In [None]:
# edges of the public jacs_set / ptp1b target
ptp1b_public_edges = normal_edge_data[(normal_edge_data["system group"] == "jacs_set") & (normal_edge_data["system name"] == "ptp1b")].copy(deep=True).reset_index(drop=True)
# pull out the complex and solvent leg DG values stored under repeat index 2
# NOTE(review): the two arrays are extracted but never combined into a DDG in this cell
complex_dg = ptp1b_public_edges["complex_repeat_2_DG (kcal/mol)"].values
solvent_dg = ptp1b_public_edges["solvent_repeat_2_DG (kcal/mol)"].values
# show the repeat index 2 leg values next to the "failed" flag of each edge
ptp1b_public_edges[["complex_repeat_2_DG (kcal/mol)", "solvent_repeat_2_DG (kcal/mol)", "failed"]] 

In [None]:
# per-target Kendall tau rows labelled repeat 3 in the public datasets
public_ktau[public_ktau["repeat"] == 3]

In [None]:
from scipy.stats import wilcoxon
# Wilcoxon signed-rank test comparing the pairwise DDG predictions of a single repeat with the repeat average
def wilcoxon_signed_rank_test(df, repeat):
    """Run a paired Wilcoxon signed-rank test of one repeat against the repeat average and plot it.

    Parameters
    ----------
    df : DataFrame with "average DDG (kcal/mol)" and "repeat_<repeat> DDG (kcal/mol)" columns.
    repeat : zero-based repeat index used in the column name. Every label shown to the reader
        (printed message, plot title, axis label) uses the one-based number ``repeat + 1``.

    The test statistic and p-value are printed and a two panel figure is drawn (scatter of repeat vs
    average, and a histogram of the differences with a bootstrapped 95% CI of the mean difference).
    Nothing is returned.
    """
    repeat_column = f"repeat_{repeat} DDG (kcal/mol)"
    average_column = "average DDG (kcal/mol)"
    repeat_label = repeat + 1

    # remove entries where either prediction is missing so the samples stay paired
    temp_df = df[(df[repeat_column].notna()) & (df[average_column].notna())].copy(deep=True).reset_index(drop=True)
    average_ddg = temp_df[average_column].values
    repeat_ddg = temp_df[repeat_column].values

    # perform the wilcoxon signed-rank test
    stat, p_value = wilcoxon(average_ddg, repeat_ddg)
    print(f"Wilcoxon signed-rank test for repeat {repeat_label}: statistic={stat}, p-value={p_value}")

    # bootstrap the 95% confidence interval of the mean difference between the repeat and the average
    diffs = repeat_ddg - average_ddg
    n_bootstrap = 1000
    n_edges = len(diffs)
    boot_diffs = [np.mean(np.random.choice(diffs, size=n_edges, replace=True)) for _ in range(n_bootstrap)]
    ci_lower = np.percentile(boot_diffs, 2.5)
    ci_upper = np.percentile(boot_diffs, 97.5)
    mean_diff = np.mean(diffs)

    # Plot: repeat vs average predictions and the histogram of their differences
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Scatter plot of predictions with the y = x line spanning both axes (padded by 1 kcal/mol)
    axs[0].scatter(average_ddg, repeat_ddg, color='#009384', edgecolor='k')
    min_val = min(np.min(average_ddg), np.min(repeat_ddg)) - 1
    max_val = max(np.max(average_ddg), np.max(repeat_ddg)) + 1
    axs[0].plot([min_val, max_val], [min_val, max_val], 'k--', label='y = x')
    axs[0].set_xlabel(r'Average ∆∆G$_{calc}$ (kcal/mol)', fontdict={"fontsize": 12})
    axs[0].set_ylabel(r'Repeat ∆∆G$_{calc}$ (kcal/mol)', fontdict={"fontsize": 12})
    axs[0].tick_params(axis='x', labelsize=12)
    axs[0].tick_params(axis='y', labelsize=12)
    axs[0].set_title(f'Comparison of Predictions for Repeat {repeat_label} vs Average', fontdict={"fontsize": 14})
    axs[0].legend()
    axs[0].grid(True)

    # Histogram of differences with the bootstrap CI and the mean marked
    sns.histplot(diffs, bins=15, kde=True, ax=axs[1], color='#009384')
    axs[1].axvline(ci_lower, color='red', linestyle='--', label=f"95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]", linewidth=2)
    axs[1].axvline(ci_upper, color='red', linestyle='--', linewidth=2)
    axs[1].axvline(mean_diff, color='black', linestyle='-', label=f"Mean diff: {mean_diff:.3f}")
    axs[1].set_xlabel(f'∆∆G (Repeat {repeat_label} - Average)', fontdict={"fontsize": 12})
    axs[1].set_ylabel('Frequency', fontdict={"fontsize": 12})
    axs[1].set_title('Distribution of Prediction Differences', fontdict={"fontsize": 14})
    axs[1].tick_params(axis='x', labelsize=12)
    axs[1].tick_params(axis='y', labelsize=12)
    axs[1].legend(fontsize=12)
    axs[1].grid(True)

    plt.tight_layout()


In [None]:
# public pairwise differences: test the "repeat_1 DDG (kcal/mol)" column against the repeat average
wilcoxon_signed_rank_test(public_pariwise_diffs, 1)

In [None]:
# private pairwise differences: test the "repeat_2 DDG (kcal/mol)" column against the repeat average
wilcoxon_signed_rank_test(private_pairwise_diffs, 2)

In [None]:
# Bin the absolute experimental DDG values and, for each bin, estimate the probability that the
# calculated DDG (each repeat and the repeat average) has the same sign as the experimental DDG
def bin_sign_correctness(data, bin_size=1.0, nboots=1000):
    """
    Bin the absolute experimental DDG values and calculate the probability of getting the sign correct.

    Parameters:
    - data: DataFrame with 'Exp DDG (kcal/mol)', 'average DDG (kcal/mol)', 'repeat_0 DDG (kcal/mol)',
      'repeat_1 DDG (kcal/mol)' and 'repeat_2 DDG (kcal/mol)' columns. Rows without an experimental
      value are ignored.
    - bin_size: Width of the half-open [left, right) bins for the absolute experimental DDG values.
    - nboots: Number of bootstrap resamples (rows of a bin drawn with replacement) per bin.

    Returns:
    - DataFrame with one row per non-empty bin holding, for 'Average' and 'Repeat 1'..'Repeat 3',
      the bootstrap mean probability ('<name> Probability') and its 2.5/97.5 percentiles
      ('<name> Probability Lower' / '<name> Probability Upper'). Empty input gives an empty frame
      with these columns.

    Raises:
    - ValueError: if bin_size is not positive or nboots is smaller than 1.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")
    if nboots < 1:
        raise ValueError("nboots must be at least 1")

    # reported name -> column holding the calculated DDG (repeat columns are 0-indexed)
    series_columns = {
        'Average': 'average DDG (kcal/mol)',
        'Repeat 1': 'repeat_0 DDG (kcal/mol)',
        'Repeat 2': 'repeat_1 DDG (kcal/mol)',
        'Repeat 3': 'repeat_2 DDG (kcal/mol)',
    }
    series_names = list(series_columns)
    result_columns = ['bin'] + [
        f'{name} Probability{suffix}' for name in series_names for suffix in ('', ' Lower', ' Upper')
    ]

    temp_df = data[data['Exp DDG (kcal/mol)'].notna()].copy(deep=True).reset_index(drop=True)
    if temp_df.empty:
        return pd.DataFrame(columns=result_columns)

    # Create bins. The bins are closed on the left only, so the last edge must lie strictly above
    # the largest value; otherwise a maximum that is an exact multiple of bin_size would be unbinned.
    abs_exp_ddg = np.abs(temp_df['Exp DDG (kcal/mol)'])
    max_value = abs_exp_ddg.max()
    n_bins = int(np.floor(max_value / bin_size)) + 1
    bins = np.arange(n_bins + 1) * bin_size
    if bins[-1] <= max_value:  # guard against floating point rounding in the division
        bins = np.append(bins, bins[-1] + bin_size)

    # One boolean column per series: does the calculated sign match the experimental sign?
    # A missing calculated value compares unequal and is therefore counted as incorrect.
    exp_sign = np.sign(temp_df['Exp DDG (kcal/mol)'])
    correct = pd.DataFrame({
        name: np.sign(temp_df[column]) == exp_sign for name, column in series_columns.items()
    })
    correct['bin'] = pd.cut(abs_exp_ddg, bins=bins, right=False)

    results = []
    # sort the bins by the lower edge
    for b in sorted(correct['bin'].dropna().unique(), key=lambda interval: interval.left):
        subset = correct.loc[correct['bin'] == b, series_names]
        if len(subset) == 0:
            continue
        # Bootstrap the probabilities: every resample is shared by all series
        boot_probs = np.array([
            subset.sample(n=len(subset), replace=True).to_numpy().mean(axis=0) for _ in range(nboots)
        ])
        # Calculate mean and 95% CI for every series
        row = {'bin': b}
        for position, name in enumerate(series_names):
            series_probs = boot_probs[:, position]
            row[f'{name} Probability'] = np.mean(series_probs)
            row[f'{name} Probability Lower'] = np.percentile(series_probs, 2.5)
            row[f'{name} Probability Upper'] = np.percentile(series_probs, 97.5)
        results.append(row)
    return pd.DataFrame(results, columns=result_columns)
# Calculate the binned sign-correctness probabilities for the public pairwise differences (0.5 kcal/mol bins)
binned_probabilities = bin_sign_correctness(public_pariwise_diffs, bin_size=0.5)

In [None]:
# same binned sign-correctness probabilities for the private pairwise differences (0.5 kcal/mol bins)
binned_probabilities_private = bin_sign_correctness(private_pairwise_diffs, bin_size=0.5)

In [None]:
# Plot the public binned sign-correctness probabilities as dashed lines with markers and error bars
fig, ax = plt.subplots(figsize=(10, 6))
colours = sns.color_palette("rocket", 4)
ax.set_xlabel(r"|$\Delta\Delta$G$_{exp}$| (kcal/mol)", fontsize=14)
ax.set_ylabel(r"Probability Correct $\Delta\Delta$G$_{calc}$ Sign", fontsize=14)

# the x ticks sit on the left edge of each bin; every series is shifted slightly to the right of
# that edge so the markers and error bars of the four series do not overlap
ticks = np.array([interval.left for interval in binned_probabilities['bin'].unique()])

# markers joined by dashed lines: the three repeats first, then the average
marker_style = dict(linewidth=2, marker='o', markersize=7, markeredgecolor='black', linestyle='--')
for repeat_number in (1, 2, 3):
    repeat_colour = colours[repeat_number - 1]
    ax.plot(
        ticks + 0.1 * repeat_number,
        binned_probabilities[f'Repeat {repeat_number} Probability'],
        color=repeat_colour, label=f'Repeat {repeat_number}', markerfacecolor=repeat_colour, **marker_style
    )
ax.plot(
    ticks + 0.4,
    binned_probabilities['Average Probability'],
    color=colours[3], label='Average', markerfacecolor=colours[3], **marker_style
)

# add error bars for the probabilities, from the lower to the upper bootstrap bound
series_names = ('Repeat 1', 'Repeat 2', 'Repeat 3', 'Average')
series_shifts = (0.1, 0.2, 0.3, 0.4)
for series, shift, series_colour in zip(series_names, series_shifts, colours):
    centre = binned_probabilities[f'{series} Probability']
    below = centre - binned_probabilities[f'{series} Probability Lower']
    above = binned_probabilities[f'{series} Probability Upper'] - centre
    ax.errorbar(ticks + shift, centre, yerr=[below, above], fmt='none', color=series_colour, capsize=5, elinewidth=2)

ax.set_xticks(ticks)
ax.set_xlim((0, 7.5))
ax.legend(fontsize=12)
fig.tight_layout()
ax.set_ylim(bottom=0.5)

In [None]:
# Plot the private binned sign-correctness probabilities as dashed lines with markers and error bars
fig, ax = plt.subplots(figsize=(10, 6))
colours = sns.color_palette("rocket", 4)
ax.set_xlabel(r"|$\Delta\Delta$G$_{exp}$| (kcal/mol)", fontsize=14)
ax.set_ylabel(r"Probability Correct $\Delta\Delta$G$_{calc}$ Sign", fontsize=14)

# the x ticks sit on the left edge of each bin; every series is shifted slightly to the right of
# that edge so the markers and error bars of the four series do not overlap
ticks = np.array([interval.left for interval in binned_probabilities_private['bin'].unique()])

# markers joined by dashed lines: the three repeats first, then the average
marker_style = dict(linewidth=2, marker='o', markersize=7, markeredgecolor='black', linestyle='--')
for repeat_number in (1, 2, 3):
    repeat_colour = colours[repeat_number - 1]
    ax.plot(
        ticks + 0.1 * repeat_number,
        binned_probabilities_private[f'Repeat {repeat_number} Probability'],
        color=repeat_colour, label=f'Repeat {repeat_number}', markerfacecolor=repeat_colour, **marker_style
    )
ax.plot(
    ticks + 0.4,
    binned_probabilities_private['Average Probability'],
    color=colours[3], label='Average', markerfacecolor=colours[3], **marker_style
)

# add error bars for the probabilities, from the lower to the upper bootstrap bound
series_names = ('Repeat 1', 'Repeat 2', 'Repeat 3', 'Average')
series_shifts = (0.1, 0.2, 0.3, 0.4)
for series, shift, series_colour in zip(series_names, series_shifts, colours):
    centre = binned_probabilities_private[f'{series} Probability']
    below = centre - binned_probabilities_private[f'{series} Probability Lower']
    above = binned_probabilities_private[f'{series} Probability Upper'] - centre
    ax.errorbar(ticks + shift, centre, yerr=[below, above], fmt='none', color=series_colour, capsize=5, elinewidth=2)

ax.set_xticks(ticks)
ax.legend(fontsize=12)
fig.tight_layout()
ax.set_ylim(bottom=0.5)

In [None]:
# Random sampling reproducibility analysis
from cinnabar import FEMap
from openff.units import unit
from scipy.stats import kendalltau
def _hahn_weighted_kendall_tau(calculated_dgs, min_ligands=16, min_exp_range=3):
    """Ligand-count-weighted Kendall tau over the targets that meet the size and range criteria."""
    weighted_taus, weights = [], []
    for system in calculated_dgs["system group"].unique():
        system_df = calculated_dgs[calculated_dgs["system group"] == system]
        for target in system_df["system name"].unique():
            target_df = system_df[(system_df["system name"] == target) & (system_df["Exp DG (kcal/mol)"].notna())]
            exp_range = target_df["Exp DG (kcal/mol)"].max() - target_df["Exp DG (kcal/mol)"].min()
            if len(target_df) < min_ligands or exp_range < min_exp_range:
                continue
            tau, _ = kendalltau(target_df["repeat_DG (kcal/mol)"].values, target_df["Exp DG (kcal/mol)"].values)
            # add tau times the weight for this target
            weighted_taus.append(tau * len(target_df))
            weights.append(len(target_df))
    return np.sum(weighted_taus) / np.sum(weights)


def _pairwise_ddg_rmse(calculated_dgs):
    """RMSE between calculated and experimental DDGs over all ligand pairs within each target."""
    errors = []
    for system in calculated_dgs["system group"].unique():
        system_df = calculated_dgs[calculated_dgs["system group"] == system]
        for target in system_df["system name"].unique():
            target_df = system_df[(system_df["system name"] == target) & (system_df["Exp DG (kcal/mol)"].notna())]
            # one row per ligand (the first one), then every unordered pair via the upper triangle
            per_ligand = target_df.drop_duplicates(subset="ligand name")
            calc_dg = per_ligand["repeat_DG (kcal/mol)"].to_numpy(dtype=float)
            exp_dg = per_ligand["Exp DG (kcal/mol)"].to_numpy(dtype=float)
            first, second = np.triu_indices(len(per_ligand), k=1)
            errors.append((calc_dg[second] - calc_dg[first]) - (exp_dg[second] - exp_dg[first]))
    if not errors:
        return np.nan
    return np.sqrt(np.mean(np.concatenate(errors) ** 2))


def calculate_repeat_distribution(edge_df, dg_df, public=True, nboots=1000) -> tuple[np.ndarray, np.ndarray]:
    """Bootstrap the accuracy obtained when every edge leg comes from a single random repeat.

    For each bootstrap and each target, the complex and solvent DG of every edge are each taken from
    a randomly chosen repeat (0, 1 or 2), the edge DDGs are converted to absolute DGs with a cinnabar
    FEMap, and the DGs are shifted by the mean experimental DG of the target.

    Parameters
    ----------
    edge_df : edge DataFrame with the grouping columns, "ligand_A", "ligand_B" and the
        "complex_repeat_<n>_DG/dDG (kcal/mol)" and "solvent_repeat_<n>_DG/dDG (kcal/mol)" columns.
    dg_df : per-ligand DataFrame with the grouping columns, "ligand name", "Exp DG (kcal/mol)"
        and "Exp dDG (kcal/mol)".
    public : selects the grouping columns: "system group"/"system name" when True,
        "partner_id"/"dataset_name" otherwise.
    nboots : number of bootstrap iterations.

    Returns
    -------
    (pairwise_rmses, weighted_kendall_tau) : two arrays of length ``nboots`` with the all-pairs DDG
    RMSE and the ligand-count-weighted Kendall tau (targets with >= 16 ligands and an experimental
    range >= 3 kcal/mol only) of every bootstrap.
    """
    if public:
        group_name = "system group"
        system_name = "system name"
    else:
        group_name = "partner_id"
        system_name = "dataset_name"
    pairwise_rmses, weighted_kendall_tau = np.zeros(nboots), np.zeros(nboots)
    for i in range(nboots):
        print(f"Calculating repeat distribution for boot {i + 1} of {nboots}")
        calculated_dgs = []
        for system in edge_df[group_name].unique():
            # get the edges for this system
            system_df = edge_df[edge_df[group_name] == system].copy(deep=True).reset_index(drop=True)
            for target in system_df[system_name].unique():
                # get the edges for this target; the fresh 0..n-1 index is used to look up the sampled repeats
                target_df = system_df[(system_df[system_name] == target)].copy(deep=True).reset_index(drop=True)
                exp_target_data = dg_df[(dg_df[system_name] == target) & (dg_df[group_name] == system)].copy(deep=True).reset_index(drop=True)
                # the calculated absolute values are shifted by the mean experimental value
                exp_shift = exp_target_data["Exp DG (kcal/mol)"].mean()
                # independently pick a random repeat for the complex and solvent leg of every edge
                complex_repeat_index = np.random.choice([0, 1, 2], size=len(target_df), replace=True)
                solvent_repeat_index = np.random.choice([0, 1, 2], size=len(target_df), replace=True)

                # add the DDG estimates to the graph
                fe_map = FEMap()
                for index, row in target_df.iterrows():
                    complex_repeat_id = complex_repeat_index[index]
                    solvent_repeat_id = solvent_repeat_index[index]
                    complex_dg = row[f"complex_repeat_{complex_repeat_id}_DG (kcal/mol)"]
                    complex_error = row[f"complex_repeat_{complex_repeat_id}_dDG (kcal/mol)"]
                    solvent_dg = row[f"solvent_repeat_{solvent_repeat_id}_DG (kcal/mol)"]
                    solvent_error = row[f"solvent_repeat_{solvent_repeat_id}_dDG (kcal/mol)"]
                    uncertainty = (complex_error**2 + solvent_error**2) ** 0.5
                    # fall back to 0.1 kcal/mol when the combined error is missing, infinite or below 0.01
                    if not np.isfinite(uncertainty) or uncertainty < 0.01:
                        uncertainty = 0.1
                    fe_map.add_relative_calculation(
                        value=(complex_dg - solvent_dg) * unit.kilocalorie_per_mole,
                        uncertainty=uncertainty * unit.kilocalorie_per_mole,
                        labelA=row["ligand_A"],
                        labelB=row["ligand_B"],
                    )
                # calculate the absolute DG values and collect them per ligand
                fe_map.generate_absolute_values()
                abs_df = fe_map.get_absolute_dataframe()
                target_data = {}
                for _, abs_row in abs_df.iterrows():
                    ligand = abs_row["label"]
                    if ligand not in target_data:
                        target_data[ligand] = {"system group": system, "system name": target, "ligand name": ligand}
                        # add the exp data; ligands without it keep only their identifying keys
                        matches = exp_target_data[exp_target_data["ligand name"] == ligand]
                        if len(matches) == 0:
                            print(ligand, "not found in experimental data")
                            continue
                        avg_data = matches.iloc[0]
                        target_data[ligand]["Exp DG (kcal/mol)"] = avg_data["Exp DG (kcal/mol)"]
                        target_data[ligand]["Exp dDG (kcal/mol)"] = avg_data["Exp dDG (kcal/mol)"]
                    # add the sampled DG value and the uncertainty
                    target_data[ligand]["repeat_DG (kcal/mol)"] = abs_row["DG (kcal/mol)"] + exp_shift
                    target_data[ligand]["repeat_dDG (kcal/mol)"] = abs_row["uncertainty (kcal/mol)"]

                calculated_dgs.extend(list(target_data.values()))
        calculated_dgs = pd.DataFrame(calculated_dgs)

        # weighted kendall tau over the hahn systems and RMSE over all pairwise DDG differences
        weighted_kendall_tau[i] = _hahn_weighted_kendall_tau(calculated_dgs)
        pairwise_rmses[i] = _pairwise_ddg_rmse(calculated_dgs)
        print(f"Boot {i + 1} - Pairwise RMSE: {pairwise_rmses[i]:.3f}, Weighted Kendall Tau: {weighted_kendall_tau[i]:.3f}")

    return pairwise_rmses, weighted_kendall_tau

In [None]:
# bootstrap the pairwise RMSE and weighted Kendall tau of the public data using randomly selected single repeats
single_repeat_rmses, single_repeat_tau = calculate_repeat_distribution(normal_edge_data, public_dg_data, public=True)

In [None]:
# distribution of the bootstrapped pairwise RMSE values from randomly selected single repeats
sns.boxplot(single_repeat_rmses, color="#009384")

In [None]:
# distribution of the bootstrapped weighted Kendall tau values from randomly selected single repeats
sns.boxplot(single_repeat_tau, color="#009384")

In [None]:
# do the same again but use averages combining 3 randomly selected repeats for each edge
def _weighted_kendall_tau_from_dgs(calculated_dgs, value_column, min_ligands=16, min_exp_range=3):
    """Return the ligand-count-weighted mean Kendall tau between ``value_column`` and experiment.

    Each (system group, system name) target contributes its tau weighted by its number of
    ligands with experimental data. Targets with fewer than ``min_ligands`` such ligands, or
    whose experimental DG range is below ``min_exp_range``, are skipped. Returns NaN when the
    input is empty or no target qualifies.
    """
    if calculated_dgs.empty:
        return np.nan
    weighted_taus, weights = [], []
    for system in calculated_dgs["system group"].unique():
        system_df = calculated_dgs[calculated_dgs["system group"] == system]
        for target in system_df["system name"].unique():
            has_exp = system_df["Exp DG (kcal/mol)"].notna()
            target_df = system_df[(system_df["system name"] == target) & has_exp]
            exp_dg = target_df["Exp DG (kcal/mol)"]
            if len(target_df) < min_ligands or exp_dg.max() - exp_dg.min() < min_exp_range:
                continue
            tau, _ = kendalltau(target_df[value_column].values, exp_dg.values)
            # store tau already multiplied by its weight (the ligand count of the target)
            weighted_taus.append(tau * len(target_df))
            weights.append(len(target_df))
    if not weights:
        return np.nan
    return np.sum(weighted_taus) / np.sum(weights)


def _pairwise_rmse_from_dgs(calculated_dgs, value_column):
    """Return the RMSE between calculated and experimental all-to-all pairwise differences.

    Differences are formed within each (system group, system name) target over every unordered
    ligand pair that has experimental data, then pooled across targets. Returns NaN when there
    are no pairs.
    """
    if calculated_dgs.empty:
        return np.nan
    calc_diffs, exp_diffs = [], []
    for system in calculated_dgs["system group"].unique():
        system_df = calculated_dgs[calculated_dgs["system group"] == system]
        for target in system_df["system name"].unique():
            has_exp = system_df["Exp DG (kcal/mol)"].notna()
            # one row per ligand (first occurrence wins)
            target_df = system_df[(system_df["system name"] == target) & has_exp].drop_duplicates(
                subset="ligand name", keep="first"
            )
            calc_dg = target_df[value_column].to_numpy(dtype=float)
            exp_dg = target_df["Exp DG (kcal/mol)"].to_numpy(dtype=float)
            # all index pairs (a, b) with a < b, i.e. every unordered ligand pair once
            idx_a, idx_b = np.triu_indices(len(target_df), k=1)
            calc_diffs.append(calc_dg[idx_b] - calc_dg[idx_a])
            exp_diffs.append(exp_dg[idx_b] - exp_dg[idx_a])
    if not calc_diffs:
        return np.nan
    errors = np.concatenate(calc_diffs) - np.concatenate(exp_diffs)
    if errors.size == 0:
        return np.nan
    return np.sqrt(np.mean(errors ** 2))


def calculate_average_distribution(edge_df, dg_df, public=True, nboots=1000, rng=None) -> tuple[np.ndarray, np.ndarray]:
    """Bootstrap the pairwise RMSE and weighted Kendall tau using averaged repeats per edge.

    For every bootstrap iteration and every edge, three complex and three solvent repeat ids
    are drawn with replacement from {0, 1, 2}. The edge value is the difference of the mean
    complex and mean solvent DG of the drawn repeats. Per-target absolute values are obtained
    from an ``FEMap`` and compared with the experimental values in ``dg_df``.

    Parameters
    ----------
    edge_df : pd.DataFrame
        Edge results. Read columns: the group/system columns (see ``public``), ``ligand_A``,
        ``ligand_B`` and ``{complex,solvent}_repeat_{0,1,2}_{DG,dDG} (kcal/mol)``.
    dg_df : pd.DataFrame
        Per-ligand data. Read columns: the group/system columns, ``ligand name``,
        ``Exp DG (kcal/mol)`` and ``Exp dDG (kcal/mol)``.
    public : bool
        If True the group/system columns are ``system group``/``system name``, otherwise
        ``partner_id``/``dataset_name``.
    nboots : int
        Number of bootstrap iterations (default 1000, the previously hard-coded value).
    rng : optional
        Object providing ``choice(a, size=..., replace=...)``, e.g. ``np.random.default_rng(seed)``,
        for reproducible sampling. Defaults to the global ``np.random`` state.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(pairwise_rmses, weighted_kendall_tau)``, each of length ``nboots``. An entry is NaN
        when the corresponding metric could not be computed for that iteration.
    """
    if public:
        group_name, system_name = "system group", "system name"
    else:
        group_name, system_name = "partner_id", "dataset_name"
    sampler = np.random if rng is None else rng
    repeat_ids = [0, 1, 2]
    pairwise_rmses, weighted_kendall_tau = np.zeros(nboots), np.zeros(nboots)
    for i in range(nboots):
        print(f"Calculating average distribution for boot {i + 1} of {nboots}")
        calculated_dgs = []
        for system in edge_df[group_name].unique():
            # get the edges for this system
            system_df = edge_df[edge_df[group_name] == system]
            for target in system_df[system_name].unique():
                # get the edges and the experimental data for this target
                target_df = system_df[system_df[system_name] == target]
                exp_target_data = dg_df[(dg_df[system_name] == target) & (dg_df[group_name] == system)]
                # constant per-target offset; it does not change within-target differences or ranks
                exp_shift = exp_target_data["Exp DG (kcal/mol)"].mean()

                # add the DDG estimates to the graph
                fe_map = FEMap()
                for _, row in target_df.iterrows():
                    # randomly select 3 repeats (with replacement) for each leg
                    complex_repeat_ids = sampler.choice(repeat_ids, size=3, replace=True)
                    solvent_repeat_ids = sampler.choice(repeat_ids, size=3, replace=True)
                    complex_dg = np.mean([row[f"complex_repeat_{rid}_DG (kcal/mol)"] for rid in complex_repeat_ids])
                    solvent_dg = np.mean([row[f"solvent_repeat_{rid}_DG (kcal/mol)"] for rid in solvent_repeat_ids])
                    # NOTE(review): the spread is taken over the per-repeat dDG columns, not the
                    # DG columns - confirm this is intended and not a copy/paste slip.
                    complex_error = np.std([row[f"complex_repeat_{rid}_dDG (kcal/mol)"] for rid in complex_repeat_ids])
                    solvent_error = np.std([row[f"solvent_repeat_{rid}_dDG (kcal/mol)"] for rid in solvent_repeat_ids])
                    combined_error = (complex_error**2 + solvent_error**2) ** 0.5
                    # tiny (e.g. the same repeat drawn three times) or non-finite errors fall back to 0.1
                    if not np.isfinite(combined_error) or combined_error < 0.01:
                        combined_error = 0.1
                    fe_map.add_relative_calculation(
                        value=(complex_dg - solvent_dg) * unit.kilocalorie_per_mole,
                        uncertainty=combined_error * unit.kilocalorie_per_mole,
                        labelA=row["ligand_A"],
                        labelB=row["ligand_B"],
                    )
                # calculate the absolute DG values
                fe_map.generate_absolute_values()
                abs_df = fe_map.get_absolute_dataframe()

                # collect one record per ligand, keyed by label
                target_data = {}
                for _, abs_row in abs_df.iterrows():
                    label = abs_row["label"]
                    if label not in target_data:
                        exp_rows = exp_target_data[exp_target_data["ligand name"] == label]
                        if exp_rows.empty:
                            # keep the record with NaN experimental data; it is filtered out of the metrics
                            print(label, "not found in experimental data")
                            exp_dg, exp_ddg = np.nan, np.nan
                        else:
                            exp_dg = exp_rows.iloc[0]["Exp DG (kcal/mol)"]
                            exp_ddg = exp_rows.iloc[0]["Exp dDG (kcal/mol)"]
                        target_data[label] = {
                            "system group": system,
                            "system name": target,
                            "ligand name": label,
                            "Exp DG (kcal/mol)": exp_dg,
                            "Exp dDG (kcal/mol)": exp_ddg,
                        }
                    # add the averaged DG value and the uncertainty
                    target_data[label]["average_DG (kcal/mol)"] = abs_row["DG (kcal/mol)"] + exp_shift
                    target_data[label]["average_dDG (kcal/mol)"] = abs_row["uncertainty (kcal/mol)"]

                calculated_dgs.extend(target_data.values())
        calculated_dgs = pd.DataFrame(calculated_dgs)

        # weighted kendall tau over the targets with enough ligands and experimental range
        weighted_kendall_tau[i] = _weighted_kendall_tau_from_dgs(calculated_dgs, "average_DG (kcal/mol)")
        # RMSE of all pairwise DDG differences
        pairwise_rmses[i] = _pairwise_rmse_from_dgs(calculated_dgs, "average_DG (kcal/mol)")
        print(f"Boot {i + 1} - Pairwise RMSE: {pairwise_rmses[i]:.3f}, Weighted Kendall Tau: {weighted_kendall_tau[i]:.3f}")

    return pairwise_rmses, weighted_kendall_tau

In [None]:
# Bootstrap the same two metrics for the public dataset, this time averaging three randomly
# drawn repeats per edge.
public_average_rmses, public_average_tau = calculate_average_distribution(normal_edge_data, public_dg_data, public=True)

In [None]:
# Repeat both bootstraps for the private dataset. With public=False,
# calculate_average_distribution groups by "partner_id" / "dataset_name"
# (presumably calculate_repeat_distribution does the same - TODO confirm).
private_average_rmses, private_average_tau = calculate_average_distribution(private_edge_data, private_dg_data, public=False)
private_repeat_rmses, private_repeat_tau = calculate_repeat_distribution(private_edge_data, private_dg_data, public=False)

In [None]:
# make box plots of the bootstrapped pairwise RMSE and weighted Kendall tau for the public and
# private datasets, comparing the single-repeat and averaged-repeat methods
# NOTE: the 'RMSD' column holds the pairwise RMSE values; the column name is kept unchanged so
# that any later use of all_data keeps working, and the axis is labelled explicitly instead.
all_data = pd.DataFrame({
    'RMSD': np.concatenate([single_repeat_rmses, public_average_rmses, private_repeat_rmses, private_average_rmses]),
    'Method': ['Single'] * len(single_repeat_rmses) + ['Average'] * len(public_average_rmses) +
              ['Single'] * len(private_repeat_rmses) + ['Average'] * len(private_average_rmses),
    'Dataset': ['Public'] * (len(single_repeat_rmses) + len(public_average_rmses)) +
               ['Private'] * (len(private_repeat_rmses) + len(private_average_rmses)),
    "Kendall Tau": np.concatenate([single_repeat_tau, public_average_tau, private_repeat_tau, private_average_tau])
})

# Plot both metrics side by side with independent y axes
fig, axs = plt.subplots(1, 2, figsize=(12, 6), sharey=False)
sns.boxplot(data=all_data, x="Dataset", hue="Method", y='RMSD', ax=axs[0], palette=["#009384", "#F2A900"])
sns.boxplot(data=all_data, x="Dataset", hue="Method", y='Kendall Tau', ax=axs[1], palette=["#009384", "#F2A900"])
# the values are RMSEs of pairwise differences, so label the axis accordingly
axs[0].set(ylabel="Pairwise RMSE (kcal/mol)", title="Pairwise RMSE")
axs[1].set(ylabel="Weighted Kendall Tau", title="Weighted Kendall Tau")
fig.tight_layout()


In [None]:
# Two-sided Wilcoxon signed-rank test on the public pairwise RMSE: single repeat vs averaged repeats.
# NOTE(review): wilcoxon treats entries at the same index as a pair, but the two arrays come
# from separate bootstrap calls - confirm that a paired test is the intended comparison.
from scipy.stats import wilcoxon
stat, p = wilcoxon(single_repeat_rmses, public_average_rmses)
print(f"Wilcoxon test for pairwise RMSD: stat={stat:.3f}, p-value={p:.10f}")

In [None]:
# Two-sided Wilcoxon signed-rank test on the public weighted Kendall tau: single repeat vs averaged repeats.
# NOTE(review): the same pairing caveat applies - the arrays come from separate bootstrap calls.
stat, p = wilcoxon(single_repeat_tau, public_average_tau)
print(f"Wilcoxon test for weighted Kendall Tau: stat={stat:.3f}, p-value={p:.10f}")

In [None]:
# One-sided test on the index-wise RMSE difference (single minus averaged):
# alternative="greater" tests whether the single-repeat RMSE tends to be larger.
rmse_diff = single_repeat_rmses - public_average_rmses
stat, p = wilcoxon(rmse_diff, alternative="greater")  # if you expect single > triplicate
print(f"Wilcoxon p = {p:.4f}")

In [None]:
# Effect size of the RMSE difference (single minus averaged): median, mean and standard deviation.
np.median(rmse_diff), np.mean(rmse_diff), np.std(rmse_diff)

In [None]:
# One-sided test on the index-wise Kendall tau difference (single minus averaged):
# alternative="less" tests whether the single-repeat tau tends to be smaller.
tau_diff = single_repeat_tau - public_average_tau
stat, p = wilcoxon(tau_diff, alternative="less")  # if you expect single < triplicate
print(f"Wilcoxon p = {p:.10f}")

In [None]:
# Effect size of the Kendall tau difference (single minus averaged): median, mean and standard deviation.
np.median(tau_diff), np.mean(tau_diff), np.std(tau_diff)

In [None]:
# Scratch check of the resampling scheme used in calculate_average_distribution: draw three
# repeat ids with replacement and report when all three are identical (3 of the 27 possible
# draws, i.e. 1 in 9). Such a draw averages a single repeat three times.
x = np.random.choice([0, 1, 2], size=3, replace=True)
if x[0] == x[1] and x[1] == x[2]:
    print("All repeats are the same")