Generate accuracy and correlation metrics for each of the public and private systems using cinnabar and write them to CSV files.
The metrics are calculated on the MLE-derived DG values; the all-to-all pairwise RMSE on the DDG values is also included.

In [None]:
import pandas as pd
from cinnabar import stats

In [None]:
# Read the processed DG tables (public, pfkfb3 rerun, private) from the
# IndustryBenchmarks2024 repository.
public_dg_data = pd.read_csv(
    "https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/combined_pymbar3_calculated_dg_data.csv"
)
rerun_dg_data = pd.read_csv(
    "https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/processed_results/reruns/rerun_pymbar3_calculated_dg_data.csv"
)
private_dg_data = pd.read_csv(
    "https://raw.githubusercontent.com/OpenFreeEnergy/IndustryBenchmarks2024/refs/heads/main/industry_benchmarks/analysis/private_processed_results/combined_pymbar3_calculated_dg_data.csv"
)
# Drop the original pfkfb3 rows from the public table and append the rows of
# the rerun table in their place.
public_dg_data = pd.concat(
    [public_dg_data.loc[public_dg_data["system name"] != "pfkfb3"], rerun_dg_data],
    ignore_index=True,
)

In [None]:
# Known problem ligands in the private datasets, keyed by
# (partner id, dataset name) and grouped by the kind of issue.
issue_ligands = {
    ("GSK", "GSK_PRIVATE_1"): {
        "tautomer": ["ligand23", "ligand18"],
        "enantiomer": ["ligand9"],
    },
    ("Janssen", "systemD_set1"): {
        "tautomer": ["ligand36", "ligand1"],
        "enantiomer": ["ligand54"],
        "assay": [
            "ligand44",
            "ligand23",
            "ligand45",
            "ligand25",
            "ligand18",
            "ligand19",
            "ligand32",
        ],
    },
    ("Merck", "merck_private_T"): {
        "enantiomer": ["ligand30"],
        "assay": ["ligand9", "ligand31", "ligand14", "ligand6"],
    },
    ("EliLilly", "Project0"): {
        "conformers": ["ligand1"],
    },
    ("Janssen", "systemB_set1"): {
        "enantiomer": [
            "ligand0",
            "ligand7",
            "ligand4",
            "ligand10",
            "ligand17",
            "ligand18",
        ],
    },
    ("Merck", "merck_private_G"): {
        "conformers": ["ligand23"],
    },
    ("Roche", "target_A"): {
        "enantiomer": ["ligand0"],
        "assay": [
            "ligand3",
            "ligand15",
            "ligand14",
            "ligand23",
            "ligand11",
            "ligand18",
            "ligand13",
            "ligand17",
            "ligand25",
            "ligand10",
        ],
    },
    ("Roche", "target_B"): {
        "assay": ["ligand3", "ligand4"],
    },
    ("Roche", "target_C_set1"): {
        "conformers": ["ligand16"],
        "assay": ["ligand17", "ligand18"],
    },
    ("Roche", "target_C_set5"): {
        "tautomer": ["ligand12", "ligand6"],
        "enantiomer": ["ligand33", "ligand7"],
    },
    ("Roche", "target_D"): {
        "enantiomer": ["ligand9"],
        "assay": ["ligand3", "ligand5"],
    },
}


In [None]:
def _apply_ligand_issues(target_df, issues, system, target):
    """Remove or adjust the ligands listed in ``issues`` for a single target.

    Issue types whose name contains ``"assay"`` cap the predicted DG at the
    experimental value; every other issue type removes the listed ligands.
    Returns the (possibly new) data frame.
    """
    for issue_type, issue_list in issues.items():
        if "assay" not in issue_type:
            # filter out the issues
            print(f"Filtering out issues for {system} {target}: {issue_type} {issue_list}")
            target_df = target_df[~target_df["ligand name"].isin(issue_list)].reset_index(drop=True)
            continue

        print("Fixing assay issues for", system, target, issue_type, issue_list)
        # We want to adjust the predicted values based on the following logic:
        # if the assay limit is the lower detection limit (larger DG value) and
        # we predict a value above that, set the predicted value to the assay
        # limit, as the prediction is qualitatively correct and should not be
        # penalized. If we predict a more potent ligand, keep the prediction.
        for ligand in issue_list:
            ligand_rows = target_df["ligand name"] == ligand
            if not ligand_rows.any():
                # The ligand is not in this target (e.g. it was removed by an
                # earlier filter); skip it, mirroring how the filter branch
                # ignores unknown names instead of raising.
                print(f"Skipping assay fix for {ligand}: not present in {system} {target}")
                continue
            predicted = target_df.loc[ligand_rows, "DG (kcal/mol)"].values[0]
            assay_limit = target_df.loc[ligand_rows, "Exp DG (kcal/mol)"].values[0]
            if predicted > assay_limit:
                # set the predicted value to the assay limit
                target_df.loc[ligand_rows, "DG (kcal/mol)"] = assay_limit
    return target_df


def _pairwise_differences(values):
    """Return ``values[j] - values[i]`` for every pair ``i < j``, ordered by ``i`` then ``j``."""
    return [
        later - earlier
        for position, earlier in enumerate(values)
        for later in values[position + 1:]
    ]


def calculate_statistics(dg_data, public: bool = True, issue_table=None) -> pd.DataFrame:
    """
    Calculate per-target statistics for the given DG data.

    For every (group, target) pair found in ``dg_data`` the bootstrapped RAE,
    RMSE and MUE of the predicted DG values against experiment are computed
    with ``cinnabar.stats.bootstrap_statistic`` (1000 bootstraps, 95% CI).
    Targets flagged as "Hahn system" (at least 16 ligands and an experimental
    DG range of at least 3.0) additionally get R2, KTAU and rho. The RMSE over
    all-to-all pairwise DDG values is always included.

    Parameters
    ----------
    dg_data : pandas.DataFrame
        Table with the columns "ligand name", "DG (kcal/mol)",
        "Exp DG (kcal/mol)", "Exp dDG (kcal/mol)" and "uncertainty (kcal/mol)",
        plus the grouping columns selected by ``public``.
    public : bool, default True
        If True, group by the "system group" / "system name" columns;
        otherwise group by "partner_id" / "dataset_name".
    issue_table : dict, optional
        Mapping of ``(group, target)`` to ``{issue type: [ligand names]}``.
        Only used when ``public`` is False. Ligands under an issue type
        containing "assay" have their predicted DG capped at the experimental
        value when the prediction is larger; ligands under any other issue
        type are removed before the statistics are calculated.

    Returns
    -------
    pandas.DataFrame
        One row per target with the ligand count, experimental DG range, the
        "Hahn system" flag and, for each statistic, its value together with
        the "lower" and "upper" bounds reported by the bootstrap.
    """
    if public:
        system_group, system_name = "system group", "system name"
    else:
        system_group, system_name = "partner_id", "dataset_name"

    all_systems_data = []
    for system in dg_data[system_group].unique():
        system_df = dg_data[dg_data[system_group] == system].copy(deep=True).reset_index(drop=True)
        for target in system_df[system_name].unique():
            print(f"Calculating statistics for system: {system}, target: {target}")
            target_df = system_df[system_df[system_name] == target].copy(deep=True).reset_index(drop=True)
            # check if we need to filter out any issues
            if issue_table is not None and not public:
                issues = issue_table.get((system, target), {})
                target_df = _apply_ligand_issues(target_df, issues, system, target)

            n_ligands = len(target_df)
            dg_range = abs(target_df["Exp DG (kcal/mol)"].max() - target_df["Exp DG (kcal/mol)"].min())
            hahn_system = bool(n_ligands >= 16 and dg_range >= 3.0)
            system_data = {
                "system group": system,
                "system name": target,
                "N_ligands": n_ligands,
                "DG range": dg_range,
                "Hahn system": hahn_system,
            }
            metrics = ["RAE", "RMSE", "MUE"]
            if hahn_system:
                # correlation metrics are only reported for the larger,
                # wider-range targets
                metrics += ["R2", "KTAU", "rho"]

            for metric in metrics:
                s = stats.bootstrap_statistic(
                    y_true=target_df["Exp DG (kcal/mol)"].values,
                    y_pred=target_df["DG (kcal/mol)"].values,
                    dy_true=target_df["Exp dDG (kcal/mol)"].values,
                    dy_pred=target_df["uncertainty (kcal/mol)"].values,
                    statistic=metric,
                    nbootstrap=1000,
                    ci=0.95,
                )
                system_data[metric] = s["mle"]
                system_data[f"{metric} lower"] = s["low"]
                system_data[f"{metric} upper"] = s["high"]

            # calculate the all-to-all pairwise DDG values, using the first
            # row of each ligand and a single pass instead of re-filtering the
            # data frame for every pair
            per_ligand = target_df.drop_duplicates(subset="ligand name", keep="first")
            exp_values = _pairwise_differences(per_ligand["Exp DG (kcal/mol)"].values)
            calculated_values = _pairwise_differences(per_ligand["DG (kcal/mol)"].values)

            s = stats.bootstrap_statistic(
                y_true=exp_values,
                y_pred=calculated_values,
                dy_true=None,
                dy_pred=None,
                statistic="RMSE",
                nbootstrap=1000,
                ci=0.95,
            )
            # NOTE: the "all-toall" spelling is kept on purpose; these keys are
            # the column names of the CSV files written from this table.
            system_data["all-toall DDG RMSE"] = s["mle"]
            system_data["all-toall DDG RMSE lower"] = s["low"]
            system_data["all-toall DDG RMSE upper"] = s["high"]
            all_systems_data.append(system_data)
    return pd.DataFrame(all_systems_data)
                

In [None]:
# compute the per-target statistics for the public datasets (no issue table is applied)
public_statistics = calculate_statistics(public_dg_data)

In [None]:
# write the public statistics to a CSV file, without the index column
public_statistics.to_csv("public_dataset_statistics.csv", index=False)

In [None]:
# compute the per-target statistics for the private datasets on the unfiltered data
# (public=False selects the "partner_id" / "dataset_name" grouping columns)
private_statistics = calculate_statistics(private_dg_data, public=False)

In [None]:
# display the unfiltered private statistics table
private_statistics

In [None]:
# write the unfiltered private statistics to a CSV file, without the index column
private_statistics.to_csv("private_dataset_statistics.csv", index=False)

We need to clean up the private datasets and create another version of the statistics in which ligands with conformer, stereo and tautomer issues are removed.
Assay limit issues are instead integrated into the error calculation. The `issue_ligands` table defined earlier in this notebook lists the ligands affected by each issue.

In [None]:
# recompute the private statistics with the issue table applied: ligands listed
# under non-assay issues are removed and assay-limit predictions are capped
private_statistics_filtered = calculate_statistics(private_dg_data, public=False, issue_table=issue_ligands)

In [None]:
# write the filtered private statistics to a CSV file, without the index column
private_statistics_filtered.to_csv("private_dataset_statistics_filtered.csv", index=False)