## Generate benchmarking figures

#### Generate legends

In [3]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import font_manager as fm
from matplotlib.patches import Patch
from pathlib import Path

# Register the bundled Aptos font file with matplotlib's font manager and make
# its family name the default for every figure drawn after this point.
font_path = "../resources/fonts/Aptos.ttf"
fm.fontManager.addfont(font_path)
prop = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = prop.get_name()

def save_legend(boxed: bool = True, show_title: bool = True, out_path=None):
    """Render the category colour legend on its own and save it as a PNG.

    Args:
        boxed: Draw a black frame around the legend.
        show_title: Add a "Category" title above the entries.
        out_path: Destination file (str or Path). Defaults to
            ``../results/figures/benchmarks/legend_vertical_box.png``, the
            location used before this parameter existed. Pass a different
            path to keep boxed and unboxed variants side by side.

    Side effects:
        Creates the parent directory if needed, writes the image at 300 dpi,
        closes the figure and prints the destination path.
    """
    color_map = {
        "Functional-trained": "#000080",
        "Clinical-trained (meta)": "#b21e35",
        "Clinical-trained (single)": "goldenrod",
        "Population-free": "#98e5a5",
        "Population-tuned": "#23a4a6",
    }

    if out_path is None:
        target = Path("../results/figures/benchmarks/legend_vertical_box.png")
    else:
        target = Path(out_path)

    # One filled swatch per category, in the insertion order of color_map.
    handles = [
        Patch(facecolor=colour, edgecolor='black', linewidth=0.4, label=category)
        for category, colour in color_map.items()
    ]

    # The axes only hosts the legend, so hide everything else.
    fig, ax = plt.subplots(figsize=(2.2, 1))
    ax.axis("off")
    legend = ax.legend(
        handles=handles,
        loc="center",
        fontsize=8,
        title="Category" if show_title else None,
        title_fontsize=9,
        ncol=1,
        frameon=boxed,
        edgecolor="black" if boxed else None,
        labelspacing=0.5,
        borderpad=0.6,
        handlelength=1.2,
        handletextpad=0.6,
        borderaxespad=0.0
    )
    if show_title:
        # Manual nudge of the title text relative to its default placement.
        legend.get_title().set_position((0, 4))

    target.parent.mkdir(parents=True, exist_ok=True)
    # Lay out this figure explicitly rather than whatever pyplot considers current.
    fig.tight_layout()
    fig.savefig(target, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved legend to: {target}")


In [4]:
# Unboxed, untitled legend variant. The file is still written to
# .../legend_vertical_box.png even though boxed=False.
save_legend(boxed=False, show_title=False)

Saved legend to: ..\results\figures\benchmarks\legend_vertical_box.png


#### Generate AUC bar plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from matplotlib.patches import Patch
from matplotlib import font_manager as fm
import matplotlib as mpl
from pathlib import Path

# Register the bundled Aptos font file with matplotlib's font manager and make
# its family name the default for every figure drawn after this point.
font_path = "../resources/fonts/Aptos.ttf"
fm.fontManager.addfont(font_path)
prop = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = prop.get_name()

def generate_benchmark_figure(
    dataset_name="functional",
    include_legend=True,
    remove_clinvep=True,
    len_x=None,
    filter_glm=False,
    linewidth=0.5,
    gap_frac=0.25,
    outer_pad_frac=1,
    bar_height=0.3
):
    """Plot one horizontal bar per tool for a benchmark dataset and save it as a PNG.

    Scores are read from a tab-separated file under ``../results/benchmarking``
    and tool categories from ``../resources/feature_lists/all_columns.txt``.
    The figure is written to
    ``../results/figures/benchmarks/{dataset_name}_auc_plot.png``.

    Args:
        dataset_name: Dataset key. Names starting with ``"mave_"`` read
            ``mave_<suffix>_correlation.txt`` and plot the ``Mean_R2`` column
            starting at 0; any other name reads ``<dataset_name>.txt`` and
            plots the ``AUC`` column starting at 0.5.
        include_legend: Draw the category colour legend in the lower right.
        remove_clinvep: Drop tools whose name starts with ``"ClinVEP_"``.
        len_x: Upper x-axis limit; defaults to 0.35 for MAVE data and 1.0
            otherwise.
        filter_glm: Keep only the top-scoring ``glm_*`` tool. For the
            ``clinical``/``functional`` pair, the top ``glm_*`` tool of the
            other dataset is kept as well.
        linewidth: Edge width of the bars and of the legend patches.
        gap_frac: Fraction of a 0.5 reference gap left between bars (the gap
            is never smaller than 0.05).
        outer_pad_frac: Padding beyond the first and last bar position, as a
            multiple of ``bar_height``.
        bar_height: Bar thickness in y-axis data units.

    Raises:
        ValueError: If no tool is left to plot after filtering.
    """
    if dataset_name.startswith("mave_"):
        score_col = "Mean_R2"
        correlation_type = dataset_name.replace("mave_", "")
        data_file = f"../results/benchmarking/mave_{correlation_type}_correlation.txt"
        x_label = "Mean R²"
        plot_title = "Mean R² across MAVE studies"
        default_len_x = 0.35
        x_origin = 0.0
    else:
        score_col = "AUC"
        data_file = f"../results/benchmarking/{dataset_name}.txt"
        x_label = "AUC"
        default_len_x = 1.0
        x_origin = 0.5  # the AUC axis starts at 0.5, so bars are drawn from there
        if dataset_name == "cancer":
            plot_title = "Performance on cancer hotspots"
        elif dataset_name in ["clinical", "functional"]:
            plot_title = f"Performance on {dataset_name} data"
        else:
            plot_title = f"Performance on {dataset_name.upper()} variants"

    if len_x is None:
        len_x = default_len_x
    features_file = "../resources/feature_lists/all_columns.txt"
    output_figure_path = f"../results/figures/benchmarks/{dataset_name}_auc_plot.png"

    df = pd.read_csv(data_file, sep="\t")
    features_df = pd.read_csv(features_file, sep="\t").rename(columns={"Name": "Tool"})

    # Rows categorised as "Variant Effect Predictor" are never plotted.
    vep_tools = features_df.loc[
        features_df["Category"] == "Variant Effect Predictor", "Tool"
    ].unique()
    df = df[~df["Tool"].isin(vep_tools)].copy()

    if filter_glm:
        is_glm = df["Tool"].str.startswith("glm_")
        top_glm_frames = [df[is_glm].nlargest(1, score_col)]
        if dataset_name in ["clinical", "functional"]:
            # Also keep the glm model that ranks first on the sibling dataset.
            other = "functional" if dataset_name == "clinical" else "clinical"
            other_df = pd.read_csv(f"../results/benchmarking/{other}.txt", sep="\t")
            top_glm_frames.append(
                other_df[other_df["Tool"].str.startswith("glm_")].nlargest(1, score_col)
            )
        glm_to_keep = pd.concat(top_glm_frames)["Tool"].unique()
        df = df[~is_glm | df["Tool"].isin(glm_to_keep)].copy()

    df = df.merge(features_df[["Tool", "Category"]], on="Tool", how="left")

    if remove_clinvep:
        df = df[~df["Tool"].str.startswith("ClinVEP_")].copy()

    def clean_tool_name(tool):
        # Strip the "glm_" prefix and "_score" suffix, then turn underscore runs into "-".
        tool = re.sub(r"^glm_", "", tool)
        tool = re.sub(r"_score$", "", tool)
        return tool.replace("___", "-").replace("__", "-").replace("_", "-")

    df["Tool"] = df["Tool"].apply(clean_tool_name)

    category_rename = {
        "Functional-Trained": "Functional-trained",
        "Clinical-Trained Meta Predictor": "Clinical-trained (meta)",
        "Clinical-Trained Single Predictor": "Clinical-trained (single)",
        "Population-Free": "Population-free",
        "Population-Tuned": "Population-tuned"
    }
    df["Category"] = df["Category"].map(category_rename)

    color_map = {
        "Functional-trained": "#000080",
        "Clinical-trained (meta)": "#b21e35",
        "Clinical-trained (single)": "goldenrod",
        "Population-free": "#98e5a5",
        "Population-tuned": "#23a4a6",
    }
    # Tools whose category is missing or not in category_rename get a white bar.
    df["Color"] = df["Category"].map(color_map).fillna("white")

    # Ascending sort puts the best-scoring tool at the top of the horizontal chart.
    df = df.sort_values(score_col, ascending=True).reset_index(drop=True)
    n_bars = len(df)
    if n_bars == 0:
        raise ValueError(f"No tools left to plot for dataset '{dataset_name}'.")

    gap_ref = 0.5
    gap = max(0.05, gap_ref * gap_frac)
    step = bar_height + gap
    y_pos = np.arange(n_bars) * step
    outer_pad = bar_height * outer_pad_frac

    # NOTE: this changes the global default font size, not just this figure's.
    plt.rcParams.update({'font.size': 5.5})

    fig_height = max(2.0, 0.28 * n_bars * step)
    fig, ax = plt.subplots(figsize=(6, fig_height))

    ax.barh(
        y=y_pos,
        width=df[score_col] - x_origin,
        left=x_origin,
        color=df["Color"],
        height=bar_height,
        edgecolor='black',
        linewidth=linewidth
    )

    ax.set_yticks(y_pos)
    ax.set_yticklabels(df["Tool"])
    ax.set_ylim(y_pos.min() - outer_pad, y_pos.max() + outer_pad)
    ax.set_xlim([x_origin, len_x])
    ax.set_xlabel(x_label, fontsize=7)
    ax.set_title(plot_title, fontsize=9, loc="center", pad=10)

    if include_legend:
        handles = [
            Patch(facecolor=colour, label=category, edgecolor='black', linewidth=linewidth)
            for category, colour in color_map.items()
        ]
        ax.legend(
            handles=handles,
            loc="lower right",
            fontsize=8,
            title_fontsize=6.5,
            frameon=True,
            edgecolor="gray"
        )

    plt.tight_layout()
    # The path is kept as a str for the log line; wrap it in Path to reach .parent
    # (calling .parent on the str itself raised AttributeError).
    Path(output_figure_path).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_figure_path, dpi=300)
    plt.close(fig)
    print(f"Figure saved to: {output_figure_path}")


In [4]:
# One full bar chart per benchmark dataset; len_x sets the upper x-axis limit
# for that dataset (the function's default is used where it is omitted).
generate_benchmark_figure("clinical", include_legend=True, filter_glm=False, len_x = 1.0)
generate_benchmark_figure("functional", include_legend=True, filter_glm=False)
generate_benchmark_figure("dd", include_legend=True, len_x=0.82)
generate_benchmark_figure("ndd", include_legend=True, len_x=0.66)
generate_benchmark_figure("cancer", include_legend=True, len_x=0.95)
generate_benchmark_figure("mave_gam", include_legend=True, filter_glm=False, len_x=0.27)

Figure saved to: ../results/figures/benchmarks/clinical_auc_plot.png
Figure saved to: ../results/figures/benchmarks/functional_auc_plot.png
Figure saved to: ../results/figures/benchmarks/dd_auc_plot.png
Figure saved to: ../results/figures/benchmarks/ndd_auc_plot.png
Figure saved to: ../results/figures/benchmarks/cancer_auc_plot.png
Figure saved to: ../results/figures/benchmarks/mave_gam_auc_plot.png


#### Generate filtered AUC bar plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from matplotlib.patches import Patch
from matplotlib import font_manager as fm
from pathlib import Path

# This cell sets mpl.rcParams but did not import matplotlib as mpl itself, so it
# only worked when an earlier cell had already run.
import matplotlib as mpl

# Register the bundled Aptos font file with matplotlib's font manager and make
# its family name the default for every figure drawn after this point.
font_path = "../resources/fonts/Aptos.ttf"
fm.fontManager.addfont(font_path)
prop = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = prop.get_name()

def generate_filtered_benchmark_figure(
    dataset_name="functional",
    bold_keep_tools=False,
    remove_clinvep=True,
    len_x=None,
    linewidth=1,
    gap_frac=0.25,
    fig_height_per_bar=0.6,
    outer_pad_frac = 0.5,
    tools_per_category=3,
):
    """Plot a reduced benchmark bar chart: fixed tools plus the best few per category.

    The FuncVEP/ClinVEP tools listed in ``always_keep`` are always shown (when
    present in the data), together with the ``tools_per_category``
    highest-scoring remaining tools of each category. The figure is written to
    ``../results/figures/benchmarks/{dataset_name}_auc_filtered_plot.png``.

    Args:
        dataset_name: Dataset key. Names starting with ``"mave_"`` read
            ``mave_<suffix>_correlation.txt`` and plot ``Mean_R2`` from 0; any
            other name reads ``<dataset_name>.txt`` and plots ``AUC`` from 0.5.
        bold_keep_tools: Render the tick labels of the always-kept tools in bold.
        remove_clinvep: Drop tools whose name starts with ``"ClinVEP_"``. This
            runs before the always-keep selection, so with the default ``True``
            the ClinVEP entries of ``always_keep`` are not plotted.
        len_x: Upper x-axis limit; defaults to 0.35 for MAVE data and 1.0
            otherwise.
        linewidth: Edge width of the bars.
        gap_frac: Fraction of the 0.5 reference gap left between bars.
        fig_height_per_bar: Figure height in inches contributed by each bar
            (scaled by the bar step; the figure is at least 2 inches tall).
        outer_pad_frac: Padding beyond the outermost bar edges, as a multiple
            of the bar height.
        tools_per_category: Number of top tools kept per category.

    Raises:
        ValueError: If no tool is left to plot after filtering.
    """
    if dataset_name.startswith("mave_"):
        score_col = "Mean_R2"
        correlation_type = dataset_name.replace("mave_", "")
        data_file = f"../results/benchmarking/mave_{correlation_type}_correlation.txt"
        x_label = "Mean R²"
        plot_title = "Mean R² across MAVE studies"
        default_len_x = 0.35
    else:
        score_col = "AUC"
        data_file = f"../results/benchmarking/{dataset_name}.txt"
        x_label = "AUC"
        default_len_x = 1.0
        if dataset_name == "cancer":
            plot_title = "Performance on cancer hotspots"
        elif dataset_name in ["clinical", "functional"]:
            plot_title = f"Performance on {dataset_name} data"
        else:
            plot_title = f"Performance on {dataset_name.upper()} variants"

    # AUC bars are drawn from 0.5, R² bars from 0.
    x_origin = 0.5 if score_col == "AUC" else 0.0

    if len_x is None:
        len_x = default_len_x
    features_file = "../resources/feature_lists/all_columns.txt"
    output_figure_path = f"../results/figures/benchmarks/{dataset_name}_auc_filtered_plot.png"

    df = pd.read_csv(data_file, sep="\t")
    features_df = pd.read_csv(features_file, sep="\t").rename(columns={"Name": "Tool"})
    df = df.merge(features_df[["Tool", "Category"]], on="Tool", how="left")

    if remove_clinvep:
        df = df[~df["Tool"].str.startswith("ClinVEP_")].copy()

    always_keep = {"FuncVEP_CTI", "FuncVEP_CTE", "FuncVEP_SP", "ClinVEP_CTI", "ClinVEP_CTE", "ClinVEP_SP"}
    is_pinned = df["Tool"].isin(always_keep)

    # Highest-scoring non-pinned tools within each category.
    best_per_category = (
        df[~is_pinned]
        .sort_values(score_col, ascending=False)
        .groupby("Category")
        .head(tools_per_category)
    )

    selected_df = pd.concat([df[is_pinned], best_per_category]).drop_duplicates(subset="Tool")

    def clean_tool_name(tool):
        # Strip the "glm_" prefix and "_score" suffix, then turn underscore runs into "-".
        tool = re.sub(r"^glm_", "", tool)
        tool = re.sub(r"_score$", "", tool)
        return tool.replace("___", "-").replace("__", "-").replace("_", "-")

    selected_df["Cleaned_Tool"] = selected_df["Tool"].apply(clean_tool_name)

    category_rename = {
        "Functional-Trained": "Functional-trained",
        "Clinical-Trained Meta Predictor": "Clinical-trained (meta)",
        "Clinical-Trained Single Predictor": "Clinical-trained (single)",
        "Population-Free": "Population-free",
        "Population-Tuned": "Population-tuned"
    }
    selected_df["Category"] = selected_df["Category"].map(category_rename)

    color_map = {
        "Functional-trained": "#000080",
        "Clinical-trained (meta)": "#b21e35",
        "Clinical-trained (single)": "goldenrod",
        "Population-free": "#98e5a5",
        "Population-tuned": "#23a4a6",
    }
    # Tools whose category is missing or not in category_rename get a white bar.
    selected_df["Color"] = selected_df["Category"].map(color_map).fillna("white")

    # Ascending sort puts the best-scoring tool at the top of the horizontal chart.
    selected_df = selected_df.sort_values(score_col, ascending=True).reset_index(drop=True)
    n_bars = len(selected_df)
    if n_bars == 0:
        raise ValueError(f"No tools left to plot for dataset '{dataset_name}'.")

    # Bars are 0.5 thick; gap_frac scales the remaining 0.5 of a unit slot.
    bar_height = 0.5
    gap = (1.0 - bar_height) * gap_frac
    step = bar_height + gap
    y_pos = np.arange(n_bars) * step

    fig_height = max(2.0, fig_height_per_bar * n_bars * step)
    fig, ax = plt.subplots(figsize=(6, fig_height))

    ax.barh(
        y=y_pos,
        width=selected_df[score_col] - x_origin,
        left=x_origin,
        color=selected_df["Color"],
        height=bar_height,
        edgecolor='black',
        linewidth=linewidth,
    )

    ax.set_yticks(y_pos)
    ax.set_yticklabels(selected_df["Cleaned_Tool"])
    ax.set_title(plot_title, fontsize=16, pad=20)
    ax.set_xlabel(x_label, fontsize=14)
    ax.tick_params(axis='y', labelsize=12)
    ax.tick_params(axis='x', labelsize=14)

    # Pad from the outer bar edges (centre ± half a bar), not from the centres.
    outer_pad = bar_height * outer_pad_frac
    ymin = y_pos.min() - (bar_height / 2) - outer_pad
    ymax = y_pos.max() + (bar_height / 2) + outer_pad
    ax.set_ylim(ymin, ymax)

    ax.set_xlim([x_origin, len_x])

    always_keep_cleaned = {clean_tool_name(t) for t in always_keep}
    for tick, label in zip(ax.get_yticklabels(), selected_df["Cleaned_Tool"]):
        emphasise = bold_keep_tools and label in always_keep_cleaned
        tick.set_fontweight('bold' if emphasise else 'normal')

    plt.tight_layout()
    # The path is kept as a str for the log line; wrap it in Path to reach .parent
    # (calling .parent on the str itself raised AttributeError).
    Path(output_figure_path).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_figure_path, dpi=300)
    plt.close(fig)
    print(f"Filtered figure saved to: {output_figure_path}")

In [6]:
# One filtered bar chart per benchmark dataset; len_x sets the upper x-axis
# limit where the function's default is not wanted.
generate_filtered_benchmark_figure("clinical")
generate_filtered_benchmark_figure("functional")
generate_filtered_benchmark_figure("dd", len_x=0.82)
generate_filtered_benchmark_figure("ndd", len_x=0.66)
generate_filtered_benchmark_figure("cancer", len_x=0.95)
generate_filtered_benchmark_figure("mave_gam")

Filtered figure saved to: ../results/figures/benchmarks/clinical_auc_filtered_plot.png
Filtered figure saved to: ../results/figures/benchmarks/functional_auc_filtered_plot.png
Filtered figure saved to: ../results/figures/benchmarks/dd_auc_filtered_plot.png
Filtered figure saved to: ../results/figures/benchmarks/ndd_auc_filtered_plot.png
Filtered figure saved to: ../results/figures/benchmarks/cancer_auc_filtered_plot.png
Filtered figure saved to: ../results/figures/benchmarks/mave_gam_auc_filtered_plot.png


#### Generate MAVE violin plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
import re
from matplotlib.patches import Patch
from matplotlib import font_manager as fm
from pathlib import Path

# This cell sets mpl.rcParams but did not import matplotlib as mpl itself, so it
# only worked when an earlier cell had already run.
import matplotlib as mpl

# Register the bundled Aptos font file with matplotlib's font manager and make
# its family name the default for every figure drawn after this point.
font_path = "../resources/fonts/Aptos.ttf"
fm.fontManager.addfont(font_path)
prop = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = prop.get_name()

def plot_mave_violin(
    tools_per_category: int = 3,
    include_legend: bool = True,
    remove_clinvep: bool = True,
    remove_varity: bool = False,
    *,
    x_gap_frac: float = 0.6,
    violin_width: float = 0.3,
    fig_width_per_tool: float = 0.75,
    fig_height: float = 6.0,
    left_pad: float = 0.5,
    right_pad_frac: float = 0.25,
    title_fontsize: int = 16,
    ylabel_fontsize: int = 14,
    xtick_fontsize: int = 12,
    mean_fontsize: int = 9,
    ytick_fontsize: int = 14,
    y_major_step: float = 0.2,
    y_minor_step: float = 0.05,
):
    """Draw half-violin plots of per-source R² for the top tools of each category.

    Reads ``../results/benchmarking/mave_gam_correlation.txt``, keeps the
    ``tools_per_category`` tools with the highest ``Mean_R2`` in every category
    and draws, for each, a one-sided kernel-density shape to the right of a
    vertical spine, a dot at the mean and the mean value printed above. The
    figure is saved to
    ``../results/figures/benchmarks/mave_gam_violin_plot.png``.

    Args:
        tools_per_category: Number of top tools kept per category.
        include_legend: Draw the category legend to the right of the axes.
        remove_clinvep: Drop tools whose name starts with ``"ClinVEP"``.
        remove_varity: Drop tools whose name starts with ``"VARITY"``.
        x_gap_frac: Horizontal distance between consecutive violin spines.
        violin_width: Maximum horizontal extent of a violin; clamped to 95% of
            ``x_gap_frac`` (with a printed notice) so shapes do not overlap.
        fig_width_per_tool: Figure width per tool, scaled by ``x_gap_frac``
            (the figure is at least 8 inches wide).
        fig_height: Figure height in inches.
        left_pad: Space left of the first spine, in x data units.
        right_pad_frac: Space right of the last violin, as a fraction of
            ``x_gap_frac``.
        title_fontsize, ylabel_fontsize, xtick_fontsize, mean_fontsize,
        ytick_fontsize: Font sizes of the respective text elements.
        y_major_step, y_minor_step: Spacing of major and minor y ticks.

    Raises:
        ValueError: If no tool is left to plot after filtering.
    """
    gam_df = pd.read_csv("../results/benchmarking/mave_gam_correlation.txt", sep="\t")
    features_df = (
        pd.read_csv("../resources/feature_lists/all_columns.txt", sep="\t")
        .rename(columns={"Name": "Tool"})
    )
    gam_df = gam_df.merge(features_df[["Tool", "Category"]], on="Tool", how="left")

    category_rename = {
        "Functional-Trained": "Functional-trained",
        "Clinical-Trained Meta Predictor": "Clinical-trained (meta)",
        "Clinical-Trained Single Predictor": "Clinical-trained (single)",
        "Population-Free": "Population-free",
        "Population-Tuned": "Population-tuned",
    }
    gam_df["Category"] = gam_df["Category"].map(category_rename)

    if remove_clinvep:
        gam_df = gam_df[~gam_df["Tool"].str.startswith("ClinVEP")].copy()

    if remove_varity:
        gam_df = gam_df[~gam_df["Tool"].str.startswith("VARITY")].copy()

    color_map = {
        "Functional-trained": "#000080",
        "Clinical-trained (meta)": "#b21e35",
        "Clinical-trained (single)": "goldenrod",
        "Population-free": "#98e5a5",
        "Population-tuned": "#23a4a6",
    }

    def clean_tool_name(tool: str) -> str:
        # Strip the "glm_" prefix and "_score" suffix, then turn underscore runs into "-".
        tool = re.sub(r"^glm_", "", tool)
        tool = re.sub(r"_score$", "", tool)
        return tool.replace("___", "-").replace("__", "-").replace("_", "-")

    # Sorting first means head() returns the best tools of each category.
    gam_df = gam_df.sort_values("Mean_R2", ascending=False)
    top_per_category = (
        gam_df.groupby("Category", group_keys=False)
        .head(tools_per_category)
        .copy()
    )
    top_per_category["Tool"] = top_per_category["Tool"].apply(clean_tool_name)

    tools = top_per_category["Tool"].tolist()
    if not tools:
        raise ValueError("No tools left to plot after filtering.")

    tool_to_color = {
        row["Tool"]: color_map.get(row["Category"], "gray")
        for _, row in top_per_category.iterrows()
    }

    # Every column other than Tool/Mean_R2/Category becomes one (Source, R2) row.
    # NOTE(review): assumes all remaining columns hold per-source R² values - confirm.
    gam_long = (
        top_per_category.melt(
            id_vars=["Tool", "Mean_R2", "Category"],
            var_name="Source",
            value_name="R2",
        )
        .dropna(subset=["R2"])
    )

    # NOTE(review): sns.set() applies seaborn's own rc defaults, which may override
    # the font family chosen at notebook level - confirm the rendered font.
    sns.set(style="white", context="talk", font_scale=1.2)

    y_max = 0.8
    label_y = y_max + 0.01  # baseline of the mean-value labels, just above the violins

    x_step = x_gap_frac
    x_positions = np.arange(len(tools)) * x_step

    max_safe_width = x_step * 0.95
    if violin_width > max_safe_width:
        print(
            f"[plot_mave_violin] violin_width={violin_width:.2f} too large for "
            f"x_step={x_step:.2f}; clamping to {max_safe_width:.2f}."
        )
        violin_width = max_safe_width
    width = violin_width

    fig_w = max(8.0, len(tools) * fig_width_per_tool * x_gap_frac)
    fig, ax = plt.subplots(figsize=(fig_w, fig_height))

    y_grid = np.linspace(0, y_max, 200)
    for x0, tool in zip(x_positions, tools):
        data = gam_long[gam_long["Tool"] == tool]["R2"].dropna().values
        if len(data) < 2:
            # A kernel density estimate needs at least two points.
            continue

        kde = gaussian_kde(data, bw_method=0.2)
        densities = kde(y_grid)
        # Rescale so the densest point reaches exactly `width`.
        densities = densities / densities.max() * width

        ax.fill_betweenx(
            y_grid,
            x0,
            x0 + densities,
            facecolor=tool_to_color[tool],
            alpha=0.7,
            linewidth=1,
            edgecolor="black",
        )
        # Vertical spine the half-violin is attached to.
        ax.plot(
            [x0, x0],
            [0, y_max],
            color="k",
            lw=1,
            alpha=0.7,
        )

        mean_val = np.mean(data)
        ax.plot(
            x0,
            mean_val,
            "o",
            color="black",
            markersize=6,
            zorder=10,
        )
        ax.text(
            x0,
            label_y,
            f"{mean_val:.2f}",
            ha="center",
            va="bottom",
            fontsize=mean_fontsize,
        )

    ax.set_xticks(x_positions)
    ax.set_xticklabels(tools, rotation=45, ha="right", fontsize=xtick_fontsize)
    ax.set_ylabel("GAM $R^2$", fontsize=ylabel_fontsize)
    ax.set_ylim(0, label_y + 0.03)
    ax.set_title("MAVE correlation performance", pad=20, fontsize=title_fontsize)

    right_pad = x_step * right_pad_frac
    ax.set_xlim(-left_pad, x_positions[-1] + width + right_pad)

    sns.despine(ax=ax, top=True, right=True, left=False, bottom=False, trim=False)

    # The small offset past 0.8 makes np.arange include the 0.8 tick itself.
    ax.set_yticks(np.arange(0, 0.8001 + 1e-9, y_major_step))
    ax.set_yticks(np.arange(0, 0.8001 + 1e-9, y_minor_step), minor=True)

    ax.tick_params(axis="y", which="major", direction="inout", length=5, width=1.2, labelsize=ytick_fontsize)
    ax.tick_params(axis="y", which="minor", direction="inout", length=3, width=1)

    ax.grid(axis="y", which="major", linestyle="-", linewidth=0.6, color="0.85")
    ax.grid(axis="y", which="minor", linestyle="-", linewidth=0.4, color="0.92")

    if include_legend:
        handles = [
            plt.Line2D(
                [0], [0],
                marker="o",
                color="w",
                label=cat,
                markerfacecolor=color,
                markeredgecolor="black",
                markersize=10,
            )
            for cat, color in color_map.items()
        ]
        ax.legend(
            handles=handles,
            loc="upper left",
            bbox_to_anchor=(0.98, 0.85),
            title="Category",
            fontsize=12,
            title_fontsize=14,
            frameon=True,
            edgecolor="gray",
        )

    out_base = "../results/figures/benchmarks/mave_gam_violin_plot"
    # out_base stays a str for the file name and log line; wrap it in Path to reach
    # .parent (calling .parent on the str itself raised AttributeError).
    Path(out_base).parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout(rect=[0.01, 0, 0.93, 1])
    fig.savefig(f"{out_base}.png", bbox_inches="tight", dpi=300)
    plt.close(fig)
    print(f"Saved: {out_base}.png")


In [8]:
# Violin plot of the top three tools per category, without a legend and with
# tools whose names start with "ClinVEP" or "VARITY" removed.
plot_mave_violin(tools_per_category=3, include_legend=False, remove_clinvep=True, remove_varity=True)

Saved: ../results/figures/benchmarks/mave_gam_violin_plot.png


#### Plot ACMG classification performance

In [None]:
import matplotlib as mpl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from matplotlib import font_manager as fm
from pathlib import Path

# Register the bundled Aptos font file with matplotlib's font manager and make
# its family name the default for every figure drawn after this point.
font_path = "../resources/fonts/Aptos.ttf"
fm.fontManager.addfont(font_path)
prop = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = prop.get_name()

# One row per tool; "Conclusiveness" is derived as 1 minus the "VUS_M ratio" column.
df = pd.read_csv("../results/benchmarking/acmg_performance.txt", sep="\t")
df["Conclusiveness"] = 1 - df["VUS_M ratio"]

metrics = ["Sensitivity", "Specificity", "Concordance", "Conclusiveness"]

# Every tool in the input file must have an entry here (lookup raises KeyError otherwise).
color_map = {
    "AlphaMissense": "#23a4a6",
    "ESM1b": "#98e5a5",
    "FuncVEP-CTI": "#001c71",
    "FuncVEP-CTE": "#2743b5",
    "FuncVEP-SP": "#8fa3ff",
}

bar_width = 0.15
x = np.arange(len(metrics))
# Centre the group of bars on each metric tick, one bar width apart. For five
# tools this equals the former np.linspace(-2, 2, 5) * bar_width, and it keeps
# bars adjacent for any other number of tools as well.
n_tools = len(df)
offsets = (np.arange(n_tools) - (n_tools - 1) / 2) * bar_width

fig, ax = plt.subplots(figsize=(8, 4.5))

# df.loc[i, ...] relies on the default 0..n-1 index produced by read_csv.
for i, tool in enumerate(df["Tool"]):
    y_vals = df.loc[i, metrics].values.astype(float)
    x_positions = x + offsets[i]

    ax.bar(
        x_positions,
        y_vals,
        width=bar_width,
        color=color_map[tool],
        edgecolor="black",
        label=tool,
        linewidth=0.4,
    )

ax.set_xticks(x)
ax.set_xticklabels(metrics, fontsize=9)
ax.set_ylabel("Performance", fontsize=10)
# The y-axis is cropped to 0.70-1.0; values below 0.70 are not visible.
ax.set_ylim([0.70, 1.0])
ax.set_title("ACMG Classification Performance", fontsize=11)
ax.tick_params(axis="y", labelsize=8)

legend_elements = [
    Patch(
        facecolor=color_map[t],
        edgecolor="black",
        linewidth=0.6,
        label=t
    )
    for t in df["Tool"]
]

# Single-row legend placed below the axes.
ax.legend(
    handles=legend_elements,
    fontsize=8.5,
    frameon=False,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.22),
    ncol=5
)

Path("../results/figures/benchmarks").mkdir(parents=True, exist_ok=True)
# Operate on this figure explicitly rather than on pyplot's current figure.
fig.tight_layout()
fig.savefig("../results/figures/benchmarks/acmg_benchmark.png", dpi=300, bbox_inches="tight")
plt.close(fig)


#### Generate benchmark tables

In [None]:
import pandas as pd
import re
from pathlib import Path

def generate_benchmark_table(dataset_name="clinical", filter_glm=False):
    """Build, save and return the per-tool benchmark table for one dataset.

    Reads ``../results/benchmarking/{dataset_name}.txt`` and the feature list
    ``../resources/feature_lists/all_columns.txt``, drops every tool whose
    category is "Variant Effect Predictor", attaches the category, cleans the
    tool names and writes the result, sorted by descending AUC, to
    ``../results/tables/benchmarks/{dataset_name}_benchmark.txt``.

    Args:
        dataset_name: Benchmark dataset key (file stem of the input table).
        filter_glm: If true, keep only the top-AUC ``glm_*`` tool of this
            dataset and the top-AUC ``glm_*`` tool of the counterpart dataset
            ("functional" when ``dataset_name`` is "clinical", otherwise
            "clinical"). The counterpart file is read only in this case.

    Returns:
        DataFrame with columns ``Tool``, ``AUC``, ``Accuracy`` and ``Category``.
    """
    benchmark_file = f"../results/benchmarking/{dataset_name}.txt"
    features_file = "../resources/feature_lists/all_columns.txt"
    output_file = f"../results/tables/benchmarks/{dataset_name}_benchmark.txt"

    benchmark_df = pd.read_csv(benchmark_file, sep="\t")
    features_df = pd.read_csv(features_file, sep="\t").rename(columns={"Name": "Tool"})

    # Feature categories whose tools are left out of the table.
    excluded_types = [
        "Variant Effect Predictor",
    ]
    excluded_tools = features_df.loc[
        features_df["Category"].isin(excluded_types), "Tool"
    ].unique()
    benchmark_df = benchmark_df[~benchmark_df["Tool"].isin(excluded_tools)].copy()

    if filter_glm:
        other_dataset = "functional" if dataset_name == "clinical" else "clinical"
        other_df = pd.read_csv(f"../results/benchmarking/{other_dataset}.txt", sep="\t")

        def get_top_glm_tool(df):
            # Name of the glm_* tool with the highest AUC, or None when there is none.
            glm_subset = df[df["Tool"].str.startswith("glm_")]
            if glm_subset.empty:
                return None
            return glm_subset.loc[glm_subset["AUC"].idxmax(), "Tool"]

        glm_to_keep = {
            name
            for name in (get_top_glm_tool(benchmark_df), get_top_glm_tool(other_df))
            if name
        }

        is_glm = benchmark_df["Tool"].str.startswith("glm_")
        benchmark_df = benchmark_df[~is_glm | benchmark_df["Tool"].isin(glm_to_keep)].copy()

    benchmark_df = benchmark_df.merge(features_df[["Tool", "Category"]], on="Tool", how="left")

    def clean_tool_name(tool):
        # Strip the "glm_" prefix and "_score" suffix, then turn underscore runs into "-".
        tool = re.sub(r"^glm_", "", tool)
        tool = re.sub(r"_score$", "", tool)
        return tool.replace("___", "-").replace("__", "-").replace("_", "-")

    benchmark_df["Tool"] = benchmark_df["Tool"].apply(clean_tool_name)

    final_df = (
        benchmark_df[["Tool", "AUC", "Accuracy", "Category"]]
        .sort_values("AUC", ascending=False)
        .reset_index(drop=True)
    )

    # output_file stays a str for the log line; wrap it in Path to reach .parent
    # (calling .parent on the str itself raised AttributeError).
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    final_df.to_csv(output_file, sep="\t", index=False)
    print(f"Benchmark table saved to: {output_file}")

    return final_df


In [None]:
# Write the per-tool benchmark tables for the clinical and functional datasets.
generate_benchmark_table("clinical", filter_glm=False)
generate_benchmark_table("functional", filter_glm=False)

Benchmark table saved to: ../results/tables/benchmarks/clinical_benchmark.txt
Benchmark table saved to: ../results/tables/benchmarks/functional_benchmark.txt



In [None]:
def save_average_auc_by_category(dataset_name="clinical", filter_glm=False):
    """Average the benchmark AUC per category, save the table and return it.

    Calls ``generate_benchmark_table`` (which also rewrites the per-tool
    table), takes the mean AUC of each category and writes the result, sorted
    by descending AUC, to
    ``../results/tables/benchmarks/{dataset_name}_average_auc_by_category.txt``.

    Args:
        dataset_name: Benchmark dataset key passed to ``generate_benchmark_table``.
        filter_glm: Forwarded to ``generate_benchmark_table``.

    Returns:
        DataFrame with columns ``Category`` and ``AUC``.
    """
    benchmark_df = generate_benchmark_table(dataset_name, filter_glm)
    category_auc_df = (
        benchmark_df.groupby("Category", as_index=False)["AUC"]
        .mean()
        .sort_values("AUC", ascending=False)
    )
    output_file = f"../results/tables/benchmarks/{dataset_name}_average_auc_by_category.txt"
    # output_file stays a str for the log line; wrap it in Path to reach .parent
    # (calling .parent on the str itself raised AttributeError).
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    category_auc_df.to_csv(output_file, sep="\t", index=False)
    print(f"Average AUC by category table saved to: {output_file}")
    return category_auc_df


In [13]:
# Write the per-category average AUC tables. Each call also regenerates the
# per-tool benchmark table through generate_benchmark_table.
save_average_auc_by_category("clinical", filter_glm=False)
save_average_auc_by_category("functional", filter_glm=False)

Benchmark table saved to: ../results/tables/benchmarks/clinical_benchmark.txt
Average AUC by category table saved to: ../results/tables/benchmarks/clinical_average_auc_by_category.txt
Benchmark table saved to: ../results/tables/benchmarks/functional_benchmark.txt
Average AUC by category table saved to: ../results/tables/benchmarks/functional_average_auc_by_category.txt


Unnamed: 0,Category,AUC
2,Functional-Trained,0.960354
1,Clinical-Trained Single Predictor,0.808367
3,Population-Free,0.800284
4,Population-Tuned,0.759257
0,Clinical-Trained Meta Predictor,0.756777


In [None]:
import pandas as pd

def save_and_merge_average_auc_by_category(filter_glm=False):
    """Merge the per-category average AUC of both datasets into one table.

    Calls ``save_average_auc_by_category`` for the functional and the clinical
    dataset (each call also writes its own table), outer-joins the two results
    on ``Category`` and writes the merged table, sorted by descending
    functional AUC with missing values last, to
    ``../results/tables/benchmarks/merged_average_auc_by_category.txt``.

    Args:
        filter_glm: Forwarded to ``save_average_auc_by_category``.

    Returns:
        DataFrame with columns ``Category``,
        ``Average AUC on functional data`` and ``Average AUC on clinical data``.
    """
    functional_df = save_average_auc_by_category(
        dataset_name="functional", filter_glm=filter_glm
    ).rename(columns={"AUC": "Average AUC on functional data"})
    clinical_df = save_average_auc_by_category(
        dataset_name="clinical", filter_glm=filter_glm
    ).rename(columns={"AUC": "Average AUC on clinical data"})

    # Outer join keeps categories that occur in only one of the two datasets.
    merged_df = pd.merge(functional_df, clinical_df, on="Category", how="outer").sort_values(
        by="Average AUC on functional data", ascending=False, na_position='last'
    )

    output_file = "../results/tables/benchmarks/merged_average_auc_by_category.txt"
    # output_file stays a str for the log line; wrap it in Path to reach .parent
    # (calling .parent on the str itself raised AttributeError).
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_csv(output_file, sep="\t", index=False)
    print(f"Merged average AUC table saved to: {output_file}")

    return merged_df

# Write the merged table; as the last expression of the cell, the returned
# DataFrame is also displayed.
save_and_merge_average_auc_by_category()

Benchmark table saved to: ../results/tables/benchmarks/functional_benchmark.txt
Average AUC by category table saved to: ../results/tables/benchmarks/functional_average_auc_by_category.txt
Benchmark table saved to: ../results/tables/benchmarks/clinical_benchmark.txt
Average AUC by category table saved to: ../results/tables/benchmarks/clinical_average_auc_by_category.txt
Merged average AUC table saved to: ../results/tables/benchmarks/merged_average_auc_by_category.txt


Unnamed: 0,Category,Average AUC on functional data,Average AUC on clinical data
2,Functional-Trained,0.960354,0.970876
1,Clinical-Trained Single Predictor,0.808367,0.923161
3,Population-Free,0.800284,0.905156
4,Population-Tuned,0.759257,0.879514
0,Clinical-Trained Meta Predictor,0.756777,0.910209
