## Benchmark MAVE correlation performance

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import pearsonr
from pygam import LinearGAM, s
from sklearn.metrics import r2_score
from scipy.stats import spearmanr

In [None]:
# Load the filtered MAVE test-variant table (tab-separated text file).
MAVE_VARIANTS_FILE = "../data/intermediate/mave_test_variants_filtered.txt"
mave_test_variants = pd.read_csv(MAVE_VARIANTS_FILE, sep="\t")

In [None]:
# Collapse the four locus columns into a single "chr-pos-ref-alt" identifier, then drop them.
# Guarded so that re-running this cell after the columns are gone is a no-op instead of a
# KeyError; a frame that has neither the locus columns nor an ID still fails loudly.
VARIANT_KEY_COLS = ["chr", "pos", "ref", "alt"]
missing_key_cols = [c for c in VARIANT_KEY_COLS if c not in mave_test_variants.columns]
if not missing_key_cols:
    mave_test_variants["ID"] = mave_test_variants[VARIANT_KEY_COLS].astype(str).agg("-".join, axis=1)
    mave_test_variants = mave_test_variants.drop(columns=VARIANT_KEY_COLS)
elif "ID" not in mave_test_variants.columns:
    raise KeyError(f"Cannot build variant ID; missing columns: {missing_key_cols}")

# Every column that is not metadata or the measured score is treated as a tool prediction.
NON_TOOL_COLS = ["ID", "ensg", "score", "source", "gene"]
TOOL_COLS = [c for c in mave_test_variants.columns if c not in NON_TOOL_COLS]

# Coerce tool predictions and the score to numeric; unparseable entries become NaN.
mave_test_variants[TOOL_COLS] = mave_test_variants[TOOL_COLS].apply(pd.to_numeric, errors="coerce")
mave_test_variants["score"] = pd.to_numeric(mave_test_variants["score"], errors="coerce")

In [None]:
# Preview the preprocessed table (the rendered output is truncated to the first and last rows).
mave_test_variants

Unnamed: 0,ensg,score,source,gene,PrimateAI_score,PAI3D_score,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,MutationTaster_score,...,EVH_independent,EWSIM,sigma_score,ID,FuncVEP_CTI,FuncVEP_CTE,FuncVEP_SP,ClinVEP_CTI,ClinVEP_CTE,ClinVEP_SP
0,ENSG00000120948,-0.342700,urn:mavedb:00000060-a-2,TARDBP,0.842842,0.693236,0.043,0.006,0.002,0.78,...,-6.615,-3.948,0.923124,1-11022403-A-C,0.151133,0.891176,0.909904,0.028576,0.984980,0.959452
1,ENSG00000120948,-0.066900,urn:mavedb:00000060-a-2,TARDBP,0.788505,0.601515,0.131,0.462,0.084,0.67,...,-1.022,-3.145,0.809167,1-11022403-A-G,0.020714,0.546472,0.735746,0.018498,0.439737,0.587350
2,ENSG00000120948,0.481300,urn:mavedb:00000060-a-2,TARDBP,0.818508,0.742082,0.006,0.994,0.570,0.73,...,-6.615,-3.948,0.859130,1-11022403-A-T,0.318712,0.923565,0.856310,0.403256,0.983367,0.980480
3,ENSG00000120948,-0.193600,urn:mavedb:00000060-a-2,TARDBP,0.838058,0.582136,0.238,0.027,,0.84,...,-1.028,-2.559,0.853733,1-11022404-G-A,0.053620,0.524844,0.677054,0.016765,0.721399,0.374597
4,ENSG00000120948,0.474200,urn:mavedb:00000060-a-2,TARDBP,0.817442,0.582486,0.024,0.635,0.112,0.69,...,-2.311,-3.570,0.856973,1-11022404-G-C,0.031629,0.643125,0.725342,0.026908,0.829906,0.428562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172925,ENSG00000134086,-3.561439,tmp:d624e093-ac9b,VHL,0.818915,0.898620,,1.000,0.999,0.76,...,,,0.996440,3-10146556-T-G,0.890458,0.958075,0.985788,0.999291,0.994619,0.990489
172926,ENSG00000134086,-3.573963,tmp:d624e093-ac9b,VHL,0.882541,0.738942,,1.000,0.997,1.00,...,,,0.996326,3-10142180-C-G,0.944069,0.991612,0.982732,0.998473,0.994076,0.982556
172927,ENSG00000134086,-3.590531,tmp:d624e093-ac9b,VHL,0.870494,0.724138,,1.000,0.994,0.99,...,,,0.991407,3-10142167-G-C,0.890796,0.978566,0.986710,0.926636,0.994373,0.987596
172928,ENSG00000134086,-3.607511,tmp:d624e093-ac9b,VHL,0.879346,0.675965,,0.993,0.932,0.94,...,,,0.979145,3-10142179-G-T,0.878582,0.985083,0.988095,0.997974,0.988474,0.990000


#### Benchmarking function

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pygam import LinearGAM, s
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import r2_score

def _pairwise_r2(x: np.ndarray, y: np.ndarray, methods: list) -> dict:
    """Score how well one tool's values track the measured scores for one source.

    Parameters
    ----------
    x : np.ndarray
        1-D float array of tool predictions (no missing values; the caller drops them).
    y : np.ndarray
        1-D float array of measured scores, aligned with ``x``.
    methods : list
        Metric names to compute; any of "GAM", "Pearson", "Spearman".

    Returns
    -------
    dict
        Maps each requested method to its squared-association value, or NaN when the
        metric cannot be computed (fit failure, or zero variance for the correlations).
    """
    scores = {}

    if "GAM" in methods:
        try:
            # In-sample R² of a GAM with a single spline term on the tool's prediction.
            features = x.reshape(-1, 1)
            gam = LinearGAM(s(0)).fit(features, y)
            scores["GAM"] = r2_score(y, gam.predict(features))
        except Exception:
            # Best effort: a failed fit is recorded as missing. `Exception` (rather than a
            # bare except) lets KeyboardInterrupt stop a long run.
            scores["GAM"] = np.nan

    correlation_functions = {"Pearson": pearsonr, "Spearman": spearmanr}
    for method, corr_fn in correlation_functions.items():
        if method not in methods:
            continue
        try:
            # Correlation is undefined when either variable is constant.
            if np.nanvar(x) > 0 and np.nanvar(y) > 0:
                scores[method] = corr_fn(x, y)[0] ** 2
            else:
                scores[method] = np.nan
        except Exception:
            scores[method] = np.nan

    return scores


def benchmark_tools(
    dataset: pd.DataFrame,
    tool_columns: list,
    score_column: str = "score",
    group_column: str = "source",
    id_column: str = "ID",
    min_samples: int = 10,
    min_sources: int = 3,
    methods: list = None
) -> dict:
    """Benchmark prediction tools against measured scores, separately for each source.

    For every (source, tool) pair, rows where either the tool value or the score is missing
    are dropped. Pairs with fewer than ``min_samples`` remaining rows get NaN; otherwise
    each requested metric is computed:

    * "GAM": in-sample R² of a ``LinearGAM`` with one spline term fit on the tool's values.
    * "Pearson": squared Pearson correlation.
    * "Spearman": squared Spearman correlation.

    Parameters
    ----------
    dataset : pd.DataFrame
        Table holding ``group_column``, ``score_column`` and all ``tool_columns``.
    tool_columns : list
        Names of the tool prediction columns (any iterable of names; duplicates are ignored).
    score_column : str
        Column with the measured score.
    group_column : str
        Column identifying the source; rows with a missing source are skipped.
    id_column : str
        Kept for backward compatibility; it is not used in the computation and the column
        does not need to be present.
    min_samples : int
        Minimum number of complete rows required to score a (source, tool) pair.
    min_sources : int
        Minimum number of sources with a non-NaN value for a tool to be kept in the output.
    methods : list
        Metrics to compute. ``None`` (the default) means ``["GAM"]``. A single method name
        given as a string is also accepted.

    Returns
    -------
    dict
        Maps each method to a wide DataFrame with one row per tool and the columns
        ``Tool``, ``Mean_R2`` (mean over sources, ignoring NaN), followed by one column per
        source. Rows are sorted by ``Mean_R2`` in descending order. When there is nothing
        to score, the frame is empty with columns ``Tool`` and ``Mean_R2``.

    Raises
    ------
    ValueError
        If ``methods`` contains a name other than "GAM", "Pearson" or "Spearman".
    """
    supported_methods = ("GAM", "Pearson", "Spearman")

    # Normalize `methods`: avoid a shared mutable default, accept a bare string, and drop
    # duplicates (which would otherwise produce duplicate pivot entries) keeping order.
    if methods is None:
        methods = ["GAM"]
    elif isinstance(methods, str):
        methods = [methods]
    methods = list(dict.fromkeys(methods))

    unknown_methods = [m for m in methods if m not in supported_methods]
    if unknown_methods:
        # Fail before any model is fit rather than with a KeyError mid-run.
        raise ValueError(
            f"Unsupported method(s) {unknown_methods}; choose from {list(supported_methods)}"
        )

    # Accept any iterable of column names (e.g. a pandas Index) and ignore duplicates.
    tool_columns = list(dict.fromkeys(tool_columns))

    sources = sorted(dataset[group_column].dropna().unique())
    records = {method: [] for method in methods}

    for source_name in tqdm(sources, desc="Benchmarking sources"):
        source_rows = dataset.loc[
            dataset[group_column] == source_name, [score_column] + tool_columns
        ]

        for tool in tool_columns:
            complete = source_rows[[tool, score_column]].dropna()
            if len(complete) < min_samples:
                # Too few observations: every metric is reported as NaN below.
                scores = {}
            else:
                scores = _pairwise_r2(
                    complete[tool].astype(float).to_numpy(),
                    complete[score_column].astype(float).to_numpy(),
                    methods,
                )

            for method in methods:
                records[method].append({
                    "Tool": tool,
                    "Source": source_name,
                    "R2": scores.get(method, np.nan)
                })

    output_dfs = {}
    for method in methods:
        long_df = pd.DataFrame(records[method], columns=["Tool", "Source", "R2"])
        if long_df.empty:
            # No sources or no tools: return an empty table instead of failing in pivot.
            output_dfs[method] = pd.DataFrame(columns=["Tool", "Mean_R2"])
            continue

        df_wide = long_df.pivot(index="Tool", columns="Source", values="R2").reset_index()
        source_cols = [col for col in df_wide.columns if col != "Tool"]

        # Keep only tools that were scored on enough sources.
        enough_sources = df_wide[source_cols].notna().sum(axis=1) >= min_sources
        df_wide = df_wide[enough_sources].copy()

        df_wide["Mean_R2"] = df_wide[source_cols].mean(axis=1, skipna=True)
        df_wide = df_wide.sort_values("Mean_R2", ascending=False)
        output_dfs[method] = df_wide[["Tool", "Mean_R2"] + source_cols]

    return output_dfs


In [None]:
# Score every tool against every MAVE source using the GAM-based R².
results = benchmark_tools(
    dataset=mave_test_variants,
    tool_columns=TOOL_COLS,
    min_samples=10,
    min_sources=3,
    methods=["GAM"]
)

# Create the output directory first so the benchmark run is not lost to a missing folder
# when writing the table.
gam_output_path = Path("../results/benchmarking/mave_gam_correlation.txt")
gam_output_path.parent.mkdir(parents=True, exist_ok=True)
results["GAM"].to_csv(gam_output_path, sep="\t", index=False, na_rep="NA")

Benchmarking sources: 100%|██████████| 58/58 [01:02<00:00,  1.08s/it]
