In [None]:
import itertools
import os

import numpy as np
import pandas as pd
from pandarallel import pandarallel

from src.utils import ResultGenerator
from settings import RANDOM_STATE, COMPLETE_METRICS_PATH, COMPLETE_RESULTS_PATH, INCOMPLETE_METRICS_PATH, INCOMPLETE_RESULTS_PATH, \
    COMPLETE_RBMETRICS_PATH, INCOMPLETE_RBMETRICS_PATH, COMPARISON_ALG_PATH, UNS_METRICS_COMPLETE_ALG_PATH, UNS_METRICS_INCOMPLETE_ALG_PATH

In [None]:
# Run-wide configuration used by the cells below.
verbose = True
progress_bar = True
nb_workers = 10          # worker count passed to ResultGenerator.preprocess_results
n_permutations = 1000    # permutation count passed to the supervised / robustness metric calls
folder_results = "results"
# Columns that together identify one run of one experimental setting.
indexes = [
    "dataset",
    "algorithm",
    "missing_percentage",
    "amputation_mechanism",
    "imputation",
    "run_n",
]

In [None]:
# Load and preprocess both result files. The `verbose` / `progress_bar` flags from the
# configuration cell are forwarded instead of repeating hard-coded ``True`` literals, so
# changing the configuration actually takes effect here.
complete_results = ResultGenerator.preprocess_results(
    results_path=COMPLETE_RESULTS_PATH, verbose=verbose, nb_workers=nb_workers, progress_bar=progress_bar
)
incomplete_results = ResultGenerator.preprocess_results(
    results_path=INCOMPLETE_RESULTS_PATH, verbose=verbose, nb_workers=nb_workers, progress_bar=progress_bar
)
# One frame holding both result sets; the original row labels are discarded.
results = pd.concat([complete_results, incomplete_results], ignore_index=True)

In [None]:
# Unsupervised metrics for both result sets; each call is handed its own copy of the frame.
# Only the return value of the incomplete-data call is kept (in `outputs`).
_ = ResultGenerator.save_unsupervised_metrics(
    results=complete_results.copy(),
    filepath=UNS_METRICS_COMPLETE_ALG_PATH,
    random_state=RANDOM_STATE,
    progress_bar=True,
)
outputs = ResultGenerator.save_unsupervised_metrics(
    results=incomplete_results.copy(),
    filepath=UNS_METRICS_INCOMPLETE_ALG_PATH,
    random_state=RANDOM_STATE,
    progress_bar=True,
)

In [None]:
# Algorithm-vs-algorithm comparison computed on a copy of the combined results frame.
outputs = ResultGenerator.save_alg_comparison(
    results=results.copy(),
    filepath=COMPARISON_ALG_PATH,
    progress_bar=True,
)

In [None]:
# Supervised metrics for both result sets, using the configured number of permutations.
# Only the return value of the incomplete-data call is kept (in `outputs`).
_ = ResultGenerator.save_supervised_metrics(
    results=complete_results.copy(),
    filepath=COMPLETE_METRICS_PATH,
    random_state=RANDOM_STATE,
    n_permutations=n_permutations,
)
outputs = ResultGenerator.save_supervised_metrics(
    results=incomplete_results.copy(),
    filepath=INCOMPLETE_METRICS_PATH,
    random_state=RANDOM_STATE,
    n_permutations=n_permutations,
)

In [None]:
# Robustness metrics for both result sets, using the configured number of permutations.
# Only the return value of the incomplete-data call is kept (in `outputs`).
_ = ResultGenerator.save_robustness_metrics(
    results=complete_results.copy(),
    filepath=COMPLETE_RBMETRICS_PATH,
    random_state=RANDOM_STATE,
    n_permutations=n_permutations,
)
outputs = ResultGenerator.save_robustness_metrics(
    results=incomplete_results.copy(),
    filepath=INCOMPLETE_RBMETRICS_PATH,
    random_state=RANDOM_STATE,
    n_permutations=n_permutations,
)

In [None]:
# Preprocess a second incomplete-results file (hard-coded path rather than a constant from
# `settings`) and rebind `incomplete_results` to it, then recompute the unsupervised and
# supervised metrics on that frame. `outputs` ends up holding the supervised-metrics result.
# NOTE(review): both save_* calls receive the same filepath constants as the earlier cells —
# confirm that reusing those targets for this second file is intended.
incomplete_results = ResultGenerator.preprocess_results(results_path= "results/incomplete_algorithms_evaluation_2.csv", verbose=True, nb_workers=nb_workers, progress_bar=True)
outputs = ResultGenerator.save_unsupervised_metrics(results = incomplete_results.copy(), filepath= UNS_METRICS_INCOMPLETE_ALG_PATH, random_state=RANDOM_STATE, progress_bar=True)
outputs = ResultGenerator.save_supervised_metrics(results = incomplete_results.copy(), filepath= INCOMPLETE_METRICS_PATH, random_state=RANDOM_STATE, n_permutations=n_permutations)

In [None]:
def plot_graph(df, alg1_col, alg2_col, value_col, figsize= (12, 8), random_state = None):
    """Draw a weighted graph of pairwise agreement between algorithms.

    Every row of ``df`` contributes one undirected edge between ``row[alg1_col]`` and
    ``row[alg2_col]`` whose weight is ``row[value_col]`` rounded to two decimals. Edge
    width and edge colour both scale with the weight, and a colour bar labelled
    ``Agreement (<value_col>)`` is added to the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pair; must contain ``alg1_col``, ``alg2_col`` and ``value_col``.
    alg1_col, alg2_col : str
        Columns holding the two endpoints of each edge.
    value_col : str
        Column holding the numeric edge weight.
    figsize : tuple, default (12, 8)
        Figure size forwarded to ``plt.figure``.
    random_state : int or None
        Seed forwarded to ``nx.spring_layout`` so the layout is reproducible.

    Raises
    ------
    ValueError
        If ``df`` yields no edge (there is nothing to scale colours or widths against).

    Notes
    -----
    Relies on ``nx`` (networkx) and ``plt`` (matplotlib.pyplot) being available in the
    notebook namespace.
    """
    G = nx.Graph()
    # Add each pair exactly once (the previous version added every edge twice).
    for node_a, node_b, value in zip(df[alg1_col], df[alg2_col], df[value_col]):
        G.add_edge(node_a, node_b, weight=round(value, 2))

    edge_data = list(G.edges(data=True))
    if not edge_data:
        raise ValueError("plot_graph needs at least one row in `df` to draw an edge")
    edgelist = [(u, v) for u, v, _ in edge_data]
    weights = [attrs["weight"] for _, _, attrs in edge_data]
    w_min, w_max = min(weights), max(weights)

    pos = nx.spring_layout(G, seed=random_state)  # positions for all nodes
    plt.figure(figsize=figsize)  # honour the caller's figsize instead of a fixed (12, 8)
    nx.draw_networkx_nodes(G, pos, node_size=700, node_color='lightblue')
    nx.draw_networkx_edges(
        G, pos, edgelist=edgelist, width=[w * 10 for w in weights],
        edge_color=weights, edge_cmap=plt.cm.Blues, edge_vmin=w_min, edge_vmax=w_max
    )
    nx.draw_networkx_labels(G, pos, font_size=12, font_family="sans-serif")

    # Stand-alone mappable so the colour bar spans exactly the plotted weight range.
    sm = plt.cm.ScalarMappable(cmap=plt.cm.Blues, norm=plt.Normalize(vmin=w_min, vmax=w_max))
    sm.set_array([])
    plt.colorbar(sm, label=f"Agreement ({value_col})", ax=plt.gca())
    plt.show()
    
# AMI agreement graph, leaving out every pair in which either side is OPIMC.
plot_graph(
    df=outputs.loc[~((outputs["alg1"] == "OPIMC") | (outputs["alg2"] == "OPIMC"))],
    alg1_col="alg1",
    alg2_col="alg2",
    value_col="AMI",
    random_state=RANDOM_STATE,
)

In [None]:
# Rows of `outputs` that have an F1 value.
outputs[outputs["F1"].notna()]

In [None]:
# For the first 1000 rows of `dataset` that are either imputed or have missing_percentage == 0,
# score the run's prediction (re-ordered by its sample index) against every entry of that row's
# `y_pred_wo_missing` list. The result holds one list of metric outputs per row.
# NOTE(review): `dataset` and the `y_pred_wo_missing` column are only created in later cells, and
# `GetMetrics` is not imported anywhere in view — this cell depends on out-of-order execution.
stability_supervised_metrics = results[(results["dataset"] == dataset) & ((results["imputation"]) | (results["missing_percentage"] == 0))].iloc[:1000].parallel_apply(
    lambda x: [GetMetrics.compute_supervised_metrics(y_true= pred, y_pred= pd.Series(x["y_pred"], index= x["y_pred_idx"]).sort_index().values,
                                                     random_state = RANDOM_STATE, n_permutations=2) for pred in x["y_pred_wo_missing"]], axis= 1)

In [None]:
# Flatten each row's list of per-run metric dicts into a single dict keyed "<metric>_<position>"
# and store it in the `stability_supervised_metrics` column of the matching rows.
results.loc[stability_supervised_metrics.index, "stability_supervised_metrics"] = stability_supervised_metrics.parallel_apply(
    lambda per_run: {
        f"{metric}_{pos}": score
        for pos, run_metrics in enumerate(per_run)
        for metric, score in run_metrics.items()
    }
)

In [None]:
# Expand the per-row dicts into a frame with one row per original index label and one column per
# "<metric>_<position>" key, then append those columns to `results`.
# NOTE(review): both names are rebound in place, so re-running this cell appends the columns again.
stability_supervised_metrics = pd.DataFrame(results.loc[stability_supervised_metrics.index, "stability_supervised_metrics"].to_dict(), columns=stability_supervised_metrics.index).T
results = pd.concat([results, stability_supervised_metrics], axis=1)

In [None]:
results

In [None]:
# NOTE(review): `x` is never assigned in any cell in view (it only appears as a lambda parameter),
# so this cell presumably raises NameError on a fresh run — looks like leftover debugging; confirm and remove.
x

In [None]:
results[(~results["stability_supervised_metrics"].isna()) & (results["dataset"] == "simulated_gm")]["missing_percentage"].unique()

In [None]:
# Same per-row stability scoring as above, restricted to one (dataset, algorithm,
# missing_percentage, amputation_mechanism) setting and using 10 permutations. The result is only
# displayed, not stored.
# NOTE(review): `alg`, `missing_percentage` and `amputation_mechanism` are not assigned in any cell
# in view, and `dataset` is only assigned further down — this cell depends on hidden kernel state.
results.loc[(results["dataset"] == dataset) &
            (results["algorithm"] == alg) &
            (results["missing_percentage"] == missing_percentage) &
            (results["amputation_mechanism"] == amputation_mechanism)].parallel_apply(
    lambda x: [GetMetrics.compute_supervised_metrics(y_true= pred, y_pred= pd.Series(x["y_pred"], index= x["y_pred_idx"]).sort_index().values,
                                                     random_state = RANDOM_STATE, n_permutations=10) for pred in x["y_pred_wo_missing"]], axis= 1)

In [None]:
results["stability_supervised_metrics"].parallel_apply(lambda x:{f"{key}_stab": value for key,value in x.items()})

In [None]:
# Reload the raw complete-results file and keep only the runs flagged both finished and completed.
results = pd.read_csv(COMPLETE_RESULTS_PATH)
results = results[results["finished"] & results["completed"]]
results

In [None]:
results.loc[(results["dataset"] == dataset) & (results["missing_percentage"] == 0), ["y_pred", "y_pred_idx"]]

In [None]:
# Start pandarallel with 5 workers, reload the complete results, keep finished+completed runs and
# turn the stringified label / index columns back into Python objects.
pandarallel.initialize(nb_workers=5)
results = pd.read_csv(COMPLETE_RESULTS_PATH)
results = results[results["finished"]]
results = results[results["completed"]]
# NOTE(review): eval() executes whatever text the CSV cells contain — acceptable only while the
# results file is trusted; consider a literal parser if that ever changes.
results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]] = results[
    ["y_true", "y_pred", "y_true_idx", "y_pred_idx"]].parallel_applymap(eval)
# Sanity check: true and predicted labels must refer to the same sample indices, row by row.
assert results["y_true_idx"].eq(results["y_pred_idx"]).all()


In [None]:
# Collect, for one dataset/algorithm, the predictions of the runs without missing data and attach
# them to `results` as `y_pred_wo_missing`.
dataset = "nutrimouse_genotype"
# NOTE(review): `alg` is not assigned in any cell in view.
preds = results.loc[(results["dataset"] == dataset) & (results["algorithm"] == alg) & (results["missing_percentage"] == 0), ["y_pred", "y_pred_idx"]]
# NOTE(review): the right-hand side is a plain list with one entry per row of `preds`; assigning it
# to a column of `results` presumably requires len(preds) == len(results) — confirm.
results["y_pred_wo_missing"] = preds.apply(lambda x: pd.Series(x["y_pred"], index= x["y_pred_idx"]).to_list(), axis= 1).to_list()
# results.loc[(results["dataset"] == dataset) & (results["missing_percentage"] == 0), ["y_pred", "y_pred_idx"]].apply(lambda x: pd.Series(x["y_pred"], index= x["y_pred_idx"]).to_list()).to_list()

## Incomplete

In [None]:
from pandarallel import pandarallel

In [None]:
pandarallel.initialize(nb_workers=5)

In [None]:
results = pd.read_csv("results/frombioint/incomplete_algorithms_evaluation_2.csv")
print("results", results.shape)
results.head()

In [None]:
# Split every finished+completed MONET run into two pseudo-algorithms and put them back into
# `results` in place of the original MONET rows:
#   * MONET_IO — NaN predictions are kept and get their own cluster code;
#   * MONET_EO — NaN predictions (and their sample indices) are dropped, and the metric columns
#     are taken from the matching "*_excluding_outliers" columns.
import numpy as np

mask = results["finished"] & (results["completed"]) & (results["algorithm"] == "MONET")

results_missing = results[mask].copy()
# Rewrite the bare "nan" tokens so that eval() below resolves them through the imported `np`.
results_missing["y_pred"] = results_missing["y_pred"].str.replace("nan", "np.nan")
# NOTE(review): eval() runs the CSV cell text as code — only safe for a trusted results file.
results_missing[["y_pred", "y_pred_idx"]] = results_missing[["y_pred", "y_pred_idx"]].parallel_applymap(lambda x: np.array(eval(x)))
results_missing.loc[:, "algorithm"] = "MONET_IO"
# use_na_sentinel=False makes NaN a regular category instead of the -1 sentinel.
results_missing["y_pred"] = results_missing["y_pred"].apply(lambda x: pd.factorize(x, use_na_sentinel=False)[0].tolist())

results_excluding_outliers = results[mask].copy()
results_excluding_outliers["y_pred"] = results_excluding_outliers["y_pred"].str.replace("nan", "np.nan")
results_excluding_outliers[["y_pred", "y_pred_idx"]] = results_excluding_outliers[["y_pred", "y_pred_idx"]].applymap(lambda x: np.array(eval(x)))
results_excluding_outliers.loc[:, "algorithm"] = "MONET_EO"
metrics_col = ['silhouette', 'vrc', 'db', 'dbcv', 'dunn', "dhi", "ssei", 'rsi', 'bhi']
# Overwrite the regular metric columns with their "<metric>_excluding_outliers" counterparts.
results_excluding_outliers[metrics_col] = results_excluding_outliers[[f"{met}_excluding_outliers" for met in metrics_col]]
# Keep only positions whose prediction is not NaN, in both the label array and the index array.
results_excluding_outliers[["y_pred", "y_pred_idx"]] = results_excluding_outliers[["y_pred", "y_pred_idx"]].apply(
    lambda row: (row["y_pred"][~np.isnan(row["y_pred"])].astype(int).tolist(), row["y_pred_idx"][~np.isnan(row["y_pred"])].astype(int).tolist()), axis=1, result_type='expand')

# Both variants are copies of the same rows and pd.concat is called without ignore_index, so the
# combined frame carries each index label twice.
results_missing = pd.concat([results_missing, results_excluding_outliers])
# Back to strings so these rows look like the untouched rows read from the CSV.
results_missing[["y_pred", "y_pred_idx"]] = results_missing[["y_pred", "y_pred_idx"]].parallel_applymap(str)
results = pd.concat([results.loc[~mask], results_missing])

In [None]:
print("results", results.shape)


In [None]:
results2 = pd.read_csv("results/merge/boz_bioint_inc.csv")
print("results2", results2.shape)
results2.head()

In [None]:
# Columns that together identify a single run.
cols = ["dataset", "algorithm", "missing_percentage", "amputation_mechanism", "imputation", "run_n"]
# Count the rows whose key columns are equal in `results` and `results2`.
# NOTE(review): DataFrame.eq pairs rows by index label, not by key value, so this is not a count of
# runs present in both files; a merge on `cols` would be needed for that — confirm the intent.
results[cols].eq(results2[cols]).all(1).sum()

In [None]:
results[results[cols].eq(results2[cols]).all(1)]

In [None]:
results = pd.concat([results, results2])
print("results", results.shape)
results.head()

In [None]:
results[results["finished"] & (~results["completed"])]

In [None]:
results.loc[results["finished"] & (~results["completed"])]["comments"].value_counts()

In [None]:
results.loc[results["finished"] & (~results["completed"]), "finished"] = False

In [None]:
# `results2` has already been appended to `results` by an earlier cell; appending it again here
# duplicated every one of its rows, so only the flag clean-up is kept.
# Runs that finished without completing are treated as not finished.
results.loc[results["finished"] & (~results["completed"]), "finished"] = False
# results.to_csv("results/merge/boz_bioint_inc.csv", index= None)

In [None]:
results.loc[results["finished"] & (~results["completed"]), "finished"] = False
results[results["finished"] & (~results["completed"])]

In [None]:
print("dataset count")
print(results["dataset"].value_counts())
print()
print("algorithm count")
print(results["algorithm"].value_counts())

In [None]:
results = results[results["finished"]]
print("results", results.shape)

In [None]:
print("dataset count")
print(results["dataset"].value_counts())
print()
print("algorithm count")
print(results["algorithm"].value_counts())

In [None]:
# Reduce each failure comment to its leading label: the text before the first ": ", with a leading
# "{", a leading "'" and a trailing "'" stripped.
# NOTE(review): str.removeprefix / removesuffix need Python >= 3.9, and a non-string comment
# (e.g. NaN) would raise inside the lambda — confirm every failed run has a comment.
errors = results[~results["completed"]]["comments"].parallel_map(lambda x:x.split(": ")[0].removeprefix('{').removeprefix("'").removesuffix("'"))

In [None]:
errors.value_counts()

In [None]:
results = results[results["completed"]]
print("results", results.shape)

In [None]:
print("dataset count")
print(results["dataset"].value_counts())
print()
print("algorithm count")
print(results["algorithm"].value_counts())

In [None]:
results["dataset"].value_counts()

In [None]:
results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]] = results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]].parallel_applymap(eval)
assert results["y_true_idx"].eq(results["y_pred_idx"]).all()

In [None]:
results = results[results["missing_percentage"] == 0]
# results = results.iloc[:100]
results.head()

In [None]:
supervised_metrics = results[["y_true", "y_pred"]].parallel_apply(
    lambda row: GetMetrics.compute_supervised_metrics(y_true=row["y_true"], y_pred=row["y_pred"], random_state= RANDOM_STATE), axis=1)

In [None]:
results = pd.concat([results, pd.DataFrame(supervised_metrics.to_dict()).T], axis= 1)
print("results", results.shape)
results.head()

In [None]:
# Collapse repeated runs: mean and standard deviation of every float column per setting.
indexes_names = ["dataset", "algorithm", "missing_percentage", "amputation_mechanism", "imputation"]
results = (
    results[results.select_dtypes(include="float").columns.to_list() + indexes_names]
    .groupby(indexes_names, sort=False)
    .agg(["mean", "std"])
    .reset_index()
)
# Flatten the (column, statistic) header into "<column>_<statistic>" names.
results.columns = results.columns.map("_".join).str.strip("_")
# `size` is the negative natural log of the mean MCC p-value.
results["size"] = -np.log(results["MCC (p-value)_mean"])
print("results", results.shape)
results.head()

In [None]:
results.to_csv(INCOMPLETE_METRICS_PATH, index= None)

In [None]:
# Right-join onto the full dataset x algorithm grid so that combinations without results appear as
# rows with missing metrics, then fill the listed metric columns one at a time with a KNN imputer
# whose features are the one-hot dataset/algorithm indicators plus the column being filled.
# NOTE(review): `OneHotEncoder`, `KNNImputer` and `INCOMPLETE_INMETRICS_PATH` are not imported or
# defined in any cell in view.
results = pd.merge(results, pd.DataFrame(itertools.product(results["dataset"].unique(), results["algorithm"].unique()), columns = ["dataset", "algorithm"]), how= "right")
res = OneHotEncoder(sparse_output= False).set_output(transform= "pandas").fit_transform(results[["dataset", "algorithm"]])
for col in ["silhouette_mean", "silhouette_std", "MCC_mean", "MCC_std", "MCC (p-value)_mean", "MCC (p-value)_std"]:
    # Temporarily add the target column to the feature frame, impute it, then remove it again so
    # the next column is imputed from the one-hot indicators only.
    res[col] = results[col]
    results[col] = KNNImputer().set_output(transform= "pandas").fit_transform(X= res)[col]
    res = res.drop(columns=col)
# Recompute `size` because the p-value column now contains imputed values.
results["size"] = results["MCC (p-value)_mean"].apply(lambda x: -np.log(x))
results.to_csv(INCOMPLETE_INMETRICS_PATH, index= None)

In [None]:
# Load the raw incomplete-algorithm results from the configured results folder.
filename = "incomplete_algorithms_evaluation.csv"
file_path = os.path.join(folder_results, filename)
results = pd.read_csv(file_path)
print("results", results.shape)
results.head()

In [None]:
results = results[results["finished"]]
print("results", results.shape)

In [None]:
results = results[results["completed"]]
print("results", results.shape)

In [None]:
results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]] = results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]].applymap(eval)
assert results["y_true_idx"].eq(results["y_pred_idx"]).all()
# results["y_true_final"] = results.apply(lambda x: pd.Series(eval(x["y_true"]), index= eval(x["y_true_idx"])).to_list(), axis= 1)
# results["y_pred_final"] = results.apply(lambda x: pd.Series(eval(x["y_pred"]), index= eval(x["y_pred_idx"])).to_list(), axis= 1)

In [None]:
# One row per run and one column per sample index, holding the true label.
# `y_true` / `y_true_idx` were already parsed into Python objects by the preceding cell, so they
# are used directly: calling eval() on the parsed lists again raised TypeError.
a = results.apply(lambda x: pd.Series(x["y_true"], index= x["y_true_idx"]), axis= 1)
a

In [None]:
# Compute the supervised MCC for every run and store the result in two new columns.
# NOTE(review): `GetMetrics` is not imported in view. The two-column assignment presumably relies on
# compute_mcc returning a two-item Series per row; a plain tuple would need result_type="expand" — confirm.
results[["MCC_supervised", "MCC_supervised_pvalue"]] = results[["y_true", "y_pred"]].apply(lambda x: GetMetrics.compute_mcc(y_true= x["y_true"], y_pred= x["y_pred"]), axis= 1)

In [None]:
# Keep completed runs, tag each dataset as "synthetic" (name starts with "simulated") or "real",
# and rename the two MCC columns to their display names.
results = results[results["completed"]]
mask = results["dataset"].str.startswith("simulated")
# where() turns the False entries into "real"; mask() then turns the True entries into "synthetic".
results["dataset_type"] = mask.where(mask, "real").mask(mask, "synthetic")
# NOTE(review): `original_mcc_supervised`, `renamed_mcc_supervised`, `original_mcc_unsupervised` and
# `renamed_mcc_unsupervised` are not defined in any cell in view.
results = results.rename(columns = {original_mcc_supervised: renamed_mcc_supervised, original_mcc_unsupervised: renamed_mcc_unsupervised})
print("results", results.shape)

In [None]:
# Summary of the runs without missing data: mean and standard deviation of the MCC metrics per
# (dataset, algorithm, imputation, dataset_type), plus one colour per algorithm.
# NOTE(review): `renamed_mcc_metrics` and `sns` are not defined/imported in any cell in view.
nomissing_results = results.copy()
nomissing_results = nomissing_results[nomissing_results["missing_percentage"] == 0]
nomissing_results = nomissing_results.groupby(["dataset", "algorithm", "imputation", "dataset_type"], sort=False)[renamed_mcc_metrics]
# Means first, standard deviations second, side by side.
nomissing_results = pd.concat([nomissing_results.mean(), nomissing_results.std()], axis= 1)
# The first two columns get "_avg" and the remaining ones "_std" — assumes `renamed_mcc_metrics`
# holds exactly two columns; TODO confirm.
nomissing_results.columns = (nomissing_results.columns[:2] + "_avg").to_list() + (nomissing_results.columns[2:] + "_std").to_list()
nomissing_results = nomissing_results.reset_index()
# Map every algorithm to a hex colour from the default seaborn palette.
color_dict = {alg:col for alg,col in zip(nomissing_results["algorithm"].unique(), list(sns.color_palette(None, nomissing_results["algorithm"].nunique()).as_hex()))}
nomissing_results.loc[:,"color"] = nomissing_results["algorithm"].apply(lambda x: color_dict[x])
print("nomissing_results", nomissing_results.shape)
nomissing_results.head()

In [None]:
px.scatter(nomissing_results, x="algorithm", y= renamed_avg_supervised, error_y= renamed_stability_supervised, template= "simple_white", facet_col= "dataset",
           facet_col_wrap=3, width=1400, height=1000, color= "algorithm", labels= labels_dict)

In [None]:
px.scatter(nomissing_results, x="algorithm", y= renamed_avg_supervised, error_y= renamed_stability_supervised, template= "simple_white", facet_col= "dataset",
           facet_col_wrap=3, width=1400, height=1000, color= "algorithm", labels= labels_dict)

In [None]:
# --- Rank the algorithms inside every dataset -------------------------------------------------
# The previous loop iterated over every *row's* dataset (repeating the same work once per row) and
# sorted before ranking, which has no effect on the rank values. A grouped rank gives the same
# per-dataset ranks in one pass (pandas default: ascending, ties averaged).
group_keys = ["algorithm", "imputation", "dataset_type"]
nomissing_results["rank"] = nomissing_results.groupby("dataset", sort=False)[renamed_avg_supervised].rank()
# Mean rank per (algorithm, imputation, dataset_type), kept together with its keys so it can be
# joined by key below instead of being attached positionally from a bare list.
rank = nomissing_results.groupby(group_keys, sort=False)["rank"].mean().reset_index()

# --- Same summary as `nomissing_results`, pooled over datasets --------------------------------
by_datasettype_nomissing_results = results.copy()
by_datasettype_nomissing_results = by_datasettype_nomissing_results[by_datasettype_nomissing_results["missing_percentage"] == 0]
by_datasettype_nomissing_results = by_datasettype_nomissing_results.groupby(group_keys, sort=False)[renamed_mcc_metrics]
by_datasettype_nomissing_results = pd.concat([by_datasettype_nomissing_results.mean(), by_datasettype_nomissing_results.std()], axis= 1)
# The first two columns get "_avg" and the remaining ones "_std" — assumes `renamed_mcc_metrics`
# holds exactly two columns; TODO confirm.
by_datasettype_nomissing_results.columns = (by_datasettype_nomissing_results.columns[:2] + "_avg").to_list() + (by_datasettype_nomissing_results.columns[2:] + "_std").to_list()
by_datasettype_nomissing_results = by_datasettype_nomissing_results.reset_index()
color_dict = {alg:col for alg,col in zip(by_datasettype_nomissing_results["algorithm"].unique(), list(sns.color_palette(None, by_datasettype_nomissing_results["algorithm"].nunique()).as_hex()))}
by_datasettype_nomissing_results.loc[:,"color"] = by_datasettype_nomissing_results["algorithm"].apply(lambda x: color_dict[x])
# Key-based join: each group receives its own mean rank regardless of the order in which the two
# independent group-bys happened to emit their groups.
by_datasettype_nomissing_results = by_datasettype_nomissing_results.merge(rank, on=group_keys, how="left")
by_datasettype_nomissing_results["size"] = 1
print("by_datasettype_nomissing_results", by_datasettype_nomissing_results.shape)
by_datasettype_nomissing_results.head()

In [None]:
px.scatter(by_datasettype_nomissing_results, x="algorithm", y= renamed_avg_supervised, error_y= renamed_stability_supervised,
           template= "simple_white", facet_col = "dataset_type", size= "rank", color= "algorithm", labels= labels_dict)

In [None]:
px.scatter(by_datasettype_nomissing_results, x="rank", y= renamed_avg_supervised, error_y= renamed_stability_supervised, template= "simple_white", size= "size",
           facet_col = "dataset_type", text="algorithm", color= "algorithm", labels= labels_dict)

In [None]:
missing_results = results.copy()
missing_results = missing_results[missing_results["missing_percentage"] > 0]
missing_results = missing_results.groupby(["algorithm", "missing_percentage", "amputation_mechanism", "imputation", "dataset_type"], sort=False)[renamed_mcc_metrics]
missing_results = pd.concat([missing_results.mean(), missing_results.std()], axis= 1)
missing_results.columns = (missing_results.columns[:2] + "_avg").to_list() + (missing_results.columns[2:] + "_std").to_list()
missing_results = missing_results.reset_index()
color_dict = {alg:col for alg,col in zip(missing_results["algorithm"].unique(), list(sns.color_palette(None, missing_results["algorithm"].nunique()).as_hex()))}
missing_results.loc[:,"color"] = missing_results["algorithm"].apply(lambda x: color_dict[x])
# missing_results[renamed_stability_supervised] /= 5
# missing_results[renamed_stability_unsupervised] /= 2
missing_results["size"] = 1
print("missing_results", missing_results.shape)
missing_results.head()

In [None]:
fig = px.scatter(missing_results, x= renamed_avg_supervised, y= renamed_avg_unsupervised, animation_frame="missing_percentage", animation_group="algorithm",
                 size= "size", text="algorithm", color= "imputation", error_x=  renamed_stability_supervised,
                 error_y= renamed_stability_unsupervised, facet_row = "amputation_mechanism", facet_col = "dataset_type",
                 range_x=[0,1], range_y=[0,1], width=1400, height=2000, title= "Clustering performance on incomplete multi-view datasets",
                 template= "simple_white", labels= labels_dict)
fig

In [None]:
fig.write_html("test.html")

In [None]:
px.scatter(missing_results[missing_results["amputation_mechanism"] == "EDM"], x= renamed_avg_supervised, 
                 y= renamed_avg_unsupervised, animation_frame="missing_percentage", animation_group="algorithm",
                 size= "size", text="algorithm", color= "imputation", error_x=  renamed_stability_supervised,
                 error_y= renamed_stability_unsupervised, facet_row = "amputation_mechanism", facet_col = "dataset_type",
                 range_x=[0,1], range_y=[0,1], height=600, title= "Clustering performance on incomplete multi-view datasets",
                 template= "simple_white", labels= labels_dict)

In [None]:
# Summary of the runs with missing data (missing_percentage > 0): mean and standard deviation of
# the supervised metrics per (algorithm, missing_percentage, amputation_mechanism, imputation,
# dataset_type), plus one colour per algorithm.
#
# An earlier draft of this cell built the same frame first and then looped over a "dataset" column
# that this grouping never produces (KeyError); its result was thrown away anyway because the
# frame was rebuilt from scratch right after. Only the effective computation is kept.
# NOTE(review): `supervised_metrics` must be a list of column names here, while other cells bind
# that name to a per-row metrics result — confirm which object is in scope when this runs.
missing_results = results.copy()
missing_results = missing_results[missing_results["missing_percentage"] > 0]
missing_results = missing_results.groupby(["algorithm", "missing_percentage", "amputation_mechanism", "imputation", "dataset_type"], sort=False)[supervised_metrics]
missing_results = pd.concat([missing_results.mean(), missing_results.std()], axis= 1)
# The first two columns get "_avg" and the remaining ones "_std" — assumes exactly two metric
# columns were selected; TODO confirm.
missing_results.columns = (missing_results.columns[:2] + "_avg").to_list() + (missing_results.columns[2:] + "_std").to_list()
missing_results = missing_results.reset_index()
color_dict = {alg:col for alg,col in zip(missing_results["algorithm"].unique(), list(sns.color_palette(None, missing_results["algorithm"].nunique()).as_hex()))}
missing_results.loc[:,"color"] = missing_results["algorithm"].apply(lambda x: color_dict[x])
# NOTE(review): the divisors 10 and 5 shrink the standard deviations by fixed, unexplained factors
# (presumably to keep error bars readable) — confirm and move them to named constants.
missing_results["MCC_std"] = missing_results["MCC_std"]/10
missing_results["MCC_performance_std"] = missing_results["MCC_performance_std"]/5
print("missing_results", missing_results.shape)
missing_results.head()

## Complete

In [None]:
# Run the full cluster-evaluation export with 10000 permutations and keep its return value.
# NOTE(review): this cell sits under the "## Complete" heading but is given the INCOMPLETE_* paths —
# confirm which result set is meant. `GetMetrics` and `INCOMPLETE_INMETRICS_PATH` are not
# imported or defined in any cell in view.
outputs = GetMetrics.save_cluster_evaluation(INCOMPLETE_RESULTS_PATH, INCOMPLETE_METRICS_PATH, INCOMPLETE_INMETRICS_PATH, random_state = RANDOM_STATE,
                                             n_permutations=10000, verbose= True)

In [None]:
!pip install statsmodels

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
multipletests([0.1, 0.3, 0.5, 0.0001])[1]

In [None]:
results = pd.read_csv(COMPLETE_RESULTS_PATH)
print("results", results.shape)
results.head()

In [None]:
results = results[results["finished"]]
print("results", results.shape)

In [None]:
# Reduce each failure comment to its leading label: the comment is evaluated into a dict, its
# first key is taken, and the text before the first ": " is kept.
# NOTE(review): eval() runs the CSV cell text as code — only safe for a trusted results file; a
# non-string comment (e.g. NaN) would raise inside the lambda.
errors = results[~results["completed"]]["comments"].parallel_map(lambda x: list(eval(x).keys())[0].split(": ")[0])

In [None]:
errors.value_counts()

In [None]:
results = results[results["completed"]]
print("results", results.shape)

In [None]:
results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]] = results[["y_true", "y_pred", "y_true_idx", "y_pred_idx"]].parallel_applymap(eval)
assert results["y_true_idx"].eq(results["y_pred_idx"]).all()

In [None]:
results = results[results["missing_percentage"] == 0]
# results = results.iloc[:100]
results

In [None]:
supervised_metrics = results[["y_true", "y_pred"]].parallel_apply(
    lambda row: GetMetrics.compute_supervised_metrics(y_true=row["y_true"], y_pred=row["y_pred"], random_state= RANDOM_STATE), axis=1)
supervised_metrics

In [None]:
results = pd.concat([results, pd.DataFrame(supervised_metrics.to_dict()).T], axis= 1)
print("results", results.shape)
results.head()

In [None]:
# Collapse repeated runs: mean and standard deviation of every float column per setting.
indexes_names = ["dataset", "algorithm", "missing_percentage", "amputation_mechanism", "imputation"]
results = (
    results[results.select_dtypes(include="float").columns.to_list() + indexes_names]
    .groupby(indexes_names, sort=False)
    .agg(["mean", "std"])
    .reset_index()
)
# Flatten the (column, statistic) header into "<column>_<statistic>" names.
results.columns = results.columns.map("_".join).str.strip("_")
# Adjust the mean MCC p-values with false_discovery_control, then store -log10 of the result.
results["padj"] = false_discovery_control(results["MCC (p-value)_mean"])
results["log_padj"] = -np.log10(results["padj"])
print("results", results.shape)
results.head()

In [None]:
results.to_csv(COMPLETE_METRICS_PATH, index= None)

In [None]:
# Right-join onto the full dataset x algorithm grid so that combinations without results appear as
# rows with missing metrics, then fill the listed metric columns one at a time with a KNN imputer
# whose features are the one-hot dataset/algorithm indicators plus the column being filled.
# NOTE(review): `OneHotEncoder`, `KNNImputer`, `false_discovery_control` and
# `COMPLETE_INMETRICS_PATH` are not imported or defined in any cell in view.
results = pd.merge(results, pd.DataFrame(itertools.product(results["dataset"].unique(), results["algorithm"].unique()), columns = ["dataset", "algorithm"]), how= "right")
res = OneHotEncoder(sparse_output= False).set_output(transform= "pandas").fit_transform(results[["dataset", "algorithm"]])
for col in ["silhouette_mean", "silhouette_std", "MCC_mean", "MCC_std", "MCC (p-value)_mean", "MCC (p-value)_std"]:
    # Temporarily add the target column to the feature frame, impute it, then remove it again so
    # the next column is imputed from the one-hot indicators only.
    res[col] = results[col]
    results[col] = KNNImputer().set_output(transform= "pandas").fit_transform(X= res)[col]
    res = res.drop(columns=col)
# Recompute the adjusted p-values because the p-value column now contains imputed values.
results["padj"] = false_discovery_control(results["MCC (p-value)_mean"])
results["log_padj"] = results["padj"].apply(lambda x: -np.log10(x))
results.to_csv(COMPLETE_INMETRICS_PATH, index= None)

In [None]:
# AMI vs MCC per dataset, one marker per algorithm.
# The aggregation above produces "<metric>_mean" / "<metric>_std" columns and "padj" / "log_padj";
# it never creates "size", "ami_var" or "MCC_var", so the figure now reads the columns that exist:
# marker size from "log_padj" and error bars from the standard deviations.
# NOTE(review): `px` and `labels_dict` are not defined in any cell in view; confirm that
# "log_padj" is finite for every row, since it is used as the marker size.
fig = px.scatter(results.reset_index(), x= "ami_mean", y= "MCC_mean", size= "log_padj", text="algorithm", color= "algorithm", facet_col = "dataset", facet_col_wrap= 3,
                 range_x=[0,1.1], range_y=[0,1.1], width=1500, height=500, title= "Clustering performance on incomplete multi-view datasets",
                 template= "simple_white", labels= labels_dict, error_x="ami_std", error_y="MCC_std")
fig