## Aggregating results into a DataFrame

In [3]:
import os
import lib
import numpy as np
import pandas as pd
import json
from pathlib import Path

# Root folder of all experiment outputs, read from secrets.json so that the
# location is not hardcoded in the notebook. The context manager guarantees the
# file handle is closed again (a bare json.load(open(...)) leaves it open).
with open("secrets.json", "r") as secrets_file:
    base_path = json.load(secrets_file)["Experiment_Folder"]

# Method name -> folder (relative to base_path) containing that method's
# final evaluation files.
method2exp = {
    "real": "adult/20_12_2022-REAL-BASELINE/outputs/exp/adult/ddpm_real/final_eval/",
    "tab-ddpm": "adult/21_12_2022-identity-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_identity_best/final_eval/",
    "tab-ddpm-bgm": "adult/20_12_2022-bgm-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_bgm_best/final_eval/",
    # "smote": "exp/{}/smote/",
    # "ctabgan+": "exp/{}/ctabgan-plus/",
    # "ctabgan": "exp/{}/ctabgan/",
    # "tvae": "exp/{}/tvae/"
}
# Resolve every relative folder against base_path. A new dict is built instead
# of rewriting the values of the dict that is currently being iterated.
method2exp = {
    method: Path(os.path.join(base_path, rel_dir))
    for method, rel_dir in method2exp.items()
}

# File names looked up inside each method's final_eval folder.
eval_file = "eval_catboost.json"
sim_file = "results_similarity.json"
# When True, "+-<std>" is appended to every formatted evaluation metric.
show_std = False
columns = ["method"]
# Loop-invariant column lists (hoisted out of the per-method loop).
metrics = ["acc", "f1", "roc_auc"]
sim_metrics = ["score", "basic_score", "ml_score", "sup_score", "pmse_score"]


def _format_float(value):
    """Render floats with four decimals; return any other value unchanged."""
    return f'{value:.4f}' if isinstance(value, float) else value


# One single-row DataFrame per method: the method name, the formatted
# evaluation metrics and the formatted similarity scores. A missing result
# file yields "---" placeholders for the corresponding columns.
df = []
for algo in method2exp:
    base_df = pd.DataFrame([algo], columns=["method"])

    # --- evaluation metrics ---
    eval_path = method2exp[algo] / eval_file
    if not eval_path.exists():
        print(f"File {eval_file} not found for {algo}")
        metric_df = pd.DataFrame([["---"] * len(metrics)], columns=metrics)
    else:
        res_dict = lib.load_json(eval_path)
        # The "real" method reads its scores from the "real" entry of the
        # file; every other method reads them from the "synthetic" entry.
        test_scores = res_dict["real" if algo == "real" else "synthetic"]["test"]
        formatted = []
        for metric in metrics:
            res = f'{test_scores[metric + "-mean"]:.4f}'
            if show_std:
                res += f'+-{test_scores[metric + "-std"]:.4f}'
            formatted.append(res)
        metric_df = pd.DataFrame([formatted], columns=metrics)

    # --- similarity scores ---
    sim_path = method2exp[algo] / sim_file
    if not sim_path.exists():
        print(f"File {sim_file} not found for {algo}")
        sim_df = pd.DataFrame([["---"] * len(sim_metrics)], columns=sim_metrics)
    else:
        sim_res_dict = lib.load_json(sim_path)
        sim_df = pd.DataFrame([sim_res_dict["sim_score"]], columns=sim_metrics)
    # Rename "score" to "sim_score" in the combined table.
    sim_df = sim_df.rename(columns={"score": "sim_score"})
    # Format all floats as :.4f. Column-wise Series.map is used instead of
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    sim_df = sim_df.apply(lambda column: column.map(_format_float))

    base_df = pd.concat([base_df, metric_df, sim_df], axis=1)
    df.append(base_df)


# When True, every metric column is converted to numbers and followed by a
# "<metric>-diff" column holding its difference to the "real" row.
calculate_diff = True
# ignore_index gives each method its own row label; without it every row
# keeps the label 0 of its single-row source frame.
results = pd.concat(df, axis=0, ignore_index=True)
metrics = results.columns.tolist()
metrics.remove("method")
cols = ["method"]
if calculate_diff:
    is_baseline = results["method"] == "real"
    if not is_baseline.any():
        raise ValueError('No "real" row in results; cannot compute differences to the baseline.')
    for metric in metrics:
        # "---" marks a metric whose result file was missing. Turn it into NaN
        # before converting; any other unparsable value still raises, so that
        # unexpected cell contents are not silently dropped.
        is_missing = results[metric] == "---"
        results[metric] = pd.to_numeric(results[metric].mask(is_missing))
        baseline = results.loc[is_baseline, metric].iloc[0]
        results[f"{metric}-diff"] = results[metric] - baseline
        cols.append(metric)
        cols.append(f"{metric}-diff")
    # Place every "-diff" column directly after its metric.
    results = results.reindex(cols, axis=1)

results

Unnamed: 0,method,acc,acc-diff,f1,f1-diff,roc_auc,roc_auc-diff,sim_score,sim_score-diff,basic_score,basic_score-diff,ml_score,ml_score-diff,sup_score,sup_score-diff,pmse_score,pmse_score-diff
0,real,0.8742,0.0,0.8152,0.0,0.9276,0.0,0.9598,0.0,0.9922,0.0,0.9975,0.0,0.9839,0.0,0.882,0.0
0,tab-ddpm,0.8598,-0.0144,0.7941,-0.0211,0.9128,-0.0148,0.7586,-0.2012,0.973,-0.0192,0.9923,-0.0052,0.8741,-0.1098,0.0349,-0.8471
0,tab-ddpm-bgm,0.8632,-0.011,0.7985,-0.0167,0.9165,-0.0111,0.7418,-0.218,0.9642,-0.028,0.9955,-0.002,0.8307,-0.1532,0.0004,-0.8816


In [None]:
# DATASETS = [
#     "abalone",
#     "adult",
#     "buddy",
#     "california",
#     "cardio",
#     "churn2",
#     "default",
#     "diabetes",
#     "fb-comments",
#     "gesture",
#     "higgs-small",
#     "house",
#     "insurance",
#     "king",
#     "miniboone",
#     "wilt"
# ]

# _REGRESSION = [
#     "abalone",
#     "california",
#     "fb-comments",
#     "house",
#     "insurance",
#     "king",
# ]

# method2exp = {
#     "real": "adult/20_12_2022-REAL-BASELINE/outputs/adult/ddpm_real/final_eval/",
#     "tab-ddpm-bgm": "adult/20_12_2022-bgm-50optuna-ts26048-catboost-tune-CatboostAndSimilarityEval-syntheticEval/outputs/exp/adult/ddpm_bgm_best/final_eval/",
#     # "tab-ddpm": "exp/{}/ddpm_cb_best/",
#     # "smote": "exp/{}/smote/",
#     # "ctabgan+": "exp/{}/ctabgan-plus/",
#     # "ctabgan": "exp/{}/ctabgan/",
#     # "tvae": "exp/{}/tvae/"
# }

# eval_file = "eval_catboost.json"
# sim_file = "results_similarity.json"
# show_std = False
# df = pd.DataFrame(columns=["method"] + [_[:3].upper() for _ in DATASETS])

# for algo in method2exp: 
#     algo_res = []
#     for ds in DATASETS:
#         if not os.path.exists(os.path.join(method2exp[algo].format(ds), eval_file)):
#             algo_res.append("--")
#             continue
#         metric = "r2" if ds in _REGRESSION else "f1"
#         res_dict = lib.load_json(os.path.join(method2exp[algo].format(ds), eval_file))

#         if algo == "real":
#             res = f'{res_dict["real"]["test"][metric + "-mean"]:.4f}' 
#             if show_std: res += f'+-{res_dict["real"]["test"][metric + "-std"]:.4f}'
#         else:
#             res = f'{res_dict["synthetic"]["test"][metric + "-mean"]:.4f}'
#             if show_std: res += f'+-{res_dict["synthetic"]["test"][metric + "-std"]:.4f}'

#         algo_res.append(res)
#     df.loc[len(df)] = [algo] + algo_res