In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import seaborn as sns
from pathlib import Path
import warnings

from critdd import Diagram
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", FutureWarning)

# Global figure appearance shared by every plot in the notebook:
# enlarged fonts, LaTeX text rendering, white grid background,
# serif font family and a wide, short default figure size.
sns.set(
    font_scale=1.5,
    rc={"text.usetex": True},
)
sns.set_style("whitegrid")
plt.rc("font", family="serif")
plt.rcParams.update({"figure.figsize": (12, 3)})

In [3]:
def load_results(folder, folds=None):
    """Load and merge every non-empty ``*.csv`` result file found in ``folder``.

    Parameters
    ----------
    folder : str or path-like
        Directory that is searched (non-recursively) for ``*.csv`` files.
        Zero-byte files are skipped.
    folds : int, optional
        When given, only rows with ``fold < folds`` are kept.  When ``None``
        (the default) rows of all folds are returned.

    Returns
    -------
    pandas.DataFrame
        The concatenated results without the ``params`` column, with the
        sign of ``test_brier`` / ``train_brier`` flipped, and with
        ``clf_variant`` replaced by ``"<clf>_<clf_variant>"`` (just
        ``"<clf>"`` when the variant is missing).

    Raises
    ------
    ValueError
        If ``folder`` contains no non-empty CSV file.
    """
    # Sort so the row order does not depend on the file system's listing order.
    csv_paths = sorted(p for p in Path(folder).glob("*.csv") if p.stat().st_size > 0)
    if not csv_paths:
        raise ValueError(f"no non-empty CSV files found in {str(folder)!r}")

    df = pd.concat((pd.read_csv(p) for p in csv_paths), ignore_index=True)
    df = df.drop("params", axis=1)

    # Flip the sign of the Brier columns
    # (presumably stored negated, scorer-style -- TODO confirm).
    df["test_brier"] = -df["test_brier"]
    df["train_brier"] = -df["train_brier"]

    # The original one-liner `df[df.fold<folds if folds is not None else np.inf]`
    # evaluated `df[np.inf]` for folds=None and raised a KeyError.
    if folds is not None:
        # .copy() so the column assignment below writes to an independent frame.
        df = df[df.fold < folds].copy()

    # "<clf>_<variant>"; a missing variant becomes "" and the then-dangling
    # underscore(s) are stripped from the right.
    df["clf_variant"] = df.clf.str.cat(df.clf_variant, sep="_", na_rep="").str.rstrip("_")
    return df
# Results stored in "results_90s_miss" (first 25 folds).  Its
# "msl_prebin_miss" rows are relabelled "msl_prebin_miss90" so they stay
# distinguishable from the rows of the same variant loaded from "results".
df = load_results("results_90s_miss",25)
df.loc[df.clf_variant == "msl_prebin_miss", "clf_variant"] = "msl_prebin_miss90"

# Best effort: add the "msl_prebin_miss" rows from the "results" folder.
# A failure is reported instead of being swallowed silently, because without
# these rows "msl_prebin_miss" is absent from `df` altogether.
try:
    df2 = load_results("results",25)
except Exception as exc:  # keep going, but never hide the reason
    warnings.warn(
        f'could not load "results" ({exc!r}); '
        'continuing without its "msl_prebin_miss" rows'
    )
else:
    df = pd.concat((df, df2.loc[df2.clf_variant == "msl_prebin_miss"]))

In [6]:
# Dataset identifiers included in the comparison
# (presumably OpenML dataset ids -- TODO confirm).
datasets = [
    42900,
    41945,
    37,
]

# Classifier variants (values of the `clf_variant` column) to compare.
clfs = [
    "psl_prebin",
    "msl_prebin",
    "msl_prebin_miss",
    "msl_prebin_nb",
]

In [9]:
metric = "test_bacc"

# Mean `metric` per (dataset, fold, classifier variant), restricted to the
# selected datasets / classifiers and to rows that have a `stage` value.
df_ = df[(df.dataset.isin(datasets))
    & (df.clf_variant.isin(clfs))
    & (~df.stage.isnull())][["dataset", "fold","clf_variant", metric]]\
        .groupby(["dataset", "fold","clf_variant"]).mean().reset_index()

# One block per (dataset, fold) pair, identified by "<dataset>_<fold>".
df_["key"] = df_["dataset"].astype(str) + "_" + df_["fold"].astype(str)
display(df_)

# Wide format: one row per block, one column per classifier variant.
df_ = df_.pivot(
    index = "key",
    columns = "clf_variant",
    values = metric
)

# A block that lacks a score for any classifier holds NaN after the pivot.
# The diagram is built from per-block ranks, which are not meaningful for
# NaN, so incomplete blocks are removed (and reported) before the analysis.
incomplete = df_.isna().any(axis=1)
if incomplete.any():
    print(f"dropping {int(incomplete.sum())} incomplete block(s): {list(df_.index[incomplete])}")
    df_ = df_.loc[~incomplete]

# create a CD diagram from the Pandas DataFrame
diagram = Diagram(
    df_.to_numpy(),
    treatment_names = df_.columns,
    maximize_outcome = True
)
print(diagram.average_ranks) # the average rank of each treatment
print(diagram.get_groups(alpha=.05, adjustment="holm"))

# export the diagram to a file (make sure the target directory exists first)
Path("fig").mkdir(parents=True, exist_ok=True)
diagram.to_file(
    "fig/binary_bacc_cd.pdf",
    alpha = .05,
    adjustment = "holm",
    reverse_x = True,
    axis_options = {"title": "Critical Difference"},
    as_document=False,
)


Unnamed: 0,dataset,fold,clf_variant,test_bacc,key
0,37,0,msl_prebin,0.255898,37_0
1,37,0,msl_prebin_miss,0.226720,37_0
2,37,0,msl_prebin_nb,0.285496,37_0
3,37,0,psl_prebin,0.248055,37_0
4,37,1,msl_prebin,0.291898,37_1
...,...,...,...,...,...
295,42900,23,psl_prebin,0.375815,42900_23
296,42900,24,msl_prebin,0.265241,42900_24
297,42900,24,msl_prebin_miss,0.252406,42900_24
298,42900,24,msl_prebin_nb,0.158289,42900_24


[3.33333333 2.34666667 2.10666667 2.21333333]
[array([0]), array([1, 2, 3])]
