In [None]:
# Software Name : mislabeled-benchmark
# SPDX-FileCopyrightText: Copyright (c) Orange Innovation
# SPDX-License-Identifier: MIT
# 
# This software is distributed under the MIT license,
# see the "LICENSE.md" file for more details
# or https://github.com/Orange-OpenSource/mislabeled-benchmark/blob/master/LICENSE.md

import os
import math
from define_models import baselines
import matplotlib.pyplot as plt
from IPython.display import display
from critdd import Diagram, Diagrams
from collections import defaultdict
from numbers import Number
import texfig as tf
from texfig import TMLR_textwidth
import os
import pandas as pd
import numpy as np
import h5py
from benchmark_analysis import load_estim

In [None]:
# Folder, relative to the user's home directory, under which the result
# directories listed in `result_dirs` are located.
output_dir = "output"

In [None]:
# (prefix, directory) pairs, one per experiment prefix; every directory is
# ~/<output_dir>/estim/<prefix>.
result_dirs = [
    (name, os.path.join(os.path.expanduser("~"), f"{output_dir}/estim/{name}"))
    for name in ("weak/gb", "weak/klm", "noise/gb", "noise/klm")
]

In [None]:
def custom_grid(axis):
    """Draw a light-grey grid on major and minor ticks, behind the plotted data.

    Parameters
    ----------
    axis : object exposing ``grid`` and ``set_axisbelow`` (e.g. a matplotlib Axes)
    """
    grid_style = {"c": "#f2f2f2", "which": "both"}
    axis.grid(**grid_style)
    # Keep grid lines underneath the artists.
    axis.set_axisbelow(True)

In [None]:
# Result tables, filled by the following cells and keyed by strategy name.
all_results = {}

In [None]:
# Load the "allclass" results of every experiment prefix.
all_results["allclass"] = {}
for prefix, result_dir in result_dirs:
    all_results["allclass"][prefix] = load_estim(result_dir=result_dir)

In [None]:
# Load the "relabel" results: same layout as `result_dirs`, with the "estim"
# folder swapped for "relabel".
all_results["relabel"] = dict()
for prefix, result_dir in result_dirs:
    # Only the last "estim" is substituted: str.replace would also rewrite an
    # "estim" appearing earlier in the absolute path (e.g. in the home folder).
    result_dir = "relabel".join(result_dir.rsplit("estim", 1))
    all_results["relabel"][prefix] = load_estim(
        result_dir=result_dir,
    )

In [None]:
# Load the "byclass" results: same layout as `result_dirs`, with the "estim"
# folder swapped for "estim_byclass".
all_results['byclass'] = dict()
for prefix, result_dir in result_dirs:
    # Only the last "estim" is substituted: str.replace would also rewrite an
    # "estim" appearing earlier in the absolute path (e.g. in the home folder).
    result_dir = "estim_byclass".join(result_dir.rsplit("estim", 1))
    all_results['byclass'][prefix] = load_estim(
        result_dir=result_dir,
    )

In [None]:
# Sorted names of every detector / dataset present in at least one "allclass"
# result table.
detectors = np.unique(
    np.concatenate([r.detector_name.unique() for r in all_results['allclass'].values()])
)
datasets = np.unique(
    np.concatenate([r.dataset_name.unique() for r in all_results['allclass'].values()])
)

# Detectors without the baselines and the "random" entry. Filtering the sorted
# `detectors` array (instead of going through a set difference) keeps the order
# identical from one kernel session to the next.
_reference_names = set(baselines) | {"random"}
detectors_nobaseline = [d for d in detectors if d not in _reference_names]

len(detectors), detectors, len(detectors_nobaseline), detectors_nobaseline, len(
    datasets
), datasets

In [None]:
# Split every detector name on "_": the last token is the detection method;
# a two-token name carries its base model as first token, any other name is
# attributed to "klm".
d_base_model_map = {}
d_detect_map = {}

for d in detectors_nobaseline:
    tokens = d.split("_")
    d_base_model_map[d] = tokens[0] if len(tokens) == 2 else "klm"
    d_detect_map[d] = tokens[-1]

detector_suffixes = np.unique(list(d_detect_map.values()))
d_base_model_map, d_detect_map, detector_suffixes

In [None]:
# Display names used in legends and tick labels, keyed by detection-method
# suffix and by base-model identifier respectively.
detect_pretty_name = dict(
    agra="AGRA",
    aum="AUM",
    cleanlab="Cleanlab",
    consensus="Consensus",
    forget="Forget scores",
    influence="Influence",
    representer="Representer",
    smallloss="Small loss",
    tracin="TracIn",
    vosg="VoSG",
)

bm_pretty_name = dict(klm="KLM", gb="GBT")

In [None]:
from matplotlib import colormaps

# One colour per detection-method suffix, cycling through the "tab10" palette.
# tab_colors = colormaps["tab20"]
tab_colors = colormaps["tab10"]
palette = tab_colors.colors

detector_colors = {
    suffix: palette[rank % len(palette)]
    for rank, suffix in enumerate(detector_suffixes)
}

# Stand-alone legend figure: empty scatters are drawn only to register entries.
plt.figure()
labels = []


def _add_legend_entry(text, **scatter_kwargs):
    """Register one legend entry through an empty scatter and record its label."""
    plt.scatter([], [], label=text, **scatter_kwargs)
    labels.append(text)


# KLM
for d_name, d_color in detector_colors.items():
    _add_legend_entry(
        f"({bm_pretty_name['klm']}) " + detect_pretty_name[d_name], color=d_color
    )

# GB: square markers; an invisible entry is added when a method has no "gb_"
# variant so that both series keep the same number of entries.
for d_name, d_color in detector_colors.items():
    if "gb_" + d_name in detectors_nobaseline:
        _add_legend_entry(
            f"({bm_pretty_name['gb']}) " + detect_pretty_name[d_name],
            color=d_color,
            marker="s",
        )
    else:
        _add_legend_entry("", color="white", alpha=0)


disp_baselines = False
if disp_baselines:
    for baseline_label, baseline_marker in (
        ("gold", "x"),
        ("silver", "+"),
        ("random", "p"),
    ):
        _add_legend_entry(baseline_label, color="black", marker=baseline_marker)

    # Pad the third series with invisible entries.
    for _ in range(len(detector_colors) - 3):
        _add_legend_entry("", color="white", alpha=0)

plt.gca().axis("off")
plt.legend(ncols=3 if disp_baselines else 2, labels=labels, title="Detectors")
# plt.savefig(f"figures/summary/{prefix_txt}_legend.pdf")

In [None]:
# Colour and marker of every entry of `detectors`: baselines and "random" get a
# white "x"; the others get the colour of their detection method and a marker
# that depends on their base model ("o" for klm, "s" otherwise).
full_detector_colors = {}
full_detector_markers = {}
for d in detectors:
    if d not in baselines + ['random']:
        full_detector_colors[d] = detector_colors[d_detect_map[d]]
        full_detector_markers[d] = 's' if d_base_model_map[d] != 'klm' else 'o'
    else:
        full_detector_colors[d] = 'white'
        full_detector_markers[d] = 'x'


In [None]:
# Number of recorded runs per (detector, dataset) in the "allclass" results.
for k, r in all_results['allclass'].items():
    run_counts = r.pivot_table(
        values="estim_time",
        index="detector_name",
        columns="dataset_name",
        aggfunc="count",
    )
    runs_per_dataset = run_counts.sum()
    print(k, runs_per_dataset.sum(), runs_per_dataset)
    display(run_counts)

In [None]:
# Number of recorded runs per (detector, dataset) in the "relabel" results.
for k, r in all_results['relabel'].items():
    run_counts = r.pivot_table(
        values="estim_time",
        index="detector_name",
        columns="dataset_name",
        aggfunc="count",
    )
    total_runs = run_counts.sum().sum()
    print(k, total_runs)

    display(run_counts)

In [None]:
# Number of recorded runs per (detector, dataset) in the "byclass" results.
for k, r in all_results['byclass'].items():
    run_counts = r.pivot_table(
        values="estim_time",
        index="detector_name",
        columns="dataset_name",
        aggfunc="count",
    )
    total_runs = run_counts.sum().sum()

    print(k, total_runs)
    display(run_counts)

In [None]:
# Lowest test log loss reached per (detector, dataset), for each prefix.
for k, r in all_results['allclass'].items():
    print(k)
    best_test_logl = r.pivot_table(
        values="logl_test",
        index="detector_name",
        columns="dataset_name",
        aggfunc="min",
    )
    display(best_test_logl)

In [None]:
# Hyperparameters selection
#
# For every strategy / prefix / selection metric, keep for each
# (dataset, detector) pair the single run that is best on a given split,
# optionally restricting the candidates to a fixed splitter quantile.
# `all_results_cv` stores the selected rows, `all_val_cv` the corresponding
# test metric pivoted as detector x dataset.

all_results_cv = {}
all_val_cv = {}

baselines_norandom = set(baselines) - {"random"}


def _rows_with_quantile(frame, quantile):
    """Rows whose splitter params hold `quantile`, plus every baseline row."""
    has_quantile = frame["params_splitter"].apply(
        lambda r: "quantile" in r.keys() and r["quantile"] == quantile
    )
    return frame[has_quantile | frame["detector_name"].isin(baselines_norandom)]


def _best_row_labels(frame, column, picker):
    """Row label selected by `picker` ("idxmin"/"idxmax") on `column`, per group."""
    grouped = frame.groupby(["dataset_name", "detector_name"])
    return getattr(grouped[column], picker)()


# (selection name, forced splitter quantile or None, split used for selection);
# the order fixes the key order of the resulting dictionaries.
_selections = [
    ("oracl", None, "test"),
    ("clean", None, "val"),
    ("noisy", None, "noisy_val"),
    ("noisy_10", 0.1, "noisy_val"),
    ("noisy_90", 0.9, "noisy_val"),
    ("clean_90", 0.9, "val"),
    ("clean_10", 0.1, "val"),
]

for which in ("allclass", "byclass", "relabel"):
    _all_res = all_results[which]

    all_results_cv[which] = {}
    all_val_cv[which] = {}

    for prefix in _all_res.keys():
        runs = _all_res[prefix]

        all_results_cv[which][prefix] = {}
        all_val_cv[which][prefix] = {}

        # log loss is minimised, bacc is maximised
        for cv_k, picker in (("logl", "idxmin"), ("bacc", "idxmax")):
            selected = {}
            for name, quantile, split in _selections:
                candidates = (
                    runs if quantile is None else _rows_with_quantile(runs, quantile)
                )
                labels_best = _best_row_labels(candidates, f"{cv_k}_{split}", picker)
                # groups without any valid value give NaN labels: drop them
                selected[name] = runs.loc[labels_best.dropna()]

            all_results_cv[which][prefix][cv_k] = selected

            all_val_cv[which][prefix][cv_k] = {
                name: chosen.pivot(
                    index="detector_name", columns="dataset_name", values=f"{cv_k}_test"
                )
                for name, chosen in selected.items()
            }

        # sanity check: the oracle selection must coincide with the direct
        # per-group optimum of the test metric
        for cv_k, aggfunc in (("logl", "min"), ("bacc", "max")):
            best_direct = runs.pivot_table(
                index="detector_name",
                columns="dataset_name",
                values=f"{cv_k}_test",
                aggfunc=aggfunc,
            )

            assert (
                best_direct - all_val_cv[which][prefix][cv_k]["oracl"]
            ).abs().sum().sum() == 0

hp_tune_str = {
    "oracl": "oracle",
    "clean": "clean validation set",
    "noisy": "noisy validation set",
    "noisy_10": "noisy validation set (threshold=10%)",
}

In [None]:
# Classifier parameters selected (bacc, clean validation) for the "none"
# detector on the "agnews" dataset of the weak/gb experiment.
d = all_results_cv["allclass"]["weak/gb"]["bacc"]["clean"]

is_none_on_agnews = (d["detector_name"] == "none") & (d["dataset_name"] == "agnews")
e = d[is_none_on_agnews]

print(e["params_classifier"].values)

In [None]:
# Display the array of detector names.
detectors

In [None]:
# One panel per dataset: ranking quality of each detector (x) against the test
# log loss of its oracle-selected run (y), with the "none" and "silver"
# baselines drawn as horizontal lines.
prefix = "weak/klm"
exp_alt = "allclass"


results_cv = all_results_cv[exp_alt][prefix]["logl"]
logl_cv = all_val_cv[exp_alt][prefix]["logl"]
logl_cv_for_norm = all_val_cv["allclass"][prefix]["logl"]

roc_auc = results_cv["oracl"].pivot(
    index="detector_name", columns="dataset_name", values="global_ranking_quality"
)

n_per_row = 4
n_rows = math.ceil(len(datasets) / n_per_row)
# squeeze=False keeps `axes` two-dimensional even when one row is enough, so
# the [row, column] indexing below never fails.
fig, axes = plt.subplots(
    n_rows, n_per_row, figsize=(13, 3 * n_rows), squeeze=False
)

# Colours follow the row order of `roc_auc`, which is the same for all panels.
d_colors = [full_detector_colors[d] for d in roc_auc.index]

for i, dataset_name in enumerate(results_cv["oracl"].dataset_name.unique()):
    axis = axes[i // n_per_row, i % n_per_row]

    axis.set_title(dataset_name)

    axis.scatter(
        roc_auc[dataset_name],
        logl_cv["oracl"][dataset_name],
        marker="+",
        c=d_colors,
    )

    perf_none = logl_cv_for_norm["oracl"][dataset_name]["none"]
    perf_silver = logl_cv_for_norm["oracl"][dataset_name]["silver"]
    axis.hlines(perf_none, 0.35, 1)
    axis.hlines(perf_silver, 0.35, 1)

    # axis.set_xlim(0.35, 1)
    # Window centred between the two baselines and three times their gap wide.
    axis.set_ylim(2 * perf_silver - perf_none, 2 * perf_none - perf_silver)
    custom_grid(axis)

for axis in axes:
    axis[0].set_ylabel("test log loss")
for axis in axes[-1]:
    axis.set_xlabel("roc auc")

plt.tight_layout()
plt.show()

In [None]:
# One panel per dataset: test bacc of every detector, ranked by its oracle
# value, for the oracle ("_"), clean-validation ("x") and noisy-validation
# ("*") selections. Horizontal lines mark the "none" and "silver" baselines.
prefix = "weak/klm"
exp_alt = "allclass"

# NOTE: the `logl_*` names are kept for continuity with the other cells, but
# the tables read here hold the "bacc" metric.
results_cv = all_results_cv[exp_alt][prefix]["bacc"]
logl_cv = all_val_cv[exp_alt][prefix]["bacc"]
logl_cv_for_norm = all_val_cv["allclass"][prefix]["bacc"]

n_per_row = 4
n_rows = math.ceil(len(datasets) / n_per_row)
# squeeze=False keeps `axes` two-dimensional even when one row is enough, so
# the [row, column] indexing below never fails.
fig, axes = plt.subplots(
    n_rows, n_per_row, figsize=(13, 3 * n_rows), squeeze=False
)

# Colours follow the detector (row) order of the table, shared by all panels.
d_colors = [full_detector_colors[d] for d in logl_cv["oracl"].index]

for i, dataset_name in enumerate(results_cv["oracl"].dataset_name.unique()):
    axis = axes[i // n_per_row, i % n_per_row]

    axis.set_title(dataset_name)

    # argsort of argsort = rank of every detector by its oracle value
    sorted_indices = np.argsort(np.argsort(logl_cv["oracl"][dataset_name].values))
    n_detectors = len(sorted_indices)

    axis.scatter(
        sorted_indices,
        logl_cv["oracl"][dataset_name],
        marker="_",
        c=d_colors,
    )

    axis.scatter(
        sorted_indices,
        logl_cv["clean"][dataset_name],
        marker="x",
        c=d_colors,
    )

    axis.scatter(
        sorted_indices,
        logl_cv["noisy"][dataset_name],
        marker="*",
        c=d_colors,
    )

    perf_none = logl_cv_for_norm["oracl"][dataset_name]["none"]
    perf_silver = logl_cv_for_norm["oracl"][dataset_name]["silver"]
    axis.hlines(perf_none, -1, n_detectors)
    axis.hlines(perf_silver, -1, n_detectors)

    # NOTE(review): if "silver" scores above "none" (expected for an accuracy),
    # these limits are given in descending order and matplotlib draws the axis
    # inverted -- confirm this is intended.
    axis.set_ylim(2 * perf_silver - perf_none, 2 * perf_none - perf_silver)
    axis.set_xlim(-1, n_detectors)
    custom_grid(axis)

for axis in axes:
    # The plotted metric is bacc, not the log loss.
    axis[0].set_ylabel("test balanced accuracy")

plt.savefig(f"figures/summary/{prefix.replace('/','_')}.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Display the names of the available hyperparameter selections.
results_cv.keys()

In [None]:
# Figure-title fragments, keyed by hyperparameter selection and by strategy.
# The percent signs stay backslash-escaped in the resulting strings; the
# backslash is written as "\\" because "\%" is not a valid Python escape
# sequence.
# NOTE(review): the "*_10" / "*_90" descriptions below and the axis labels of
# plot_clean10_vs_clean90 do not describe these selections the same way --
# confirm which wording matches the splitter quantile semantics.
hp_tune_str = {
    "clean": "HP tuned on noise free validation",
    "noisy": "HP tuned on noisy validation",
    "oracl": "Oracle HP",
    "noisy_10": "HP tuned on noisy validation, threshold forced at 10\\%",
    "noisy_90": "HP tuned on noisy validation, training on 10\\% top trusted",
    "clean_90": "HP tuned on clean validation, training on 10\\% top trusted",
    "clean_10": "HP tuned on clean validation, training on 90\\% top trusted",
}

strategy_str = {
    "allclass": "Filtering",
    "byclass": "Filtering by class",
    "relabel": "10\\% relabeling",
}

In [None]:
# One figure per hyperparameter selection: box plot, per detector, of the
# normalised test metric over datasets, detectors ordered by their median.
prefix = "weak/klm"
exp_alt = "allclass"
metric = "logl"

results_cv = all_results_cv[exp_alt][prefix][metric]
logl_cv = all_val_cv[exp_alt][prefix][metric]
# The normalisation baselines are always read from the "allclass" tables.
logl_cv_for_norm = all_val_cv["allclass"][prefix][metric]

for k in results_cv:
    if exp_alt == "relabel" and k in ["noisy_10", "noisy_90", "clean_10", "clean_90"]:
        continue

    # Per-dataset anchors of the normalisation: the "none" and "silver" rows.
    norm_table = logl_cv_for_norm[k]
    norm_low = norm_table.loc[norm_table.index == "none"].values
    norm_high = norm_table.loc[norm_table.index == "silver"].values

    print(k, norm_low.shape, norm_high.shape, logl_cv[k].shape)
    # 0 at the "none" value, 1 at the "silver" value; the epsilon avoids a
    # division by zero when both coincide.
    logl_cv_norm = (logl_cv[k] - norm_low) / (norm_high - norm_low + 1e-10)

    ordered_detectors = list(logl_cv_norm.median(axis=1).sort_values().index)

    tf.figure(width=TMLR_textwidth * 0.95, pad=0.5)

    # Reference levels: 100 is "silver", 200 is "none" (see the mapping below).
    plt.axhline(100)
    plt.axhline(200)

    for i, detector_name in enumerate(ordered_detectors):
        d = logl_cv_norm.loc[logl_cv_norm.index == detector_name].values[0]

        # Maps the normalised value 1 ("silver") to 100 and 0 ("none") to 200.
        d_base100 = (2 - d) * 100
        # Datasets without a value for this detector are left out.
        d_base100 = d_base100[~np.isnan(d_base100)]
        bplot = plt.boxplot(
            [d_base100],
            positions=[i],
            # notch=True,
            # bootstrap=100,
            widths=[0.7],
            showfliers=False,
            patch_artist=True,
        )

        bplot["boxes"][0].set_facecolor(full_detector_colors[detector_name])
        bplot["boxes"][0].set_alpha(0.8)

        # Individual datasets, with a small horizontal jitter.
        eps = 0.05
        plt.scatter(
            [i] * len(d_base100) + np.random.uniform(-eps, eps, size=len(d_base100)),
            d_base100,
            facecolors=[full_detector_colors[detector_name]] * len(d_base100),
            edgecolors="black",
            s=[5] * len(d_base100),
            zorder=10,
        )

    pretty_xticks = []
    for s in ordered_detectors:
        if s in baselines + ["random"]:
            # underscores are backslash-escaped in the tick text
            pretty_xticks.append(s.replace("_", "\\_"))
        else:
            pretty_xticks.append(
                detect_pretty_name[d_detect_map[s]]
                + f" ({bm_pretty_name[d_base_model_map[s]]})"
            )
    plt.xticks(
        range(len(ordered_detectors)),
        pretty_xticks,
        rotation=50,
        rotation_mode="anchor",
        ha="right",
    )

    # Baselines in red, the "random" detector in blue.
    for i, detector_name in enumerate(ordered_detectors):
        if detector_name in baselines:
            plt.gca().get_xticklabels()[i].set_color("red")
        elif detector_name in ["random"]:
            plt.gca().get_xticklabels()[i].set_color("blue")

    plt.ylim(0, 300)
    custom_grid(plt.gca())
    # plt.title(f"Hyperparameters tuned using {hp_tune_str[k]} | {prefix} | {exp_alt}")
    plt.ylabel("normalized test loss")

    noise_str = "NCAR" if prefix.split("/")[0] == "noise" else "NNAR"
    classif_str = (
        "Linear Classifier"
        if prefix.split("/")[1] == "klm"
        else "Gradient Boosting Classifier"
    )
    plt.suptitle(
        f"{noise_str} | {strategy_str[exp_alt]} | {classif_str} | {hp_tune_str[k]}"
    )

    tf.savefig(f"figures/detectors/{prefix.replace('/','_')}_{exp_alt}_{k}_{metric}")

    plt.show()

In [None]:
def plot_gb_klm_mix(prefix, exp_alt, method_suffixes, k="oracl"):
    """Scatter the test log loss of "gb_<method>" against "klm_<method>" detectors.

    For every method suffix and for both result tables ``prefix + "klm"`` and
    ``prefix + "gb"``, one point per dataset is drawn with the "gb_" detector
    value on x and the "klm_" detector value on y. The point colour tells which
    of the two tables the values come from. The figure is saved under
    ``figures/robustness/``.

    Parameters
    ----------
    prefix : str
        Start of the result key, ending with "/" (e.g. "weak/").
    exp_alt : str
        Strategy key of ``all_val_cv`` ("allclass", "byclass" or "relabel").
    method_suffixes : iterable of str
        Detection methods to include.
    k : str, default "oracl"
        Hyperparameter selection to read.
    """
    classif_c = (("klm", "tab:blue"), ("gb", "tab:orange"))

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    # Running bounds of the plotted values, used for the diagonal.
    xy_min = 1000
    xy_max = 0

    # for method_suffix in ["vosg", "consensus", "forget", "cleanlab", "aum"]:
    for method_suffix in method_suffixes:
        for classif, color in classif_c:
            logl_cv_classif = all_val_cv[exp_alt][prefix + classif]["logl"][k]

            perf_klm = logl_cv_classif.loc[
                logl_cv_classif.index == "klm_" + method_suffix
            ].values
            perf_gb = logl_cv_classif.loc[
                logl_cv_classif.index == "gb_" + method_suffix
            ].values

            # A variant absent from the table yields an empty array, on which
            # the masking and nanmin/nanmax below would raise: skip the pair.
            if perf_klm.size == 0 or perf_gb.size == 0:
                continue

            is_na = np.logical_or(np.isnan(perf_gb), np.isnan(perf_klm))

            plt.scatter(perf_gb[~is_na], perf_klm[~is_na], color=color, s=12, alpha=0.7)

            xy_min = min([xy_min, np.nanmin(perf_gb), np.nanmin(perf_klm)])
            xy_max = max([xy_max, np.nanmax(perf_gb), np.nanmax(perf_klm)])

    # Diagonal: both base models perform equally.
    plt.plot([xy_min, xy_max], [xy_min, xy_max], color="black")
    plt.ylabel("test log loss\n(KLM base model)")
    # NOTE(review): this label says "GBM" while the legend uses
    # bm_pretty_name["gb"] ("GBT") -- confirm the intended name.
    plt.xlabel("test log loss\n(GBM base model)")

    # Empty scatters register the two legend entries.
    for base_model, color in classif_c:
        plt.scatter(
            [],
            [],
            color=color,
            label=f"{bm_pretty_name[base_model]} estimator",
            alpha=0.7,
        )

    plt.legend()

    custom_grid(plt.gca())
    plt.xlim(1e-1, 1)
    plt.ylim(1e-1, 1)
    # plt.xlim(xy_min, xy_max)
    # plt.ylim(xy_min, xy_max)
    plt.xscale("log")
    plt.yscale("log")

    # Unlabelled minor ticks every 0.1, labelled major ticks at 0.1 and 1.
    plt.xticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.xticks([0.1, 1], [0.1, 1], minor=False)
    plt.yticks([0.1, 1], [0.1, 1], minor=False)
    # plt.grid()

    tf.savefig(f"figures/robustness/{prefix.replace('/','_')}_{exp_alt}")
    plt.show()

# One figure per strategy and per prefix.
for exp_alt in ['allclass', 'byclass', 'relabel']:
    for prefix in ['weak/', 'noise/']:
        print(exp_alt, prefix)
        plot_gb_klm_mix(
            prefix=prefix,
            exp_alt=exp_alt,
            method_suffixes=["smallloss", "consensus", "cleanlab", "aum"],
        )

In [None]:
def plot_byclass_vs_allclass(prefix, k):
    """Scatter the test log loss of the "byclass" strategy against "allclass".

    One point per (detector, dataset); baselines are left out. The figure is
    saved under ``figures/byclass_vs_allclass/``.

    Parameters
    ----------
    prefix : str
        Result key, e.g. "weak/klm".
    k : str
        Hyperparameter selection to read, e.g. "oracl".
    """
    logl_cv_allclass = all_val_cv["allclass"][prefix]["logl"][k]
    logl_cv_byclass = all_val_cv["byclass"][prefix]["logl"][k]

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    for detector in set(detectors) - set(baselines):

        perf_allclass = logl_cv_allclass.loc[logl_cv_allclass.index == detector]
        perf_byclass = logl_cv_byclass.loc[logl_cv_byclass.index == detector]

        # Same guard as plot_nnar_vs_ncar: a detector present in only one of
        # the two tables cannot be paired (scatter needs equal sizes).
        if perf_allclass.shape != perf_byclass.shape:
            continue

        plt.scatter(
            perf_allclass,
            perf_byclass,
            color=full_detector_colors[detector],
            marker=full_detector_markers[detector],
            s=12,
            alpha=0.7,
        )

    # Diagonal spanning the automatic axis limits.
    xlims, ylims = plt.xlim(), plt.ylim()
    xy_min = min(xlims[0], ylims[0])
    xy_max = max(xlims[1], ylims[1])

    plt.plot([xy_min, xy_max], [xy_min, xy_max], color="black")
    plt.xlabel("test log loss\n(filtering allclass)")
    plt.ylabel("test log loss\n(filtering byclass)")

    custom_grid(plt.gca())
    plt.xlim(1e-1, 1)
    plt.ylim(1e-1, 1)
    plt.xscale("log")
    plt.yscale("log")

    # Unlabelled minor ticks every 0.1, labelled major ticks at 0.1 and 1.
    plt.xticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.xticks([0.1, 1], [0.1, 1], minor=False)
    plt.yticks([0.1, 1], [0.1, 1], minor=False)
    # plt.grid()

    tf.savefig(f"figures/byclass_vs_allclass/{prefix.replace('/','_')}")
    plt.show()


plot_byclass_vs_allclass(prefix="weak/klm", k="oracl")

In [None]:
def plot_clean10_vs_clean90(prefix, exp_alt, k="clean"):
    """Scatter the "clean_90" selection against the "clean_10" selection.

    Both tables are normalised per dataset with the "none" and "silver" rows
    of selection `k`, then mapped so that "none" is 100 and "silver" is 200.
    One point per (detector, dataset); baselines are left out. The figure is
    saved under ``figures/top_vs_bottom/``.

    Parameters
    ----------
    prefix : str
        Result key, e.g. "weak/klm".
    exp_alt : str
        Strategy key of ``all_val_cv``.
    k : str, default "clean"
        Selection providing the normalisation rows.
    """
    logl_cv_for_norm = all_val_cv[exp_alt][prefix]["logl"]

    norm_low = logl_cv_for_norm[k].loc[logl_cv_for_norm[k].index == "none"].values
    norm_high = logl_cv_for_norm[k].loc[logl_cv_for_norm[k].index == "silver"].values

    logl_cv_clean10 = all_val_cv[exp_alt][prefix]["logl"]["clean_10"]
    logl_cv_clean90 = all_val_cv[exp_alt][prefix]["logl"]["clean_90"]

    # 0 at "none", 1 at "silver"; the epsilon avoids a division by zero.
    logl_cv_clean10 = (logl_cv_clean10 - norm_low) / (norm_high - norm_low + 1e-10)
    logl_cv_clean90 = (logl_cv_clean90 - norm_low) / (norm_high - norm_low + 1e-10)

    # NOTE(review): this maps "none" to 100 and "silver" to 200, the opposite
    # of the (2 - d) * 100 mapping used for the detector box plots -- confirm.
    logl_cv_clean10 = (logl_cv_clean10 + 1) * 100
    logl_cv_clean90 = (logl_cv_clean90 + 1) * 100

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    for detector in set(detectors) - set(baselines):

        perf_clean10 = logl_cv_clean10.loc[logl_cv_clean10.index == detector]
        perf_clean90 = logl_cv_clean90.loc[logl_cv_clean90.index == detector]

        # Same guard as plot_nnar_vs_ncar: a detector present in only one of
        # the two tables cannot be paired (scatter needs equal sizes).
        if perf_clean10.shape != perf_clean90.shape:
            continue

        plt.scatter(
            perf_clean10,
            perf_clean90,
            color=full_detector_colors[detector],
            marker=full_detector_markers[detector],
            s=12,
            alpha=0.7,
        )

    # Diagonal spanning the automatic axis limits.
    xlims, ylims = plt.xlim(), plt.ylim()
    xy_min = min(xlims[0], ylims[0])
    xy_max = max(xlims[1], ylims[1])

    plt.plot([xy_min, xy_max], [xy_min, xy_max], color="black")
    # "\\%" keeps the backslash in the label text; "\%" is not a valid Python
    # escape sequence.
    # NOTE(review): these descriptions of clean_10 / clean_90 differ from the
    # ones given in hp_tune_str -- confirm against the splitter semantics.
    plt.xlabel("test log loss\n(training on 10\\% most trusted)")
    plt.ylabel("test log loss\n(training without 10\\% less trusted)")

    custom_grid(plt.gca())
    plt.xlim(0, 250)
    plt.ylim(0, 250)
    # plt.xscale("log")
    # plt.yscale("log")

    plt.xticks(np.arange(0, 250, 50), [], minor=True)
    plt.yticks(np.arange(0, 250, 50), [], minor=True)
    plt.xticks([0, 100, 200], [0, 100, 200], minor=False)
    plt.yticks([0, 100, 200], [0, 100, 200], minor=False)
    # plt.grid()

    tf.savefig(f"figures/top_vs_bottom/{prefix.replace('/','_')}")
    plt.show()


plot_clean10_vs_clean90(prefix="weak/klm", exp_alt="allclass")

In [None]:
def plot_nnar_vs_ncar(suffix, k, exp_alt):
    """Scatter the test log loss on the "weak" results against the "noise" ones.

    One point per (detector, dataset), "noise" on x and "weak" on y; baselines
    are left out, as are detectors whose rows do not have the same shape in
    both tables. The figure is saved under ``figures/weak_vs_noise/``.

    Parameters
    ----------
    suffix : str
        End of the result key, starting with "/" (e.g. "/klm").
    k : str
        Hyperparameter selection to read, e.g. "oracl".
    exp_alt : str
        Strategy key of ``all_val_cv``.
    """
    per_kind = {
        kind: all_val_cv[exp_alt][kind + suffix]["logl"][k]
        for kind in ("noise", "weak")
    }
    table_noise, table_weak = per_kind["noise"], per_kind["weak"]

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    for detector in set(detectors) - set(baselines):
        rows_noise = table_noise[table_noise.index == detector]
        rows_weak = table_weak[table_weak.index == detector]

        # Only detectors available on both sides can be paired.
        if rows_noise.shape == rows_weak.shape:
            plt.scatter(
                rows_noise,
                rows_weak,
                color=full_detector_colors[detector],
                s=12,
                alpha=0.7,
                marker=full_detector_markers[detector],
            )

    # Diagonal spanning the automatic axis limits.
    x_range = plt.xlim()
    y_range = plt.ylim()
    lowest = min(x_range[0], y_range[0])
    highest = max(x_range[1], y_range[1])
    plt.plot([lowest, highest], [lowest, highest], color="black")

    plt.xlabel("test log loss\n(NCAR dataset)")
    plt.ylabel("test log loss\n(NNAR dataset)")

    custom_grid(plt.gca())
    plt.xlim(1e-1, 1)
    plt.ylim(1e-1, 1)
    plt.xscale("log")
    plt.yscale("log")

    # Unlabelled minor ticks every 0.1, labelled major ticks at 0.1 and 1.
    plt.xticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.xticks([0.1, 1], [0.1, 1], minor=False)
    plt.yticks([0.1, 1], [0.1, 1], minor=False)

    tf.savefig(f"figures/weak_vs_noise/{exp_alt}{suffix.replace('/','_')}")
    plt.show()


plot_nnar_vs_ncar(suffix="/klm", k="oracl", exp_alt="allclass")

In [None]:
def plot_clean_noisy(prefix, exp_alt):
    """Scatter each detector's test log loss against the "none" baseline.

    For both the "noisy" and "clean" hyperparameter selections, one point per
    (detector, dataset) is drawn with the "none" value on x and the detector
    value on y: dots for "noisy", crosses for "clean". Baselines are left out.
    The figure is saved under ``figures/clean_vs_noisy/``.

    Parameters
    ----------
    prefix : str
        Result key, e.g. "weak/klm".
    exp_alt : str
        Strategy key of ``all_val_cv``.
    """
    logl_cv_noisy = all_val_cv[exp_alt][prefix]["logl"]["noisy"]
    logl_cv_clean = all_val_cv[exp_alt][prefix]["logl"]["clean"]
    logl_cv_noisy_none = logl_cv_noisy.loc[logl_cv_noisy.index == "none"]
    logl_cv_clean_none = logl_cv_clean.loc[logl_cv_clean.index == "none"]

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    for detector in set(detectors) - set(baselines):

        perf_noisy = logl_cv_noisy.loc[logl_cv_noisy.index == detector]
        perf_clean = logl_cv_clean.loc[logl_cv_clean.index == detector]

        # A detector absent from one of the tables has no row to pair with the
        # "none" row (scatter needs equal sizes): skip it, as plot_nnar_vs_ncar
        # does.
        if (
            perf_noisy.shape != logl_cv_noisy_none.shape
            or perf_clean.shape != logl_cv_clean_none.shape
        ):
            continue

        plt.scatter(
            logl_cv_noisy_none,
            perf_noisy,
            color=full_detector_colors[detector],
            s=12,
            alpha=0.7,
        )

        plt.scatter(
            logl_cv_clean_none,
            perf_clean,
            color=full_detector_colors[detector],
            s=12,
            alpha=0.7,
            marker="x",
        )

    # Diagonal spanning the automatic axis limits.
    xlims, ylims = plt.xlim(), plt.ylim()
    xy_min = min(xlims[0], ylims[0])
    xy_max = max(xlims[1], ylims[1])

    plt.plot([xy_min, xy_max], [xy_min, xy_max], color="black")

    plt.xlabel("test log loss\n(no filtering)")
    plt.ylabel("test log loss\n(detect + filter)")

    custom_grid(plt.gca())
    plt.xlim(1e-1, 1)
    plt.ylim(1e-1, 1)
    plt.xscale("log")
    plt.yscale("log")

    # Unlabelled minor ticks every 0.1, labelled major ticks at 0.1 and 1.
    plt.xticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0.1, 1, 0.1), [], minor=True)
    plt.xticks([0.1, 1], [0.1, 1], minor=False)
    plt.yticks([0.1, 1], [0.1, 1], minor=False)

    # Empty scatters register the two legend entries.
    plt.scatter([], [], color="black", label="clean valid.", marker="x", s=12)
    plt.scatter([], [], color="black", label="noisy valid.", s=12)
    plt.legend()

    tf.savefig(f"figures/clean_vs_noisy/{exp_alt}_{prefix.replace('/','_')}")
    plt.show()


plot_clean_noisy(prefix="weak/klm", exp_alt="allclass")

In [None]:
# Tables of the noise/klm experiment for the "noisy" and "clean" selections,
# with their "none" rows and the "gb_smallloss" row of the noisy one.
prefix, exp_alt = "noise/klm", "allclass"

logl_cv_noisy, logl_cv_clean = (
    all_val_cv[exp_alt][prefix]["logl"][selection] for selection in ("noisy", "clean")
)
logl_cv_noisy_none = logl_cv_noisy[logl_cv_noisy.index == "none"]
logl_cv_clean_none = logl_cv_clean[logl_cv_clean.index == "none"]

perf_noisy = logl_cv_noisy[logl_cv_noisy.index == "gb_smallloss"]

In [None]:
# Splitter quantile of the oracle-selected runs against the noise ratio
# (noise/klm experiment); runs without a "quantile" parameter are ignored.
plt.figure(figsize=(5, 5))

oracle_runs = all_results_cv["allclass"]["noise/klm"]["logl"]["oracl"]
quantile_and_ratio = [
    (p_split["quantile"], noise_ratio)
    for p_split, noise_ratio in oracle_runs[
        ["params_splitter", "noise_ratio"]
    ].itertuples(index=False)
    if "quantile" in p_split.keys()
]
splitter_quantiles = [quantile for quantile, _ in quantile_and_ratio]
noise_ratios = [ratio for _, ratio in quantile_and_ratio]

plt.scatter(noise_ratios, splitter_quantiles, alpha=0.2)
plt.grid()
plt.xlabel("noise ratio")
plt.ylabel("splitter quantile")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.plot([0, 1], [0, 1])

In [None]:
# Axis scale used to histogram each hyperparameter, keyed by the last
# "__"-separated token of the parameter name.
hp_scales = dict(
    alpha="log",
    eta0="log",
    learning_rate="log",
    reg_lambda="linear",
)

In [None]:
# Histogram, per hyperparameter, of the float values found in the detector and
# classifier parameters of the oracle-selected runs.
exp_alt = "allclass"
prefix = "weak/klm"

vals = defaultdict(list)
cv_k = "logl"

oracle_runs = all_results_cv[exp_alt][prefix][cv_k]["oracl"]

# Detector parameters first, then classifier parameters.
for params_column in ("params_detector", "params_classifier"):
    for params in oracle_runs[params_column]:
        if isinstance(params, dict):
            for k, v in params.items():
                if type(v) == float:
                    vals[k].append(v)

# squeeze=False keeps `axes` two-dimensional even with a single
# hyperparameter, so the indexing below never fails.
f, axes = plt.subplots(1, len(vals), figsize=(3 * len(vals), 3), squeeze=False)
for i, (k, v) in enumerate(vals.items()):

    scale = hp_scales[k.split("__")[-1]]
    axis = axes[0, i]

    if scale == "log":
        axis.hist(np.log10(v))
    else:
        axis.hist(v)
    axis.set_title(k)


plt.show()

In [None]:
# Same histograms as above, split by detector: one row of panels per detector,
# one column per hyperparameter.
exp_alt = "allclass"
prefix = "weak/klm"

vals = defaultdict(lambda: defaultdict(list))
cv_k = "logl"

oracle_runs = all_results_cv[exp_alt][prefix][cv_k]["oracl"]

# Detector parameters first, then classifier parameters.
for params_column in ("params_detector", "params_classifier"):
    for params, detect_name in oracle_runs[
        [params_column, "detector_name"]
    ].itertuples(index=False):
        if isinstance(params, dict):
            for k, v in params.items():
                if type(v) == float:
                    vals[detect_name][k].append(v)

n_hps = np.max([len(v) for v in vals.values()])


# squeeze=False keeps `axes` two-dimensional even with a single detector or a
# single hyperparameter, so the [row][column] indexing below never fails.
f, axes = plt.subplots(
    len(vals), n_hps, figsize=(3 * n_hps, 3 * len(vals)), squeeze=False
)

for j, (k_detect, vals_d) in enumerate(vals.items()):
    axes[j][0].set_ylabel(k_detect)

    for i, (k, v) in enumerate(vals_d.items()):

        scale = hp_scales[k.split("__")[-1]]
        axis = axes[j][i]

        if scale == "log":
            axis.hist(np.log10(v))
        else:
            axis.hist(v)

        axis.set_title(k)


plt.show()

In [None]:
# Filtering quantile selected with the noisy validation set against the one
# selected by the oracle, one point per (detector, dataset).
exp_alt = "allclass"
prefix = "noise/klm"

r_oracl = all_results_cv[exp_alt][prefix]["logl"]["oracl"]
r_noisy = all_results_cv[exp_alt][prefix]["logl"]["noisy"]

# p = r_noisy.pivot(index="dataset_name", columns="detector_name")
tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)
custom_grid(plt.gca())

for detect_name, dataset_name, p_split_oracl in r_oracl[
    ["detector_name", "dataset_name", "params_splitter"]
].itertuples(index=False):
    if detect_name in ["agra", "random"] or "quantile" not in p_split_oracl:
        continue

    # Look up the noisy selection only for the pairs that are plotted, and
    # skip the pair when it has no noisy counterpart or no quantile there.
    noisy_splitters = r_noisy[
        (r_noisy["dataset_name"] == dataset_name)
        & (r_noisy["detector_name"] == detect_name)
    ]["params_splitter"]
    if noisy_splitters.empty:
        continue
    p_split_noisy = noisy_splitters.iloc[0]
    if "quantile" not in p_split_noisy:
        continue

    plt.scatter(
        p_split_oracl["quantile"],
        p_split_noisy["quantile"],
        alpha=0.15,
        color="tab:blue",
    )

lmin = -0.05
lmax = 1.05

plt.xlim(lmin, lmax)
plt.ylim(lmin, lmax)
plt.plot([lmin, lmax], [lmin, lmax], color="black")
plt.xlabel("Filtering quantile\nOracle")
plt.ylabel("Filtering quantile\nTuned using noisy valid. set")
plt.xticks(np.linspace(0, 1, 6))
plt.yticks(np.linspace(0, 1, 6))

tf.savefig(f"figures/threshold/{prefix.replace('/','_')}_{exp_alt}")

In [None]:
# Experiment whose oracle-selected runs are read by the hyperparameter
# comparison loop of this cell.
prefix = "weak/klm"
exp_alt = "allclass"


def scale_f_mult(x, up=True):
    """Shift `x` by a factor 1.5: multiplied when `up` is true, divided otherwise."""
    return x * 1.5 if up else x / 1.5


def scale_f_add(x, up=True):
    """Return ``x`` shifted by 5: upwards when ``up`` is truthy, downwards otherwise."""
    return x + 5 if up else x - 5


# The rows compared are the same for every hyperparameter pair, so the columns
# are selected once instead of once per pair.
oracl_params = all_results_cv[exp_alt][prefix]["logl"]["oracl"][
    ["detector_name", "params_detector", "params_classifier"]
]

# Each entry: (key in the detector params, key in the classifier params,
# axis scale, function giving the upper/lower edge of the "equal" band).
for k_detect, k_classif, scale, scale_f in [
    ("base_model__sgd__alpha", "sgd__alpha", "log", scale_f_mult),
    ("base_model__sgd__eta0", "sgd__eta0", "log", scale_f_mult),
    ("base_model__reg_lambda", "reg_lambda", "linear", scale_f_add),
]:
    p1 = []
    p2 = []
    d_names = []
    counts = {"above": 0, "below": 0, "equal": 0}

    for detect_name, p_detect, p_classif in oracl_params.itertuples(index=False):

        # Both parameter sets must be dicts that hold the compared key; rows
        # where either one is not a dict are skipped instead of raising.
        if (
            isinstance(p_detect, dict)
            and isinstance(p_classif, dict)
            and k_detect in p_detect
            and k_classif in p_classif
        ):
            p1.append(p_detect[k_detect])
            p2.append(p_classif[k_classif])
            d_names.append(detect_name)

            # Classify the point relative to the band around the identity line.
            if p_classif[k_classif] > scale_f(p_detect[k_detect], True):
                counts["above"] += 1
            elif p_classif[k_classif] < scale_f(p_detect[k_detect], False):
                counts["below"] += 1
            else:
                counts["equal"] += 1

    # Nothing to draw for a hyperparameter that no row carries.
    if len(p1) == 0:
        continue

    plt.figure(figsize=(5, 5))
    plt.scatter(p1, p2, c=[full_detector_colors[d] for d in d_names], alpha=1)
    plt.grid()
    plt.xlabel(k_detect)
    plt.ylabel(k_classif)
    plt.xscale(scale)
    plt.yscale(scale)

    # Use one common range for both axes so the identity line is the diagonal.
    xlims = plt.xlim()
    ylims = plt.ylim()

    rmin = min(xlims[0], ylims[0])
    rmax = max(xlims[1], ylims[1])

    plt.plot([rmin, rmax], [rmin, rmax])

    # Edges of the "equal" band on either side of the identity line.
    plt.plot([scale_f(rmin, True), rmax], [rmin, scale_f(rmax, False)], color="orange")
    plt.plot([rmin, scale_f(rmax, False)], [scale_f(rmin, True), rmax], color="orange")

    plt.xlim(rmin, rmax)
    plt.ylim(rmin, rmax)

    plt.title(f"{counts['above']} / {counts['equal']} / {counts['below']}")
    plt.show()

In [None]:
# Slice of `all_results_cv` read by the rest of this cell; also used to build
# the name of the saved figure.
prefix = "weak/klm"
exp_alt = "allclass"


def scale_f_mult(x, up=True):
    """Scale ``x`` by 1.5 (``up`` truthy) or by 1/1.5 (``up`` falsy).

    Same contract as the function of the same name defined in the previous cell.
    """
    if not up:
        return x / 1.5
    return x * 1.5


def scale_f_add(x, up=True):
    """Shift ``x`` by +5 (``up`` truthy) or by -5 (``up`` falsy).

    Same contract as the function of the same name defined in the previous cell.
    """
    if not up:
        return x - 5
    return x + 5


# Cross-validated "oracl" results of the selected slice, and the rows of the
# "none" baseline among them (one row per dataset is asserted below).
oracl_results = all_results_cv[exp_alt][prefix]["logl"]["oracl"]
r_none = oracl_results[oracl_results["detector_name"] == "none"]

# Detectors that are not compared against the "none" baseline.
skipped_detectors = baselines + ["random"]

# Make sure the output folder exists, as the class-balance and rank-vs-perf
# cells do for their own folders.
os.makedirs("figures/detect_vs_none", exist_ok=True)

# Each entry: (classifier hyperparameter, axis scale, function giving the
# upper/lower edge of the "equal" band around the identity line).
for k_classif, scale, scale_f in [
    ("sgd__alpha", "log", scale_f_mult),
    ("sgd__eta0", "log", scale_f_mult),
    ("learning_rate", "log", scale_f_mult),
    ("reg_lambda", "linear", scale_f_add),
]:
    p1 = []
    p2 = []
    d_names = []
    counts = {"above": 0, "below": 0, "equal": 0}

    for detect_name, dataset_name, p_classif_filter, logl_filter in oracl_results[
        ["detector_name", "dataset_name", "params_classifier", "logl_test"]
    ].itertuples(index=False):

        if detect_name in skipped_detectors:
            continue

        # Look the baseline row up once and reuse it for the check and the read.
        r_none_dataset = r_none[r_none["dataset_name"] == dataset_name]
        assert len(r_none_dataset) == 1
        p_classif_none, logl_none = r_none_dataset.iloc[0][
            ["params_classifier", "logl_test"]
        ]

        if isinstance(p_classif_filter, dict) and k_classif in p_classif_filter:
            p2.append(p_classif_filter[k_classif])
            p1.append(p_classif_none[k_classif])
            d_names.append(detect_name)

            # Classify the point relative to the band around the identity line.
            if p_classif_filter[k_classif] > scale_f(p_classif_none[k_classif], True):
                counts["above"] += 1
            elif p_classif_filter[k_classif] < scale_f(
                p_classif_none[k_classif], False
            ):
                counts["below"] += 1
            else:
                counts["equal"] += 1

    print(np.unique(d_names))
    # Nothing to draw for a hyperparameter that no row carries.
    if len(p1) == 0:
        continue

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)
    plt.scatter(
        p1,
        p2,
        c=[full_detector_colors[d] for d in d_names],
        alpha=0.7,
        s=12,
    )
    custom_grid(plt.gca())
    # Escape underscores for the axis labels. The backslash is written as "\\"
    # because "\_" is an invalid escape sequence (the resulting text is the same).
    k_classif_str = k_classif.replace("_", "\\_")
    plt.xlabel(f"{k_classif_str} of none baseline")
    plt.ylabel(f"{k_classif_str} of detect + filter")
    plt.xscale(scale)
    plt.yscale(scale)

    # Use one common range for both axes so the identity line is the diagonal.
    xlims = plt.xlim()
    ylims = plt.ylim()

    rmin = min(xlims[0], ylims[0])
    rmax = max(xlims[1], ylims[1])

    plt.plot([rmin, rmax], [rmin, rmax], color="black")

    # Edges of the "equal" band on either side of the identity line.
    plt.plot([scale_f(rmin, True), rmax], [rmin, scale_f(rmax, False)], color="orange")
    plt.plot([rmin, scale_f(rmax, False)], [scale_f(rmin, True), rmax], color="orange")

    plt.xlim(rmin, rmax)
    plt.ylim(rmin, rmax)

    plt.title(f"{counts['above']} / {counts['equal']} / {counts['below']}")

    tf.savefig(
        f"figures/detect_vs_none/{prefix.replace('/','_')}_{exp_alt}_{k_classif}"
    )
    plt.show()

In [None]:
from sklearn.linear_model import HuberRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Shared regression pipeline: degree-1 polynomial features followed by a Huber
# regressor. The ranking-quality cell below re-fits this same object once per
# detector, so it only ever holds the most recent fit.
regr = make_pipeline(PolynomialFeatures(1), HuberRegressor())

In [None]:
# Slice of the results shown in this figure; `estim` is the part of the prefix
# after the last "/" and is matched against each detector's base model below.
prefix = "weak/klm"
estim = prefix.rsplit("/", 1)[-1]
exp_alt = "allclass"

# Rows of the "none" baseline in the cross-validated "oracl" results.
r_none = all_results_cv[exp_alt][prefix]["logl"]["oracl"][
    all_results_cv[exp_alt][prefix]["logl"]["oracl"]["detector_name"] == "none"
]

# Per detector: ranking qualities and, at the same positions, the test loss of
# the "none" baseline on the corresponding dataset.
r_qualities = defaultdict(list)
none_losses = defaultdict(list)

for detect_name, dataset_name, ranking_quality in all_results_cv[exp_alt][prefix][
    "logl"
]["oracl"][["detector_name", "dataset_name", "global_ranking_quality"]].itertuples(
    index=False
):
    # Skip the baselines and "random"; for the remaining detectors keep only
    # those whose base model is `estim`. The map is not consulted for skipped
    # names because `or` short-circuits.
    if detect_name in baselines + ["random"] or estim != d_base_model_map[detect_name]:
        continue

    # Exactly one baseline row is expected per dataset.
    none_rows = r_none[r_none["dataset_name"] == dataset_name]
    assert len(none_rows) == 1
    logl_test_none = none_rows.iloc[0]["logl_test"]

    r_qualities[detect_name].append(ranking_quality)
    none_losses[detect_name].append(logl_test_none)


# Empty figure with its axes set up; the points are added after this.
tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)
custom_grid(plt.gca())

plt.xlabel("test loss of none baseline")
plt.ylabel("ranking quality")
plt.xscale("log")
plt.yscale("linear")


def t(x):
    """Return the natural logarithm of ``x`` (scalar or array), computed with NumPy."""
    log_x = np.log(x)
    return log_x


# One scatter series and one fitted trend line per detector. The baseline
# losses are looked up by detector name instead of relying on both dicts
# iterating in the same order; this also avoids assigning `_`, which IPython
# uses for the last cell output.
for d_name, r_quals in r_qualities.items():
    none_los = none_losses[d_name]

    plt.scatter(
        none_los,
        r_quals,
        # `color=` is unambiguous for a single colour; `c=` can be read as
        # per-point values when the colour is a numeric tuple.
        color=full_detector_colors[d_name],
        marker=full_detector_markers[d_name],
        alpha=0.7,
        s=12,
    )

    # Sort the pairs by baseline loss before fitting.
    none_los, r_quals = zip(*sorted(zip(none_los, r_quals)))

    # Fit ranking quality against log(baseline loss); `regr` is re-fitted for
    # every detector.
    regr.fit(t(np.array(none_los)[:, None]), r_quals)

    x_plot = np.linspace(np.min(none_los), np.max(none_los))
    plt.plot(
        x_plot,
        regr.predict(t(x_plot[:, None])),
        color=full_detector_colors[d_name],
        alpha=0.2,
    )

plt.ylim(0.5, 1)

# Make sure the output folder exists, as the class-balance and rank-vs-perf
# cells do for their own folders.
os.makedirs("figures/perf_vs_detect", exist_ok=True)
tf.savefig(f"figures/perf_vs_detect/{prefix.replace('/','_')}_{exp_alt}")
plt.show()

In [None]:
# Slice of the results used by this cell. `prefix` and `results_cv` are read
# again by the critdd cells that follow.
prefix = "weak/gb"
exp_alt = "allclass"

results_cv = all_results_cv[exp_alt][prefix]["logl"]

# Datasets x detectors table of test log-loss; datasets with a missing value
# for any detector are dropped.
df = results_cv["oracl"].pivot(
    index="dataset_name", columns="detector_name", values="logl_test"
)
df = df.dropna()

# Lower log-loss is better, hence maximize_outcome=False.
diagram = Diagram(
    df.to_numpy(),
    treatment_names=df.columns,
    maximize_outcome=False,
)

diagram.to_file(
    "critdd_logl_test_{}.pdf".format(prefix.replace("/", "_")),
    alpha=0.05,
    adjustment="holm",
    reverse_x=True,
    axis_options={"title": "critdd"},
)

In [None]:
# Baseline detectors compared in the diagrams of this cell.
cols = ["gold", "silver", "none", "wood"]

# Datasets x detectors tables of test log-loss, one per tuning setting.
res_oracl = (
    results_cv["oracl"]
    .pivot(index="dataset_name", columns="detector_name", values="logl_test")
    .dropna()
)

res_clean = (
    results_cv["clean"]
    .pivot(index="dataset_name", columns="detector_name", values="logl_test")
    .dropna()
)

res_noisy = (
    results_cv["noisy"]
    .pivot(index="dataset_name", columns="detector_name", values="logl_test")
    .dropna()
)

# Each table drops its own incomplete datasets, so the three can end up with
# different rows. They are stacked positionally below, so restrict them to the
# datasets present in all three; this changes nothing when they already agree.
common_datasets = res_oracl.index.intersection(res_noisy.index).intersection(
    res_clean.index
)
res_oracl = res_oracl.loc[common_datasets]
res_noisy = res_noisy.loc[common_datasets]
res_clean = res_clean.loc[common_datasets]

# Shape (setting, dataset, detector).
cc = np.stack(
    (res_oracl[cols].to_numpy(), res_noisy[cols].to_numpy(), res_clean[cols].to_numpy())
)

# One diagram per detector, comparing the three settings:
# (detector, dataset, setting) after the transpose.
diagram = Diagrams(
    cc.transpose((2, 1, 0)),
    treatment_names=["oracle", "noisy", "clean"],
    maximize_outcome=False,
    diagram_names=cols,
)

diagram.to_file(
    f"critdd_logl_test_onc_{prefix.replace('/','_')}.pdf",
    alpha=0.05,
    adjustment="holm",
    reverse_x=True,
    axis_options={"title": "critdd"},
)


# One diagram per setting, comparing the detectors.
diagram = Diagrams(
    cc,
    diagram_names=["oracle", "noisy", "clean"],
    maximize_outcome=False,
    treatment_names=cols,
)

diagram.to_file(
    f"critdd_logl_test_comp_{prefix.replace('/','_')}.pdf",
    alpha=0.05,
    adjustment="holm",
    reverse_x=True,
    axis_options={"title": "critdd"},
)

In [None]:
# Stack the three settings into (setting, dataset, detector) and reorder to
# (detector, dataset, setting): one diagram per detector, comparing settings.
cc = np.transpose(
    np.stack(
        [frame[cols].to_numpy() for frame in (res_oracl, res_noisy, res_clean)]
    ),
    (2, 1, 0),
)

diagram = Diagrams(
    cc,
    diagram_names=cols,
    treatment_names=["oracle", "noisy", "clean"],
    maximize_outcome=False,
)

diagram.to_file(
    "critdd_logl_test_oracle_{}.pdf".format(prefix.replace("/", "_")),
    alpha=0.05,
    adjustment="holm",
    reverse_x=True,
    axis_options={"title": "critdd"},
)

In [None]:
# Detectors other than the baselines and "random". Sorted so that the order of
# the diagrams is the same on every run; plain set iteration order is not.
competitors = sorted(set(detectors) - set(baselines + ["random"]))

# Five treatments per detector. The first two repeat the "none" baseline
# (noisy and clean settings) once per competitor so that every slice has shape
# (dataset, competitor). Result after the transpose:
# (competitor, dataset, treatment).
cc = np.stack(
    (
        np.repeat([res_noisy["none"].to_numpy()], len(competitors), axis=0).T,
        np.repeat([res_clean["none"].to_numpy()], len(competitors), axis=0).T,
        res_oracl[competitors].to_numpy(),
        res_noisy[competitors].to_numpy(),
        res_clean[competitors].to_numpy(),
    )
).transpose((2, 1, 0))

diagram = Diagrams(
    cc,
    treatment_names=["none noisy", "none clean", "oracle", "noisy", "clean"],
    maximize_outcome=False,
    diagram_names=competitors,
)

diagram.to_file(
    f"critdd_logl_test_detectors_{prefix.replace('/','_')}.pdf",
    alpha=0.05,
    adjustment="holm",
    reverse_x=True,
    axis_options={"title": "critdd"},
)

In [None]:
# Per dataset: a fixed target number of runs minus the runs recorded so far.
remaining = dict()

n = 12

# Detectors x datasets table counting the recorded `estim_time` values.
run_counts = all_results["byclass"]["weak/klm"].pivot_table(
    index="detector_name",
    columns="dataset_name",
    values="estim_time",
    aggfunc="count",
)

# Iterating a DataFrame with .items() yields one (column, Series) pair per dataset.
for ds, n_runs in run_counts.items():
    # NOTE(review): the target 16*1440 + 4*12 + 120 is hard-coded; what the
    # individual terms stand for is not visible in this notebook. Confirm.
    remaining[ds] = int(16 * 1440 + 4 * 12 + 120 - n_runs.sum())

    print(ds, remaining[ds])

In [None]:
# Hand-written list of dataset names.
# NOTE(review): judging by the variable name only, these are presumably the
# datasets for which an RBF variant exists. Confirm; nothing in the visible
# part of the notebook reads this list.
datasets_with_rbf = [
    "bank-marketing",
    "bioresponse",
    "census",
    "mushroom",
    "phishing",
    "spambase",
    "basketball",
    "commercial",
    "tennis",
    "cifar10",
]

In [None]:
classif = "klm"

# Largest recorded `estim_time` per dataset for the "white_gold" detector,
# slowest dataset first. The expression is the last statement of the cell so
# that the notebook displays it.
(
    all_results["allclass"][f"weak/{classif}"]
    .pivot_table(
        index="dataset_name",
        columns="detector_name",
        values="estim_time",
        aggfunc="max",
    )
    .loc[:, "white_gold"]
    .sort_values(ascending=False)
)

In [None]:
# for d in [
#     "bank-marketing",
#     "bioresponse",
#     "census",
#     "mushroom",
#     "phishing",
#     "spambase",
#     "basketball",
#     "commercial",
#     "tennis",
#     "cifar10",
# ]:
#     for k in ["estim", "estim_byclass", "relabel"]:
#         for n in ["weak", "noise"]:
#             print(f"ls {k}/{n}/klm/*/{d}.json")

In [None]:
# For every prefix of the "allclass" experiment, print the selected detector and
# classifier hyperparameters of the detectors whose base model is "klm".
for k in all_results_cv["allclass"]:
    d = all_results_cv["allclass"][k]["logl"]["clean"]

    print(k)
    for d_name, detector_name, params_classifier, params_detector in d[
        ["dataset_name", "detector_name", "params_classifier", "params_detector"]
    ].itertuples(index=False):
        # Detectors missing from the map give None here and are skipped too.
        if d_base_model_map.get(detector_name) != "klm":
            continue

        print(d_name, detector_name, params_detector, params_classifier)

In [None]:
# Slice of the results shown in this figure.
prefix = "weak/klm"
exp_alt = "allclass"

# Cross-validated results of the "clean" setting (looked up once and reused).
clean_cv = all_results_cv[exp_alt][prefix]["logl"]["clean"]

# Selected classifier hyperparameters per (dataset, detector).
hp_pivot = clean_cv.pivot(
    values="params_classifier", index="dataset_name", columns="detector_name"
)
# Full, non-aggregated results of the same slice.
res = all_results[exp_alt][prefix]
# Test log-loss of the reference detectors, indexed by dataset.
res_none = clean_cv[clean_cv["detector_name"] == "none"][
    ["dataset_name", "logl_test"]
].set_index("dataset_name")
res_silver = clean_cv[clean_cv["detector_name"] == "silver"][
    ["dataset_name", "logl_test"]
].set_index("dataset_name")
res_gold = clean_cv[clean_cv["detector_name"] == "gold"][
    ["dataset_name", "logl_test"]
].set_index("dataset_name")

mult = 1.5
fig, axes = plt.subplots(4, 5, figsize=(12 * mult, 8 * mult))

# Built once and sorted: every subplot then draws the detectors in the same
# order, and that order (hence each curve's default colour) is the same on
# every run, which plain set iteration does not guarantee.
plotted_detectors = sorted(
    set(detectors) - {"gb_consensus", "klm_consensus"} - set(baselines)
)

# One subplot per dataset, filled row by row in the 4 x 5 grid.
for i, dataset in enumerate(datasets):
    axis = axes[i // 5][i % 5]

    for detector in plotted_detectors:
        # Keep only the runs that used the selected classifier hyperparameters.
        hp_this = hp_pivot.loc[dataset, detector]
        logl_this = res[(res.dataset_name == dataset) & (res.detector_name == detector)]
        logl_this = logl_this[logl_this.params_classifier.apply(lambda x: x == hp_this)]

        split_quantiles = []
        logls = []
        for p_split, logl in logl_this[["params_splitter", "logl_test"]].itertuples(
            index=False
        ):
            split_quantiles.append(p_split["quantile"])
            logls.append(logl)

        axis.plot(split_quantiles, logls, label=detector)

    # Horizontal reference levels of the three baseline detectors.
    axis.plot([0, 1], [res_none.loc[dataset].values] * 2, color="red")
    axis.plot([0, 1], [res_silver.loc[dataset].values] * 2, color="silver")
    axis.plot([0, 1], [res_gold.loc[dataset].values] * 2, color="gold")

    custom_grid(axis)
    axis.set_title(dataset)
    axis.set_xlabel("split quantile")
    axis.set_ylabel("log loss test")
    axis.set_yscale("log")

# One shared legend, taken from the first subplot (all subplots draw the same
# detectors in the same order).
handles, labels = axes[0][0].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower right')

fig.tight_layout()
plt.show()

In [None]:
# (corruption prefix, folder holding one sub-folder per detection method).
detect_dirs = [
    (
        "weak",
        os.path.join(os.path.expanduser("~"), f"{output_dir}/detect/weak"),
    ),
    (
        "noise",
        os.path.join(os.path.expanduser("~"), f"{output_dir}/detect/noise"),
    ),
]

# prefix -> one DataFrame concatenating every "<method>/<dataset>.json" file.
all_detect = dict()
for prefix, detect_dir in detect_dirs:
    results_ = []
    methods = os.listdir(detect_dir)

    for method in methods:
        method_dir = os.path.join(detect_dir, method)
        # Stray files next to the method folders would make os.listdir fail.
        if not os.path.isdir(method_dir):
            continue

        for fname in os.listdir(method_dir):
            # splitext copes with names that hold no dot or several dots, where
            # unpacking fname.split(".") into two names raises ValueError.
            dataset, ext = os.path.splitext(fname)
            if ext != ".json":
                continue

            with open(os.path.join(method_dir, fname)) as json_file:
                results_.append(pd.read_json(json_file, orient="records"))

    results_ = pd.concat(results_)

    all_detect[prefix] = results_.reset_index(drop=True)
    # 1-based position of each row within its (dataset, detector) group;
    # plot_class_balance subtracts 1 to address the stored trust scores.
    all_detect[prefix]["detect_run"] = (
        all_detect[prefix].groupby(["dataset_name", "detector_name"]).cumcount() + 1
    )

# `all_results` is filled at the top of the notebook as
# {experiment: {prefix: DataFrame}}, whereas this cell was written against a
# flat {prefix: DataFrame} mapping. Resolve to the "allclass" experiment when
# the nested layout is present and fall back to the mapping itself otherwise.
# NOTE(review): "allclass" is assumed to be the intended experiment. Confirm.
results_by_prefix = all_results.get("allclass", all_results)

# Back-fill missing detector parameters from the "params" column, then drop
# that column. Frames without a "params" column (for instance when this cell
# is run a second time) are left untouched instead of raising KeyError.
for k, v in results_by_prefix.items():
    if "params" not in v.columns:
        continue
    missing_params = pd.isna(v["params_detector"])
    v.loc[missing_params, "params_detector"] = v.loc[missing_params, "params"]
    v.drop("params", inplace=True, axis=1)


# prefix -> klm results joined with the detect results of the same
# (dataset, detector, detector parameters).
all_merged = dict()
for prefix in ["weak", "noise"]:
    klm_results = results_by_prefix[prefix + "/klm"]

    # The parameters are turned into strings in place on both sides, since the
    # merge below uses them as a join key.
    klm_results["params_detector"] = klm_results["params_detector"].astype(str)
    all_detect[prefix]["params"] = all_detect[prefix]["params"].astype(str)

    all_merged[prefix] = pd.merge(
        klm_results,
        all_detect[prefix],
        left_on=["dataset_name", "detector_name", "params_detector"],
        right_on=["dataset_name", "detector_name", "params"],
    )

    # NOTE(review): eval() executes whatever text is stored in the column. It is
    # acceptable only because the strings are produced by astype(str) just
    # above from result files written by this benchmark; do not point this cell
    # at untrusted result files.
    all_merged[prefix]["params_detector"] = all_merged[prefix]["params_detector"].apply(
        eval
    )

In [None]:
# Root folder of the detection outputs; plot_class_balance joins it with the
# corruption prefix to locate the per-detector result files.
detect_path = os.path.join(os.path.expanduser("~"), f"{output_dir}/detect")
import json
import h5py
import numpy as np
import pandas as pd
from scipy.stats import entropy


class TrustScoreReader:
    """Read the detection runs stored for one (detector, dataset) pair.

    Two sibling files are opened under ``base_path/detector``:
    ``<dataset>.json`` (loaded fully into ``results_json``) and
    ``<dataset>.hdf5`` (kept open read-only as ``results_hdf5``). The number of
    entries under ``"trust_scores"`` in the HDF5 file must match the number of
    JSON records.

    The HDF5 handle stays open until ``close()`` is called; the reader can also
    be used as a context manager, which closes it on exit.
    """

    def __init__(self, base_path, dataset, detector):
        """Open both files.

        Raises
        ------
        AssertionError
            If the two files do not hold the same number of runs. The HDF5
            file is closed before the error is raised.
        """

        with open(os.path.join(base_path, detector, f"{dataset}.json"), mode="r") as f:
            self.results_json = json.load(f)
        self.results_hdf5 = h5py.File(
            os.path.join(base_path, detector, f"{dataset}.hdf5"), "r"
        )

        n_scores = len(self.results_hdf5["trust_scores"])
        n_records = len(self.results_json)
        if n_scores != n_records:
            # Do not leave the file open behind a failed construction.
            self.results_hdf5.close()
            raise AssertionError(
                f"{n_scores} trust score entries but {n_records} JSON records"
            )

    def get(self, i):
        """Return ``(record, trust_scores)`` for run ``i`` (0-based).

        The scores are read with ``[...]``, i.e. the whole entry stored under
        ``trust_scores/<i>`` is fetched.
        """
        return self.results_json[i], self.results_hdf5[f"trust_scores/{i}"][...]

    def length(self):
        """Return the number of JSON records."""
        return len(self.results_json)

    def __len__(self):
        """Same as ``length()``, so that ``len(reader)`` works."""
        return self.length()

    def close(self):
        """Close the underlying HDF5 file."""
        self.results_hdf5.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception raised inside the block propagate.
        return False

In [None]:
from datasets import get_weak_datasets, datasets_ranked_by_time
import os

# Folder passed to the dataset loader as its cache.
datasets_folder = os.path.join(os.path.expanduser("~"), "datasets")

# All ranked datasets except "imdb136" and "cifar10", in their original order.
datasets = [
    name for name in datasets_ranked_by_time if name not in ["imdb136", "cifar10"]
]

weak_datasets = get_weak_datasets(
    datasets=datasets,
    corruption="weak",
    seed=1,
    cache_folder=datasets_folder,
)

# NOTE(review): unlike the "weak" call, this one receives the unfiltered
# `datasets_ranked_by_time` list. Confirm that this is intended.
noise_datasets = get_weak_datasets(
    datasets=datasets_ranked_by_time,
    corruption="noise",
    seed=1,
    cache_folder=datasets_folder,
)

# Keyed by corruption prefix, as looked up in plot_class_balance.
loaded_datasets = {"weak": weak_datasets, "noise": noise_datasets}

In [None]:
from itertools import product


def plot_class_balance(prefix, reference="clean", k="oracl"):
    """Scatter the class balance of the trusted training labels against a reference.

    For every (detector, dataset) pair of ``all_merged[prefix]`` the row with
    the lowest ``logl_<split>`` is selected. Its trust scores are read back
    from disk and the training labels that pass the selected quantile (or the
    selected threshold, for detectors whose name contains "consensus") form the
    trusted subset. "Class balance" is the ratio between the smallest and the
    largest class frequency. The figure is saved under
    ``figures/class_balance/`` and shown.

    Parameters
    ----------
    prefix : str
        Key into ``all_merged`` and ``loaded_datasets``, and sub-folder of
        ``detect_path``.
    reference : str, default "clean"
        "clean" puts the balance of the test labels on the x axis; any other
        value puts the balance of the noisy training labels there.
    k : str, default "oracl"
        One of "oracl", "clean", "noisy"; selects the loss column
        (``logl_test``, ``logl_val``, ``logl_noisy_val``) used to pick the best row.
    """

    mapping = {"oracl": "test", "clean": "val", "noisy": "noisy_val"}

    # idxmin returns index *labels*. Resetting the index on a copy makes labels
    # and positions coincide, and .loc is the label-based accessor that goes
    # with idxmin.
    merged = all_merged[prefix].reset_index(drop=True)
    best_cv = merged.groupby(["dataset_name", "detector_name"])[
        f"logl_{mapping[k]}"
    ].idxmin()
    best_rows = merged.loc[best_cv]

    # Detector x dataset tables describing the selected row of each pair.
    best_trust_scores_index = best_rows.pivot(
        index="detector_name",
        columns="dataset_name",
        values="detect_run",
    )
    best_splitter = best_rows.pivot(
        index="detector_name",
        columns="dataset_name",
        values="params_splitter",
    )
    best_quantile = best_splitter.map(lambda params: params.get("quantile"))
    best_threshold = best_splitter.map(lambda params: params.get("threshold"))

    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    for detector_name, dataset_name in product(
        best_trust_scores_index.index, best_trust_scores_index.columns
    ):
        trust_score_reader = TrustScoreReader(
            os.path.join(detect_path, prefix), dataset_name, detector_name
        )
        try:
            # detect_run is 1-based, the stored runs are 0-based.
            trust_scores = trust_score_reader.get(
                best_trust_scores_index.loc[detector_name, dataset_name] - 1
            )[1]
        finally:
            # The scores are already in memory; release the HDF5 handle so that
            # one open file per (detector, dataset) pair does not accumulate.
            trust_score_reader.results_hdf5.close()

        dataset = loaded_datasets[prefix][dataset_name]
        # Convert before comparing so that the mask is element-wise even when
        # the labels are not stored as a NumPy array.
        y_train_noisy = np.asarray(dataset["train"]["noisy_target"])
        y_test = dataset["test"]["target"]
        # Examples labelled -1 are treated as unlabeled and left out.
        unlabeled = y_train_noisy == -1
        y_train_noisy = y_train_noisy[~unlabeled]
        n_classes = len(np.unique(y_test))

        if "consensus" in detector_name:
            trusted = trust_scores >= best_threshold.loc[detector_name, dataset_name]
        else:
            trusted = trust_scores >= np.quantile(
                trust_scores, q=best_quantile.loc[detector_name, dataset_name]
            )

        # Class frequencies of the trusted subset, the test set and the full
        # noisy training set.
        prior_trusted = np.bincount(y_train_noisy[trusted], minlength=n_classes) / len(
            y_train_noisy[trusted]
        )
        prior_clean = np.bincount(y_test, minlength=n_classes) / len(y_test)
        prior_noisy = np.bincount(y_train_noisy, minlength=n_classes) / len(
            y_train_noisy
        )

        x_clean = np.min(prior_clean) / np.max(prior_clean)
        x_noisy = np.min(prior_noisy) / np.max(prior_noisy)
        x = x_clean if reference == "clean" else x_noisy
        y = np.min(prior_trusted) / np.max(prior_trusted)
        plt.scatter(
            x,
            y,
            # Blue when the noisy labels are more balanced than the test labels.
            color="tab:blue" if x_noisy > x_clean else "tab:orange",
            alpha=0.7,
            s=12,
        )

    plt.ylabel("filtered class balance")
    if reference == "clean":
        plt.xlabel("clean class balance")
    else:
        plt.xlabel("noisy class balance")

    custom_grid(plt.gca())

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    # Identity line: filtering left the class balance unchanged.
    plt.plot(np.linspace(0, 1, 1000), np.linspace(0, 1, 1000), color="black")

    # Unlabelled minor ticks every 0.1, labelled major ticks at 0 and 1 only.
    plt.xticks(np.arange(0, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0, 1, 0.1), [], minor=True)
    plt.xticks([0, 1], [0, 1], minor=False)
    plt.yticks([0, 1], [0, 1], minor=False)

    tf.savefig(f"figures/class_balance/{prefix.replace('/','_')}_{reference}")
    plt.show()


# makedirs also creates a missing parent "figures" folder (os.mkdir would fail
# there) and exist_ok=True makes a separate existence check unnecessary.
os.makedirs("figures/class_balance", exist_ok=True)

# Both corruption types, each against the clean and the noisy reference.
plot_class_balance("noise", reference="clean")
plot_class_balance("noise", reference="noisy")
plot_class_balance("weak", reference="clean")
plot_class_balance("weak", reference="noisy")

In [None]:
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR


def plot_rank_vs_perf(prefix, k):
    """Scatter normalized log-loss against ranking quality and fit a trend line.

    For each detector other than the baselines and "random", the
    ``global_ranking_quality`` of its best row per dataset goes on the x axis
    and a normalized loss on the y axis. With r = (detector - none) /
    (silver - none) computed on ``all_val_cv["allclass"][prefix]["logl"][k]``,
    the plotted value is (2 - r) * 100. A Huber regression is fitted on the
    points with ranking quality >= 0.5; its coefficient and R^2 are printed.
    The figure is saved under ``figures/rank_vs_perf/`` and shown.

    Parameters
    ----------
    prefix : str
        Result prefix such as "noise/klm"; also used for the file name.
    k : str
        One of "oracl", "clean", "noisy"; selects the loss table and the loss
        column (``logl_test``, ``logl_val``, ``logl_noisy_val``) used to pick
        the best row.
    """
    logl_cv_allclass = all_val_cv["allclass"][prefix]["logl"][k]
    mapping = {"oracl": "test", "clean": "val", "noisy": "noisy_val"}

    # `all_results` is filled at the top of the notebook as
    # {experiment: {prefix: DataFrame}}; use its "allclass" entry when present
    # and fall back to a flat {prefix: DataFrame} mapping otherwise.
    # NOTE(review): "allclass" is assumed from the `all_val_cv["allclass"]`
    # lookup above. Confirm.
    results_by_prefix = all_results.get("allclass", all_results)

    # idxmin returns index *labels*. Resetting the index on a copy makes labels
    # and positions coincide, and .loc is the label-based accessor that goes
    # with idxmin.
    results = results_by_prefix[prefix].reset_index(drop=True)
    best_rows = results.loc[
        results.groupby(["dataset_name", "detector_name"])[
            f"logl_{mapping[k]}"
        ].idxmin()
    ]
    max_rank_global = best_rows.pivot(
        index="detector_name",
        columns="dataset_name",
        values="global_ranking_quality",
    )
    tf.figure(width=TMLR_textwidth * (2 / 5), ratio=1, pad=0.5)

    list_perf_allclass = []
    list_perf_rank = []

    # Reference rows are the same for every detector.
    logl_none = logl_cv_allclass.loc[logl_cv_allclass.index == "none"].values
    logl_silver = logl_cv_allclass.loc[logl_cv_allclass.index == "silver"].values

    # Sorted so that the drawing order is the same on every run.
    for detector in sorted(set(detectors) - set(baselines) - {"random"}):

        logl_detector = logl_cv_allclass.loc[logl_cv_allclass.index == detector].values
        # The 1e-10 term guards against a zero denominator.
        perf_allclass = (logl_detector - logl_none) / (logl_silver - logl_none + 1e-10)
        perf_allclass = (2 - perf_allclass) * 100
        # NOTE(review): x and y are paired by position, which assumes that the
        # columns of `max_rank_global` and of `logl_cv_allclass` list the
        # datasets in the same order. Confirm.
        perf_rank = max_rank_global.loc[
            max_rank_global.index == detector
        ].values.ravel()

        plt.scatter(
            perf_rank,
            perf_allclass,
            color=full_detector_colors[detector],
            marker=full_detector_markers[detector],
            s=12,
            alpha=0.7,
        )

        list_perf_allclass.append(perf_allclass.ravel())
        list_perf_rank.append(perf_rank)

    plt.xlabel("ranking quality")
    plt.ylabel("normalized test log loss")

    custom_grid(plt.gca())

    plt.xlim(0.5, 1)
    plt.ylim(0, 300)

    y = np.concatenate(list_perf_allclass)
    X = np.concatenate(list_perf_rank).reshape(-1, 1)
    # Fit only on the visible x range. The comparison already drops NaN ranking
    # qualities; non-finite targets are dropped too because the regressor
    # rejects them.
    keep = (X.ravel() >= 0.5) & np.isfinite(y)
    y = y[keep]
    X = X[keep]
    lr = HuberRegressor().fit(X, y)
    print(lr.coef_)
    print(r2_score(y, lr.predict(X)))

    x = np.linspace(0.5, 1, 1000)
    plt.plot(x, lr.predict(x.reshape(-1, 1)), color="black", alpha=0.7)

    # Unlabelled minor ticks, labelled major ticks at a few values only.
    plt.xticks(np.arange(0.5, 1, 0.1), [], minor=True)
    plt.yticks(np.arange(0, 300, 25), [], minor=True)
    plt.xticks([0.5, 1], [0.5, 1], minor=False)
    plt.yticks([100, 200], [100, 200], minor=False)

    tf.savefig(f"figures/rank_vs_perf/{prefix.replace('/','_')}")
    plt.show()


# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the folder and creating it.
os.makedirs("figures/rank_vs_perf", exist_ok=True)

# One figure per (corruption, classifier) prefix, always for the "oracl" setting.
plot_rank_vs_perf(prefix="noise/klm", k="oracl")
plot_rank_vs_perf(prefix="weak/klm", k="oracl")
plot_rank_vs_perf(prefix="noise/gb", k="oracl")
plot_rank_vs_perf(prefix="weak/gb", k="oracl")