In [None]:
from datasets import get_weak_datasets, datasets_ranked_by_time
import os

# Datasets are cached in a "datasets" folder inside the user's home directory.
home_dir = os.path.expanduser("~")
datasets_folder = os.path.join(home_dir, "datasets")

# Load every dataset listed in `datasets_ranked_by_time`, requesting the
# "weak" corruption with a fixed seed.
weak_datasets = get_weak_datasets(
    datasets=datasets_ranked_by_time,
    corruption="weak",
    seed=1,
    cache_folder=datasets_folder,
)

In [None]:
import math
import operator
from functools import reduce

import numpy as np
from sklearn.metrics import confusion_matrix

from mislabeled.preprocessing import WeakLabelEncoder

# Compute summary statistics for every weakly supervised dataset.
# `stats` maps a dataset name to a dict holding: n_samples, task, n_features,
# n_original_features, n_classes, priors, n_weak_targets, coverage,
# noise_ratio, noise_transition and ergodicity.
splits = ["train", "validation", "test"]
stats = {}
for dataset in weak_datasets.keys():
    dataset_splits = weak_datasets[dataset]
    train = dataset_splits["train"]
    summary = stats[dataset] = {}

    # Number of samples, summed over all splits.
    summary["n_samples"] = sum(
        dataset_splits[split]["data"].shape[0] for split in splits
    )

    # A dataset is "text" when its raw train inputs are stored as a list;
    # cifar10 is special-cased as the image dataset.
    is_text = "raw" in train and isinstance(train["raw"], list)
    summary["task"] = "text" if is_text else "tabular"
    if dataset == "cifar10":
        summary["task"] = "image"

    summary["n_features"] = train["data"].shape[1]
    if summary["task"] == "text":
        # For text, the raw dimensionality is the number of distinct
        # whitespace-separated tokens seen across all splits.
        vocabulary = set()
        for split in splits:
            for sentence in dataset_splits[split]["raw"]:
                vocabulary.update(sentence.split())
        summary["n_original_features"] = len(vocabulary)
    elif "raw" in train:
        summary["n_original_features"] = train["raw"].shape[1]
    else:
        summary["n_original_features"] = summary["n_features"]

    # Class priors: label frequencies pooled over all splits.
    n_classes = len(train["target_names"])
    summary["n_classes"] = n_classes
    class_counts = np.zeros(n_classes)
    for split in splits:
        class_counts += np.bincount(
            dataset_splits[split]["target"], minlength=n_classes
        )
    summary["priors"] = (class_counts / np.sum(class_counts)).tolist()

    summary["n_weak_targets"] = train["weak_targets"].shape[1]

    # Aggregate the weak targets into one noisy label per train sample;
    # -1 marks samples that received no label.
    noisy_targets = WeakLabelEncoder(random_state=1).fit_transform(
        train["weak_targets"]
    )
    true_targets = np.asarray(train["target"])
    unlabeled = noisy_targets == -1
    has_unlabeled = bool(unlabeled.any())

    # Coverage is the (floored) percentage of train samples that got a label;
    # the noise ratio is measured on those labeled samples only.
    summary["coverage"] = math.floor((1 - np.mean(unlabeled)) * 100)
    summary["noise_ratio"] = np.mean(
        noisy_targets[~unlabeled] != true_targets[~unlabeled]
    )

    # Unlabeled samples are mapped to an extra class index `n_classes` so they
    # get their own row in the matrix. Labels are passed explicitly so the
    # matrix shape does not depend on which classes happen to occur in the
    # train split.
    noisy_targets[unlabeled] = n_classes
    labels = np.arange(n_classes + 1 if has_unlabeled else n_classes)
    noise_transition = confusion_matrix(
        noisy_targets,
        true_targets,
        labels=labels,
        normalize="pred",
    )
    if has_unlabeled:
        # No true target equals the "unlabeled" index: drop that empty column.
        noise_transition = noise_transition[:, :-1]
    summary["noise_transition"] = noise_transition.tolist()

    # Half of the largest L1 distance between two rows of the matrix.
    # NOTE(review): normalize="pred" normalises over the second argument (the
    # true targets), i.e. per column, while this loop compares rows. Confirm
    # that rows are the intended distributions.
    ergodicity = np.empty((n_classes, n_classes))
    for i in range(n_classes):
        for j in range(n_classes):
            ergodicity[i, j] = np.sum(
                np.abs(noise_transition[i] - noise_transition[j])
            )
    summary["ergodicity"] = 0.5 * np.max(ergodicity)

In [None]:
# Benchmark each dataset is attributed to; any dataset not listed here is
# attributed to "wrench".
benchmark_by_dataset = {
    "imdb136": "weasel",
    "amazon": "weasel",
    "professor_teacher": "weasel",
    "yoruba": "waln",
    "hausa": "waln",
    "cifar10": "cifar10n-agg",
}
for dataset in weak_datasets.keys():
    stats[dataset]["benchmark"] = benchmark_by_dataset.get(dataset, "wrench")

In [None]:
# Rename "professor_teacher" to its hyphenated display name and exclude
# "imdb136" and "cifar10" from the summary. Written so that the cell can be
# re-run (or run when a dataset was not loaded) without raising KeyError, and
# so that it does not echo a whole stats dict as cell output.
if "professor_teacher" in stats:
    stats["professor-teacher"] = stats.pop("professor_teacher")
for excluded_dataset in ("imdb136", "cifar10"):
    stats.pop(excluded_dataset, None)

In [None]:
def safe_num(num):
    """Return the magnitude of ``num`` rounded to three significant digits.

    Strings are first parsed with ``float``. The sign is discarded, and the
    result is always a ``float``.
    """
    value = float(num) if isinstance(num, str) else num
    magnitude = abs(value)
    return float(format(magnitude, ".3g"))


def format_number(num):
    """Format ``num`` compactly with a K/M/B/T magnitude suffix.

    The value is first normalised by ``safe_num`` (absolute value, three
    significant digits). It is then divided by the largest unit that keeps the
    result at or above 1, and trailing zeros and a trailing decimal point are
    stripped. Values below 1 are left unscaled and get no suffix.
    """
    num = safe_num(num)

    # Units from largest to smallest; the first one that fits is used.
    units = (
        ("T", 1000000000000),
        ("B", 1000000000),
        ("M", 1000000),
        ("K", 1000),
        ("", 1),
    )

    sign = ""
    for suffix, scale in units:
        scaled = num / scale
        if scaled >= 1:
            num, sign = scaled, suffix
            break

    digits = str(num).rstrip("0").rstrip(".")
    return f"{digits}{sign}"

In [None]:
import matplotlib.pyplot as plt


def priors_to_img(priors):
    """Draw class priors as a small bar-chart image and return the figure.

    Each class gets a band of ``cell`` pixel columns in a square image of
    side ``cell * len(priors)``. Within a band, the top rows are set to 1 and
    the bottom ``round(prior * side)`` rows stay 0, so the bar height is
    proportional to the prior. The image is shown with a gray colormap on an
    axis-free 2x2 inch figure.
    """
    cell = 5
    n_classes = len(priors)
    side = cell * n_classes
    bar_heights = np.round(np.asarray(priors) * side)

    # A pixel is "background" (1) while its row index, counted from the top,
    # is smaller than the space left above the bar of its class.
    row_index = np.arange(side)[:, np.newaxis]
    background = row_index < (side - bar_heights)[np.newaxis, :]
    # Widen each class column to `cell` pixels.
    data = np.repeat(background, cell, axis=1).astype(int)

    figure, axes = plt.subplots(figsize=(2, 2))
    axes.imshow(data, cmap="gray", vmin=0, vmax=1)
    axes.axis("off")
    return figure


def t_to_img(t):
    """Draw matrix ``t`` as a grayscale image and return the figure.

    The plotted values are ``1 - t`` on a gray colormap clipped to [0, 1],
    shown on an axis-free 2x2 inch figure.
    """
    figure, axes = plt.subplots(figsize=(2, 2))
    inverted = np.subtract(1, np.asarray(t))
    axes.imshow(inverted, cmap="gray", vmin=0, vmax=1)
    axes.axis("off")
    return figure


def plot_to_include_file(dataset, plot, folder="priors"):
    """Save ``plot`` as ``<folder>/<dataset>.png`` and return a LaTeX snippet.

    Parameters
    ----------
    dataset : str
        Name used for the PNG file (without extension).
    plot : object with a ``savefig`` method
        The figure to save; it is written with a tight bounding box and a
        transparent background.
    folder : str or path-like, default "priors"
        Destination folder, created if it does not exist.

    Returns
    -------
    str
        A ``\\parbox`` wrapping an ``\\includegraphics`` of height 16pt that
        points at the saved file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    plot.savefig(
        os.path.join(folder, f"{dataset}.png"), bbox_inches="tight", transparent=True
    )
    # LaTeX expects forward slashes, whatever separator the OS uses.
    latex_folder = os.fspath(folder).replace(os.sep, "/")
    return f"\\parbox[c]{{16pt}}{{\\includegraphics[height=16pt]{{{latex_folder}/{dataset}.png}}}}"

In [None]:
def get_index_length(df, idx):
    """Return how many rows of ``df`` have an index label containing ``idx``."""
    matching_rows = df.filter(like=idx, axis=0)
    return matching_rows.shape[0]


def format_index(df, idx):
    """Return LaTeX that prints ``idx`` upper-cased and rotated by 90 degrees.

    ``df`` is accepted but not used.
    """
    label = idx.upper()
    return "\\rotatebox[origin=c]{90}{" + label + "}"

In [None]:
from functools import partial
import pandas as pd


# Build the LaTeX table summarising the datasets in `stats` and write it,
# together with the per-dataset images it includes, under `output_path`.
stats_df = pd.DataFrame.from_dict(stats, orient="index").reset_index(names=["dataset"])

# Maps each statistic to its LaTeX column header. The key order is also the
# column order of the final table; statistics not listed here are left out.
columns_mapper = {
    "benchmark": "\\textbf{Benchmark}",
    "task": "\\textbf{Task}",
    "dataset": "\\textbf{Dataset}",
    "n_samples": "$n$",
    "n_original_features": "$d$",
    "n_features": "$\\phi(d)$",
    "n_classes": "$K$",
    "priors": "$p(y)$",
    "n_weak_targets": "LRs",
    "noise_transition": "$\\mathbf{T}$",
    "noise_ratio": "$p(\\tilde{y}\\neq y)$",
    "coverage": "coverage",
}
n_columns = len(columns_mapper)

output_path = "dataset_table"
# Make sure the output folder exists before anything is written into it.
os.makedirs(output_path, exist_ok=True)


for column in ["n_samples", "n_original_features", "n_features", "n_weak_targets"]:
    stats_df[column] = stats_df[column].apply(format_number)

# Replace the priors and transition matrices with \includegraphics snippets
# pointing at freshly rendered images. Whole columns are assigned at once:
# chained indexing (`stats_df["priors"][i] = ...`) does not write back to the
# frame under pandas Copy-on-Write.
priors_cells = []
transition_cells = []
for name, priors, transition in zip(
    stats_df["dataset"], stats_df["priors"], stats_df["noise_transition"]
):
    priors_fig = priors_to_img(priors)
    priors_cells.append(
        plot_to_include_file(
            name, priors_fig, folder=os.path.join(output_path, "priors")
        )
    )
    # Close each figure once saved so they do not pile up in memory.
    plt.close(priors_fig)

    transition_fig = t_to_img(transition)
    transition_cells.append(
        plot_to_include_file(
            name,
            transition_fig,
            folder=os.path.join(output_path, "noise_transitions"),
        )
    )
    plt.close(transition_fig)
stats_df["priors"] = priors_cells
stats_df["noise_transition"] = transition_cells

stats_df["coverage"] = stats_df["coverage"].apply(lambda cov: f"{cov}\\%")

# for column in ["benchmark", "task"]:
#     stats_df[column] = stats_df[column].apply(partial(format_index, stats_df))

# Keep only the mapped columns (this drops "ergodicity"), in table order.
stats_df = stats_df[list(columns_mapper)]
stats_df = stats_df.rename(columns=columns_mapper)

index_columns = ["benchmark", "task", "dataset"]
index_headers = [columns_mapper[column] for column in index_columns]
stats_df = stats_df.sort_values(index_headers)
stats_df = stats_df.set_index(index_headers)


# Description -> math symbol for each data column, listed in the caption.
columns_caption = {
    "dataset size": "n",
    "number of raw features": "d",
    "number of encoded features": "\\phi(d)",
    "number of classes": "K",
    "histogram of class priors": "p(y)",
    "number of labeling rules": "\\text{LRs}",
    "noise transition matrix": "\\mathbf{T}",
    "noise ratio": "p(\\tilde{y}\\neq y)",
    "percent of weakly labeled examples": "\\text{coverage}",
}
caption = (
    "Datasets used to benchmark detectors. Columns: "
    + ", ".join(f"{name} ${symbol}$" for name, symbol in columns_caption.items())
    + "."
)

# Render the table before opening the output file so that a rendering error
# does not leave a truncated table.tex behind.
latex = stats_df.style.format(
    precision=2, subset=[columns_mapper["noise_ratio"]]
).to_latex(
    hrules=True,
    column_format="l" * len(index_columns) + "r" * (n_columns - len(index_columns)),
    multirow_align="t",
    multicol_align="c",
    clines="skip-last;data",
    position="!h",
    label="tab:datasets",
    caption=caption,
)

# Post-process the generated LaTeX: centre the table, scale it to the column
# width, and collapse doubled \cline rules into a single full-width one.
latex_lines = latex.split("\n")
latex_lines.insert(1, "\\centering")
latex_lines.insert(3, "\\resizebox{\\columnwidth}{!}{")
latex_lines.insert(-2, "}")
doubled_cline = f"\\cline{{1-{n_columns}}} \\cline{{2-{n_columns}}}"
single_cline = f"\\cline{{1-{n_columns}}}"
latex_lines = [
    single_cline if line == doubled_cline else line for line in latex_lines
]

with open(os.path.join(output_path, "table.tex"), "w", encoding="utf-8") as table:
    table.write("\n".join(latex_lines))

In [None]:
# Display the noise transition matrix stored in `stats` for the "youtube" dataset.
stats["youtube"]["noise_transition"]