In [None]:
import os
import sys
# Treat the parent of the current working directory as the project root:
# export it via the PROJECT_DIR environment variable and make it importable.
_project_dir = os.path.dirname(os.getcwd())
os.environ['PROJECT_DIR'] = _project_dir
sys.path.append(_project_dir)
# The temporary name is removed to keep the notebook namespace clean.
del _project_dir

from copy import deepcopy
from pathlib import Path
from typing import Any, Optional, Union, cast

import pandas as pd

import lib; lib.configure_libraries()

# Allow up to 1000 rows to be shown when a dataframe is displayed.
pd.set_option("display.max_rows", 1000)

In [None]:
# Dataset properties that are carried through the aggregation and become
# index levels of the metrics dataframe (see build_metrics_dataframe).
DATASET_PROPERTIES = ["task_type", "size", "n_features"]
# Cache of dataset descriptions filled by get_dataset_info, keyed by dataset path.
_DATASETS_INFO: dict[Path, dict[str, Any]] = {}

# The main datasets. In get_dataset_info, a dataset from this list located
# directly in lib.DATA_DIR is labeled by the first two letters of its name
# in upper case (for example, "house" -> "HO").
DATASETS_MAIN = [
    'churn',
    'california',
    'house',
    'adult',
    'diamond',
    'otto',
    'higgs-small',
    'black-friday',
    'weather-small',
    'covtype',
    'microsoft',
]
# The datasets from the paper "Why do tree-based models still outperform deep learning on tabular data?"
DATASETS_WHY = [
    'classif-cat-large-0-covertype',
    'classif-cat-large-0-road-safety',
    'classif-cat-medium-0-KDDCup09_upselling',
    'classif-cat-medium-1-KDDCup09_upselling',
    'classif-cat-medium-2-KDDCup09_upselling',
    'classif-cat-medium-0-compass',
    'classif-cat-medium-1-compass',
    'classif-cat-medium-0-electricity',
    'classif-cat-medium-0-rl',
    'classif-cat-medium-1-rl',
    'classif-cat-medium-2-rl',
    'classif-num-large-0-Higgs',
    'classif-num-large-0-MiniBooNE',
    'classif-num-large-0-jannis',
    'classif-num-medium-0-MagicTelescope',
    'classif-num-medium-1-MagicTelescope',
    'classif-num-medium-2-MagicTelescope',
    'classif-num-medium-0-bank-marketing',
    'classif-num-medium-1-bank-marketing',
    'classif-num-medium-2-bank-marketing',
    'regression-num-medium-0-california',
    'classif-num-medium-0-credit',
    'classif-num-medium-1-credit',
    'regression-num-medium-0-house_16H',
    'classif-num-medium-0-kdd_ipums_la_97-small',
    'classif-num-medium-1-kdd_ipums_la_97-small',
    'classif-num-medium-2-kdd_ipums_la_97-small',
    'classif-num-medium-0-phoneme',
    'classif-num-medium-1-phoneme',
    'classif-num-medium-2-phoneme',
    'classif-num-medium-3-phoneme',
    'classif-num-medium-4-phoneme',
    'regression-num-medium-0-pol',
    'regression-num-medium-1-pol',
    'classif-num-medium-0-wine',
    'classif-num-medium-1-wine',
    'classif-num-medium-2-wine',
    'classif-num-medium-3-wine',
    'classif-num-medium-4-wine',
    'regression-cat-large-0-SGEMM_GPU_kernel_performance',
    'regression-cat-large-0-black_friday',
    'regression-cat-large-0-diamonds',
    'regression-cat-large-0-nyc-taxi-green-dec-2016',
    'regression-cat-large-0-particulate-matter-ukair-2017',
    'regression-cat-medium-0-Bike_Sharing_Demand',
    'regression-cat-medium-1-Bike_Sharing_Demand',
    'regression-cat-medium-0-Brazilian_houses',
    'regression-cat-medium-1-Brazilian_houses',
    'regression-cat-medium-2-Brazilian_houses',
    'regression-cat-medium-0-Mercedes_Benz_Greener_Manufacturing',
    'regression-cat-medium-1-Mercedes_Benz_Greener_Manufacturing',
    'regression-cat-medium-2-Mercedes_Benz_Greener_Manufacturing',
    'regression-cat-medium-3-Mercedes_Benz_Greener_Manufacturing',
    'regression-cat-medium-4-Mercedes_Benz_Greener_Manufacturing',
    'regression-cat-medium-0-OnlineNewsPopularity',
    'regression-cat-medium-0-analcatdata_supreme',
    'regression-cat-medium-1-analcatdata_supreme',
    'regression-cat-medium-2-analcatdata_supreme',
    'regression-cat-medium-3-analcatdata_supreme',
    'regression-cat-medium-4-analcatdata_supreme',
    'regression-cat-medium-0-house_sales',
    'regression-cat-medium-0-visualizing_soil',
    'regression-cat-medium-1-visualizing_soil',
    'regression-cat-medium-2-visualizing_soil',
    'regression-cat-medium-0-yprop_4_1',
    'regression-cat-medium-1-yprop_4_1',
    'regression-cat-medium-2-yprop_4_1',
    'regression-num-large-0-year',
    'regression-num-medium-0-Ailerons',
    'regression-num-medium-1-Ailerons',
    'regression-num-medium-2-Ailerons',
    'regression-num-medium-0-MiamiHousing2016',
    'regression-num-medium-1-MiamiHousing2016',
    'regression-num-medium-2-MiamiHousing2016',
    'regression-num-medium-0-cpu_act',
    'regression-num-medium-1-cpu_act',
    'regression-num-medium-2-cpu_act',
    'regression-num-medium-0-elevators',
    'regression-num-medium-1-elevators',
    'regression-num-medium-0-fifa',
    'regression-num-medium-1-fifa',
    'regression-num-medium-0-houses',
    'regression-num-medium-0-isolet',
    'regression-num-medium-1-isolet',
    'regression-num-medium-2-isolet',
    'regression-num-medium-0-medical_charges',
    'regression-num-medium-0-sulfur',
    'regression-num-medium-1-sulfur',
    'regression-num-medium-2-sulfur',
    'regression-num-medium-0-superconduct',
    'regression-num-medium-0-wine_quality',
    'regression-num-medium-1-wine_quality',
    'regression-num-medium-2-wine_quality',
]
# All datasets: the main ones, 'weather-big' (labeled 'WE (full)' in
# get_dataset_info) and the datasets from DATASETS_WHY.
DATASETS_ALL = DATASETS_MAIN + ['weather-big'] + DATASETS_WHY


def get_dataset_info(dpath: Union[str, Path]) -> dict:
    """Return the display label and the basic properties of a dataset.

    The dataset is loaded with `lib.Dataset.from_dir` only on the first request
    for a given path; the description is then cached in `_DATASETS_INFO`.

    Args:
        dpath: the dataset path (it is passed through `lib.get_path`).

    Returns:
        A new dictionary with the keys:
        - 'dataset': the display label. For a dataset located directly in
          `lib.DATA_DIR`: the first two letters of the directory name in upper
          case if the name is in `DATASETS_MAIN`, 'WE (full)' for 'weather-big'.
          Otherwise, the directory name as is.
        - 'task_type', 'size', 'n_features': taken from the loaded dataset.

        The result is always an independent copy, so the caller is free to
        modify it (for example, `load_record` adds scores to it).
    """
    dpath = lib.get_path(dpath)
    if dpath not in _DATASETS_INFO:
        dataset = lib.Dataset.from_dir(dpath, None)
        in_data_dir = dpath.parent == lib.DATA_DIR
        if in_data_dir and dpath.name in DATASETS_MAIN:
            label = dpath.name.upper()[:2]
        elif in_data_dir and dpath.name == 'weather-big':
            label = 'WE (full)'
        else:
            label = dpath.name
        _DATASETS_INFO[dpath] = {
            'dataset': label,
            'task_type': dataset.task_type.value,
            'size': dataset.size(None),
            'n_features': dataset.n_features,
        }
    # Copy on cache hits as well: returning the cached object itself would let
    # callers pollute the cache (and leak stale keys into later records).
    return deepcopy(_DATASETS_INFO[dpath])


def load_record(output: Union[str, Path]):
    """Build one record (dataset description + scores) from an output directory.

    The report is loaded with `lib.load_report`, the dataset path is extracted
    from it (or from the output path for NPT), and the record returned by
    `get_dataset_info` is extended with the "<part>_score" entries for every
    part of `lib.Part` that is present in the report metrics.
    """
    output = lib.get_path(output)
    report = lib.load_report(output)

    is_npt_output = lib.EXP_DIR in output.parents and '/exp/npt/' in str(output)
    if is_npt_output:
        # The NPT reports do not follow the required format, so the dataset
        # path is inferred from the output path: the dataset name is the first
        # path component after "<EXP_DIR>/npt".
        relative_output = output.relative_to(lib.EXP_DIR / 'npt')
        dpath = ':data/' + list(relative_output.parents)[-2].name
    else:
        # For tuning reports, the report of the best run is used.
        if report["function"] == 'bin.tune.main':
            report = report["best"]

        if report["function"] == 'bin.ensemble.main':
            dpath = report["data"]
        else:
            data_config = report["config"]["data"]
            # The data config is either the path itself or a mapping with 'path'.
            dpath = data_config if isinstance(data_config, str) else data_config['path']

    record = get_dataset_info(dpath)
    # Prettify the scores for ":data/house" by rescaling them.
    rescale = record['dataset'] == 'HO'
    for part in lib.Part:
        if part.value not in report["metrics"]:
            continue
        score = report["metrics"][part.value]["score"]
        if rescale:
            score /= 10000
        record[f"{part.value}_score"] = score
    return record


def _compute_ranks(dataset_df: pd.DataFrame) -> pd.DataFrame:
    dataset_df = dataset_df.sort_values(['test_mean', 'test_std'], ascending=[False, True])
    ranks = []
    current_score = None
    current_std = None
    for _, columns in dataset_df.iterrows():
        score = columns['test_mean']
        std = columns['test_std']
        if current_score is None:
            ranks.append(1)
            current_score = score
            current_std = std
        elif current_score - score <= current_std:
            ranks.append(ranks[-1])
        else:
            ranks.append(ranks[-1] + 1)
            current_score = score
            current_std = std
    dataset_df['rank'] = ranks
    return dataset_df


def build_metrics_dataframe(
    outputs_info: list[
        tuple[
            Union[str, Path],  # output path
            str,  # key (for example, algorithm name: "MLP")
            Union[int, str],  # subkey for aggregation (for example, seed: 0)
        ]
    ],
    precision: Optional[int] = 4,
):
    """Build the table of aggregated scores and ranks.

    Args:
        outputs_info: the (output path, key, subkey) triples. Only the outputs
            containing the 'DONE' file are used. The scores are aggregated over
            subkeys for each (dataset, key) pair.
        precision: the number of decimals for the mean/std columns
            (None disables rounding).

    Returns:
        The dataframe indexed by the dataset, `DATASET_PROPERTIES` and the key,
        with the mean/std of the test and val scores (and train scores, if
        available), the number of aggregated records and the rank.

    Raises:
        RuntimeError: if there are no finished outputs.
    """
    # >>> Build dataframe.
    records = []
    for output, key, subkey in outputs_info:
        if not lib.get_path(output).joinpath('DONE').exists():
            # The run is not finished.
            continue
        records.append(load_record(output) | {'key': key, 'subkey': str(subkey)})
    if not records:
        raise RuntimeError('No records are available')
    df = pd.DataFrame.from_records(records)
    has_train_score = 'train_score' in df.columns

    # >>> Aggregate over subkeys.
    # The insertion order defines the order of the resulting columns.
    aggregations = dict(
        test_mean=("test_score", "mean"),
        test_std=("test_score", "std"),
        val_mean=("val_score", "mean"),
        val_std=("val_score", "std"),
    )
    if has_train_score:
        aggregations['train_mean'] = ("train_score", "mean")
        aggregations['train_std'] = ("train_score", "std")
    aggregations['count'] = ("test_score", "count")
    for dataset_property in DATASET_PROPERTIES:
        if dataset_property in df.columns:
            aggregations[dataset_property] = (dataset_property, "first")
    # Missing values (for example, the std of a single record) become zeros.
    df = df.groupby(["dataset", "key"]).agg(**aggregations).reset_index().fillna(0.0)
    df["count"] = df["count"].astype(int)

    # >>> Compute ranks.
    ranked = df.groupby(['dataset'], group_keys=False).apply(_compute_ranks)
    df = cast(pd.DataFrame, ranked)

    # >>> Finalize.
    df = df.sort_values(
        by=['size', 'n_features', 'dataset', 'test_mean'],
        ascending=[True, True, True, False],
    ).reset_index(drop=True)
    mean_columns = ['test_mean', 'val_mean']
    float_columns = ['test_mean', 'test_std', 'val_mean', 'val_std']
    if has_train_score:
        mean_columns.append('train_mean')
        float_columns += ['train_mean', 'train_std']
    # The sign of the regression scores is flipped only after ranking
    # (the ranking treats higher scores as better).
    is_regression = df['task_type'] == 'regression'
    df.loc[is_regression, mean_columns] *= -1
    if precision is not None:
        df[float_columns] = df[float_columns].round(precision)
    return df.set_index(["dataset"] + DATASET_PROPERTIES + ["key"])


def summarize_ranks(metrics_df: pd.DataFrame, nans: bool) -> pd.DataFrame:
    """Summarize the ranks as a (key x dataset) table with the average rank.

    Args:
        metrics_df: the dataframe with the 'rank' column and with 'dataset' and
            'key' available after `reset_index()` (for example, the result of
            `build_metrics_dataframe`).
        nans: if False, the datasets without results for at least one key are
            dropped. If True, they are kept, and the missing ranks are ignored
            when computing the statistics.

    Returns:
        The dataframe indexed by the key and sorted by the average rank. The
        first two columns are 'avg' and 'std' (the mean and the standard
        deviation of the ranks over datasets, rounded to one decimal), followed
        by one column of ranks per dataset. With a single dataset, 'std' is NaN.
    """
    ranks = metrics_df.reset_index().pivot(index='key', columns='dataset', values='rank')
    if not nans:
        ranks = ranks.dropna(axis='columns')
    columns = ranks.columns.tolist()
    # Both statistics are computed before any of them is added to the table.
    # Otherwise, the 'avg' column would be treated as one more rank by std().
    avg = ranks.mean(axis=1).round(1)
    std = ranks.std(axis=1).round(1)
    ranks.insert(0, "avg", avg)
    ranks.insert(1, "std", std)
    ranks = ranks.sort_values("avg")
    return ranks[['avg', 'std'] + columns]

# How to use the next cell
- comment/uncomment `N_SEEDS += 15` to show/hide results for single models
- comment/uncomment `N_ENSEMBLES += 3` to show/hide results for ensembles
- in the `for dataset in datasets` loop:
    - comment/uncomment the `add(...)` lines to show/hide results for various algorithms
    - in particular, uncomment `add(f':exp/mlp/{dataset}/0-reproduce', 'MLP (reproduce)')` to complete the tutorial from `README.md`

In [None]:
# The number of single-model runs (seeds) to show for each algorithm.
N_SEEDS = 0
# N_SEEDS += 15
# The number of ensembles to show for each algorithm.
N_ENSEMBLES = 0
N_ENSEMBLES += 3

# See the comments in build_metrics_dataframe to learn about outputs_info.
outputs_info = []
def add(location: str, name: Optional[str] = None, sep: str = '-'):
    """Register the evaluation and ensemble outputs of one experiment.

    For the given experiment location, N_SEEDS single-model outputs and
    N_ENSEMBLES ensemble outputs are appended to the global `outputs_info`.
    The ensembles are registered under the name prefixed with '(E) '.

    Args:
        location: the common prefix of the output paths.
        name: the displayed name. By default, it is "<algorithm>[<tag>]"
            inferred from a location of the form ":exp/<algorithm>/.../<tag>".
        sep: the separator between the location and the output suffixes.
    """
    if name is None:
        assert location.startswith(':exp/')
        # location example: ":exp/mlp/california/0"
        _, algorithm, *_, run_tag = location.split('/')
        name = f'{algorithm}[{run_tag}]'
    outputs_info.extend(
        (location + f'{sep}evaluation/{seed}', name, seed)
        for seed in range(N_SEEDS)
    )
    outputs_info.extend(
        (location + f'{sep}ensemble{sep}5/{ensemble_i}', '(E) ' + name, ensemble_i)
        for ensemble_i in range(N_ENSEMBLES)
    )

# Register the outputs to compare. For each dataset, every uncommented
# `add(...)` call appends the outputs of one algorithm to `outputs_info`.
datasets = DATASETS_MAIN
for dataset in datasets:
    # The datasets from DATASETS_WHY are referenced with the "why/" prefix.
    if dataset in DATASETS_WHY:
        dataset = 'why/' + dataset

    # >>> Tutorial from README.md
    # add(f':exp/mlp/{dataset}/0-reproduce', 'MLP (reproduce)')

    # >>> Retrieval-augmented baselines
    # add(f':exp/knn/{dataset}/0', 'kNN')

    # NOTE(review): BLACK_FRIDAY and DIAMOND are not defined in this notebook;
    # presumably the strings 'black-friday' and 'diamond' are meant -- confirm
    # before uncommenting.
    # dnnr_tag = 'ohe' if dataset in [BLACK_FRIDAY, DIAMOND] else 'loo'
    # add(f':exp/dnnr/{dataset}/{dnnr_tag}', 'DNNR')

    # add(f':exp/anp/{dataset}/0', 'ANP')
    # add(f':exp/dkl/{dataset}/0', 'DKL')

    # npt_tag = {
    #     'churn': 0,
    #     'california': 0,
    #     'house': 0,
    #     'adult': 0,
    #     'diamond': 2,
    #     'otto': 1,
    #     'higgs-small': 2,
    #     'black-friday': 2,
    #     'covtype': 3,
    #     'weather-small': 1,
    #     'microsoft': 1,
    # }[dataset]
    # add(f':exp/npt/{dataset}/{npt_tag}', 'NPT')

    # saint_tag = 'default' if dataset in ['weather-small', 'covtype', 'microsoft'] else '2'
    # add(f':exp/saint/{dataset}/{saint_tag}', 'SAINT')

    # >>> Parametric DL baselines
    # add(f':exp/mlp/{dataset}/0', 'MLP')
    # add(f':exp/mlp/{dataset}/lr', 'MLP-LR')
    # add(f':exp/mlp/{dataset}/plr-lite', 'MLP-PLR(lite)')
    # add(f':exp/mlp/{dataset}/plr', 'MLP-PLR')

    # >>> GBDT
    # add(f':exp/xgboost_/{dataset}/default2', 'XGBoost (default)')
    # add(f':exp/lightgbm_/{dataset}/default2', 'LightGBM (default)')
    # add(f':exp/catboost_/{dataset}/default2', 'CatBoost (default)')
    add(f':exp/xgboost_/{dataset}/2', 'XGBoost')
    add(f':exp/lightgbm_/{dataset}/2', 'LightGBM')
    add(f':exp/catboost_/{dataset}/2', 'CatBoost')

    # >>> The model
    model = 'TabR'
    # The experiment directory of the model is its lower-cased name.
    modeldir = model.lower()
    # add(f':exp/{modeldir}/{dataset}/default', f'{model}-S (default)')
    # add(f':exp/{modeldir}/{dataset}/0', f'{model}-S')
    # A different experiment tag is used for three of the datasets.
    model_tag = "2-lr" if dataset in ['weather-small', 'covtype', 'microsoft'] else "2-plr-lite"
    add(f':exp/{modeldir}/{dataset}/{model_tag}', f'{model}')

    # >>> Ablation study
    # Uncomment the (tag, name) pairs to add the corresponding results;
    # with all pairs commented out, the loop does nothing.
    for tag, name in [
        # ('dp-qk-v-self-scaled', 'Step-0'),
        # ('dp-qk-yv-self-scaled', 'Step-1'),
        # ('l2-k-yv-self-scaled', 'Step-2'),
        # ('l2-k-yt-self-scaled', 'Step-3'),
    ]:
        add(f':exp/{modeldir}_design/{dataset}/{tag}', f'(design) {name}')

    # >>> Context freeze
    # Uncomment the values to add the corresponding results;
    # with all values commented out, the loop does nothing.
    for freeze_after_n_epochs in [
        # 0,
        # 1,
        # 2,
        # 4,
        # 5,
        # 8,
    ]:
        add(f':exp/{modeldir}_scaling/{dataset}/default-freeze-{freeze_after_n_epochs}', f'{model}-freeze-{freeze_after_n_epochs}')

# Aggregate the registered outputs and show the results.
metrics_df = build_metrics_dataframe(outputs_info)
# Drop details about datasets to save screen space.
# while len(metrics_df.index.levels) > 2:
#     metrics_df.index = metrics_df.index.droplevel(1)
# nans=True keeps the datasets where some of the algorithms have no results.
ranks_df = summarize_ranks(metrics_df, nans=True)
print('Ranks:')
display(ranks_df)
print('\nMetrics:')
display(metrics_df)
# Optionally, export the tables (separate files, so that one export does not
# overwrite the other).
# metrics_df.to_html('metrics.html')
# ranks_df.to_html('ranks.html')