### Setup

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import math
import itertools

import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from tueplots import fonts
import openml as oml
from tqdm import tqdm

from uq.analysis.dataframes import (
	load_config, load_df, make_test_df_for_tuning, make_test_df, get_datasets_df,
	set_hparams_columns, set_test_metrics_columns, make_df_abb
)
from uq.utils.general import filter_dict, set_notebook_options, savefig

# Apply the project-wide notebook options, then switch matplotlib to the
# tueplots ICML 2022 TeX font settings.
set_notebook_options()
icml_font_settings = fonts.icml2022_tex()
plt.rcParams.update(icml_font_settings)

# Directory and file extension used when saving figures.
path = Path('results')
ext = 'pdf'

### Determine which datasets to select per benchmark

In [None]:
def dataset_names(suite_id):
    """Return the names of the datasets in the OpenML suite `suite_id`.

    For suite 297 the dataset named 'houses' is removed from the list
    (`list.remove` raises ValueError if it is not present).
    """
    suite = oml.study.get_suite(suite_id)
    names = []
    for ds_id in suite.data:
        names.append(oml.datasets.get_dataset(ds_id).name)
    if suite_id == 297:
        names.remove('houses')
    return names

# Dataset names per OpenML suite, as sets.
suites = {suite_id: set(dataset_names(suite_id)) for suite_id in (269, 297, 299)}

# Make the suites disjoint: suite 297 is kept as is, then 299 and 269 each lose
# every name that has already been claimed by a previously handled suite.
# `s` accumulates the names claimed so far.
s = set(suites[297])
for later_suite_id in (299, 269):
    suites[later_suite_id] -= s
    s |= suites[later_suite_id]

### Dataframe linking papers to datasets

In [None]:
# The datasets used in our own experiments are read from this config.
config = load_config('logs/full/')

# Maps each paper to a dict {source: [dataset names]}.
papers_to_datasets = {
    'Gal 2016': {
        'uci': [
            'boston', 'Concrete', 'Energy', 'Kin8nm', 'Naval',
            'Power', 'Protein', 'wine_quality', 'Yacht', 'year',
        ],
    },
    'Utpala 2020': {
        'uci': [
            'Airfoil', 'boston', 'Concrete', 'Fish', 'Kin8nm',
            'Protein', 'wine_quality', 'Yacht', 'year',
        ],
    },
    'Zhou 2021': {
        'uci': ['CPU', 'Crime', 'Energy', 'MPG'],
    },
    'Chung 2021': {
        'uci': [
            'boston', 'Concrete', 'Energy', 'Kin8nm',
            'Naval', 'Power', 'wine_quality', 'Yacht',
        ],
    },
    'Fakoor 2021': {
        'uci': [
            'boston', 'Concrete', 'Energy', 'Kin8nm',
            'Naval', 'Power', 'Protein', 'Yacht',
        ],
        # Every dataset of suite 269 except 'boston'.
        'oml_269': list(filter(lambda name: name != 'boston', dataset_names(269))),
    },
    'Grinsztajn 2022': {
        'oml_297': list(suites[297]),
        'oml_299': list(suites[299]),
    },
    r'\textbf{Ours}': dict(
        (group_source, group.names)
        for group_source, group in config.dataset_groups.items()
    ),
}

# Flatten the nested mapping into one (paper, source, dataset) row per dataset.
data = []
for paper, sources in papers_to_datasets.items():
    for source, datasets in sources.items():
        data.extend((paper, source, name) for name in datasets)

df_papers = pd.DataFrame(data, columns=['paper', 'source', 'dataset'])

### Pivot table with the selected datasets

In [None]:
# `order.pickle` is produced by `main_figures.ipynb`; run that notebook first.
order_path = Path(config.log_dir) / 'order.pickle'
order = pd.read_pickle(order_path).to_frame()

In [None]:
# Abbreviations for the datasets that appear in `order`.
ordered_datasets = order['dataset'].unique()
df_abb = make_df_abb(ordered_datasets)

# Abbreviations for the remaining datasets, wrapped in parentheses.
remaining_datasets = df_papers.query('dataset not in @order.dataset')['dataset'].unique()
df_abb2 = make_df_abb(remaining_datasets)
df_abb2['abb'] = '(' + df_abb2['abb'] + ')'

df_abb = pd.concat([df_abb, df_abb2])

# Attach the abbreviations to the paper/source/dataset rows.
df = df_papers.merge(df_abb, how='outer')

In [None]:
# Human-readable label for every source key; the insertion order is reused
# later as the category order of the `source` column.
source_names = dict(
    (f'oml_{oml_suite}', f'OpenML {oml_suite}')
    for oml_suite in (297, 299, 269)
)
source_names.update({'uci': 'UCI'})

# Replace the raw source keys by their labels (keys without a label become NaN).
df['source'] = df['source'].map(source_names)

In [None]:
# Full list of abbreviations: those of the datasets listed in `order` (in the
# row order of `order.merge(df_abb)`), followed by the abbreviations in `df_abb2`.
order2 = np.concatenate((order.merge(df_abb)['abb'].to_numpy(), df_abb2['abb'].unique()))
# Convert the three columns to categoricals with explicit category lists.
# NOTE(review): presumably this is what fixes the row/column order of the pivot
# table below (`sort=True`) -- confirm.
df['abb'] = pd.Categorical(df['abb'], order2)
df['paper'] = pd.Categorical(df['paper'], list(papers_to_datasets))
df['source'] = pd.Categorical(df['source'], list(source_names.values()))
df = df.sort_values('dataset')
# One row per paper, one column per abbreviation, each cell holding the source.
# NOTE(review): the identity `aggfunc` presumably relies on every (paper, abb)
# pair occurring at most once -- confirm.
df_pivot = df.pivot_table(values='source', 
        index='paper', columns='abb', dropna=False, fill_value=None, aggfunc=lambda x: x, sort=True)

In [None]:
# Display the paper x abbreviation pivot table as the cell output.
df_pivot

### Figure with the selected datasets

In [None]:
# Encode every source as an integer so that the pivot table can be drawn as a heatmap.
sources = df['source'].sort_values().unique()
source_to_int = {source: i for i, source in enumerate(sources)}
# Kept so that code looking up `np.nan` directly in the dict keeps working.
source_to_int[np.nan] = np.nan


def _encode_source(value):
    """Return the integer code of `value`, or NaN when the cell is missing."""
    # NaN never compares equal to itself, so a dict lookup with a NaN key only
    # succeeds when the very same object is used as key. Test for missing
    # values explicitly instead of relying on object identity.
    if pd.isna(value):
        return np.nan
    return source_to_int[value]


# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use the new name when it exists and fall back otherwise.
_elementwise = df_pivot.map if hasattr(df_pivot, 'map') else df_pivot.applymap
df_pivot = _elementwise(_encode_source)

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
import matplotlib.patches as patches
from matplotlib.legend_handler import HandlerPatch

fig, axis = plt.subplots(figsize=(8, 1.3), dpi=300)

# One palette color per source. The palette size follows `sources` instead of
# a hard-coded 4, so the legend handles below can never index past the palette.
n_sources = max(len(sources), 1)
cmap = sns.color_palette('deep', n_sources)

# `vmin`/`vmax` pin integer code i to palette entry i even when some codes do
# not occur in `df_pivot`; this keeps the heatmap and the legend colors in sync.
g = sns.heatmap(
    df_pivot,
    square=True,
    cmap=cmap,
    vmin=0,
    vmax=n_sources - 1,
    cbar=False,
    xticklabels=1,
    yticklabels=1,
    mask=df_pivot.isna(),  # cells without a source are left blank
    ax=axis,
)
axis.tick_params(axis='x', which='major', labelsize=7, labelrotation=90)
axis.tick_params(axis='y', which='major', labelsize=7)
axis.set(xlabel=None, ylabel=None)
g.set_facecolor('white')

# Legend handles: one filled rectangle per source, in the same order as `sources`.
custom_lines = [
    Rectangle((0, 0), 1, 1, color=color, lw=4)
    for color in cmap[:len(sources)]
]

class HandlerRect(HandlerPatch):
    """Legend handler that draws a fixed-size 3x3 rectangle for a patch handle."""

    def create_artists(self, legend, orig_handle,
                       xdescent, ydescent, width, height,
                       fontsize, trans):
        """Return a single rectangle styled like `orig_handle`."""
        side = 3
        # The rectangle starts at half the handle width (floored), on the baseline.
        rect = patches.Rectangle(xy=(width // 2, 0), width=side, height=side)
        # Copy the visual properties of the original handle.
        self.update_prop(rect, orig_handle, legend)
        # Place the rectangle in the legend's coordinate system.
        rect.set_transform(trans)
        return [rect]

# Legend anchored by its lower centre near the top of the figure, with the
# entries laid out in up to four columns.
legend_options = dict(
    loc='lower center',
    bbox_to_anchor=(0.5, 1 - 0.15),
    frameon=True,
    ncol=4,
    fontsize=7,
    handlelength=1,
    handleheight=1,
)
fig.legend(custom_lines, sources, **legend_options)

savefig(path / f'papers_vs_datasets.{ext}', fig)

In [None]:
# Number of datasets per paper and source (0 where a paper does not use a source).
df.pivot_table(
    values='dataset',
    index='paper',
    columns='source',
    aggfunc='count',
    fill_value=0,
)

### List of the selected datasets sorted by size

In [None]:
def make_datasets_to_papers(papers_to_datasets):
    """Invert a paper -> datasets mapping into a dataset -> papers mapping.

    Args:
        papers_to_datasets: Mapping from a paper to either an iterable of
            dataset names, or a mapping ``{source: [dataset names]}``. In the
            nested form the datasets of all sources are considered.

    Returns:
        A dict mapping every dataset name to the list of papers that use it,
        in the iteration order of `papers_to_datasets`. A paper that lists the
        same dataset several times appears once per occurrence.
    """
    from collections.abc import Mapping

    datasets_to_papers = {}
    for paper, datasets in papers_to_datasets.items():
        # Iterating a nested {source: [datasets]} dict directly would yield the
        # source keys rather than the dataset names, so flatten its values.
        if isinstance(datasets, Mapping):
            datasets = itertools.chain.from_iterable(datasets.values())
        for dataset in datasets:
            datasets_to_papers.setdefault(dataset, []).append(paper)
    return datasets_to_papers

# Build the reverse lookup from the `papers_to_datasets` mapping.
datasets_to_papers = make_datasets_to_papers(papers_to_datasets)

In [None]:
from pprint import pprint

def get_size_df(suite_id):
    """Return a DataFrame with the number of rows of each selected dataset.

    Only datasets of the OpenML suite `suite_id` whose name is in
    `suites[suite_id]` are included. The result has the columns
    'dataset', 'Size' and 'Suite ID'.
    """
    selected_names = suites[suite_id]
    rows = []
    for ds_id in oml.study.get_suite(suite_id).data:
        ds = oml.datasets.get_dataset(ds_id)
        if ds.name in selected_names:
            # Only the feature frame is needed, to read off its row count.
            features, _, _, _ = ds.get_data(
                dataset_format='dataframe', target=ds.default_target_attribute)
            rows.append((ds.name, features.shape[0], suite_id))
    return pd.DataFrame(rows, columns=['dataset', 'Size', 'Suite ID'])

dfs = []
# For each suite, print its id followed by the dataset names sorted by size.
for suite_id in (269, 297, 299):
    print(suite_id)
    size_df = get_size_df(suite_id)
    names_by_size = size_df.sort_values('Size')['dataset'].to_list()
    pprint(names_by_size)