# Analysis of AMLB Datasets

This notebook analyses the datasets used for classification and regression tasks in the OpenML AMLB benchmark and extracts their metadata.

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.datasets import fetch_openml
import openml

# Global matplotlib defaults, applied in one go so every figure in the notebook
# shares the same look.
plt.rcParams.update({
    # --- fonts -----------------------------------------------------------
    'axes.titlesize': 20,            # plot title size
    'axes.titleweight': 'bold',      # plot title weight
    'xtick.labelsize': 13,           # x tick labels
    'ytick.labelsize': 13,           # y tick labels
    'axes.labelsize': 16,            # axis label size
    'axes.labelweight': 'bold',      # axis label weight
    # --- legend ----------------------------------------------------------
    'legend.loc': 'lower right',
    'legend.fontsize': 13,
    'legend.title_fontsize': 15,
    # --- figure size and export ------------------------------------------
    'figure.figsize': (12, 10),
    'savefig.dpi': 400,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # --- backgrounds -----------------------------------------------------
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
})
# The seaborn style is applied last, after the matplotlib defaults above.
sns.set_style('whitegrid')

### Classification datasets

In [6]:
def get_datasets_metadata_classification(data):
    """Summarise one classification dataset as a flat metadata record.

    Parameters
    ----------
    data
        Object exposing ``data['details']`` (with ``'name'`` and ``'id'``
        entries), a ``data.data`` feature DataFrame and a ``data.target``
        whose ``nunique()`` gives the number of classes.

    Returns
    -------
    dict
        Dataset name and id, number of classes, feature/sample counts, total
        number of cells, a flag telling whether any feature value is missing,
        the number of integer / float / boolean / categorical feature columns
        and, for each of those dtype families, a flag telling whether any
        column of that family contains missing values. Note that the
        ``'#...w/nans'`` entries are boolean flags, not counts.
    """
    details = data['details']
    summary = {
        'dataset': details['name'],
        '#classes': data.target.nunique(),
        'dataset_id': details['id'],
    }

    features = data.data
    n_samples, n_features = features.shape
    summary['n_features'] = n_features
    summary['n_samples'] = n_samples
    summary['size'] = features.size
    summary['missings'] = features.isnull().any().any()

    # (count key, missing-values key, select_dtypes selector) per dtype family.
    dtype_families = (
        ('#integer', '#integerw/nans', 'integer'),
        ('#float', '#floatw/nans', 'float'),
        ('#boolean', '#booleanw/nans', 'boolean'),
        ('#categorical', '#categoricalw/nans', 'category'),
    )
    # Select each dtype subset once and reuse it for both the count and the flag.
    subsets = [
        (count_key, nan_key, features.select_dtypes(include=selector))
        for count_key, nan_key, selector in dtype_families
    ]

    # All counts are inserted before all flags to keep the column order stable.
    for count_key, _, subset in subsets:
        summary[count_key] = len(subset.columns)
    for _, nan_key, subset in subsets:
        summary[nan_key] = subset.isnull().any().any()

    return summary

In [7]:
# Suite id handed to openml.study.get_suite to load the classification benchmark.
amlb_classification_benchmark = 271

In [None]:
# Load the benchmark suite; its `tasks` attribute lists the task ids processed below.
benchmark = openml.study.get_suite(amlb_classification_benchmark)

In [5]:
# Collects one metadata dict per dataset that is processed successfully.
metadata = []

In [None]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the results of a previous run.
metadata = []
n_tasks = len(benchmark.tasks)

for i, task_id in enumerate(benchmark.tasks):
    try:
        # Only the task description is needed to look up the dataset id, so
        # the splits and the data are not downloaded through openml here.
        task = openml.tasks.get_task(task_id=task_id, download_splits=False, download_data=False)
        data = fetch_openml(data_id=task.dataset_id, as_frame=True)
        print(f'>> ({i+1}/{n_tasks}) {data["details"]["name"]}')
        metadata.append(get_datasets_metadata_classification(data))
    except Exception as e:
        # Best-effort: one failing dataset must not abort the whole run, but
        # it is reported so that rows missing from the final table can be
        # traced back to the task that failed.
        print(f'!! ({i+1}/{n_tasks}) task {task_id} skipped: {type(e).__name__}: {e}')

In [None]:
# Build the metadata table: one row per collected dict, one column per key.
df = pd.DataFrame(metadata)

In [None]:
# Display the metadata table as the cell output.
df

In [None]:
# Create the output directory first so the export does not fail with an
# OSError when '../data/metadata' is missing (e.g. on a fresh checkout).
output_path = '../data/metadata/classification_datasets_metadata.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False leaves the row index out of the CSV.
df.to_csv(output_path, index=False)