This notebook loads every dataset used in the project and builds a table of simple statistics about them.

In [1]:
import sys
sys.path.append("../data")
from pathlib import Path
import pandas as pd
from datasets_const import NUMERICAL_DATA_NAMES, CATEGORICAL_DATA_NAMES, GRAPH_DATA_NAMES, CV_DATA_NAMES, NLP_DATA_NAMES, TIMESERIES_DATA_NAMES, MULTIOMICS_DATA_NAMES, SET_DATA_NAMES
from data_getter import get_npz_dataset, get_categorical_dataset, get_glocalkd_dataset, get_timeseries, get_multiomics_data, get_sets_data

%load_ext autoreload

%autoreload 2

In [2]:
# Accumulator for the per-dataset summary rows; the cells below append one
# dict per dataset and the last cell turns the list into a DataFrame.
datasets_description = list()

In [3]:
def append_desc(path, f, type_, kwargs=None):
    """Load one dataset and append its summary row to ``datasets_description``.

    Parameters
    ----------
    path
        First positional argument handed to ``f``.
    f : callable
        Loader invoked as ``f(path, **kwargs)``. It must return a mapping
        with the keys ``"X"``, ``"y"`` and ``"name"``.
    type_ : str
        Label stored under the ``"type"`` key of the summary row.
    kwargs : dict, optional
        Extra keyword arguments forwarded to ``f``. ``None`` (the default)
        forwards nothing.

    Notes
    -----
    ``X.shape`` needs at least two entries (rows, columns). The outlier
    ratio is the percentage of entries of ``y`` equal to ``1``, rendered
    with two decimals and a trailing ``%``.
    """
    # A ``None`` sentinel replaces the former ``{}`` default so that no dict
    # object is shared between calls.
    loader_kwargs = {} if kwargs is None else kwargs

    data = f(path, **loader_kwargs)
    X, y, name = data["X"], data["y"], data["name"]

    datasets_description.append({
        "name": name,
        "n_instances": X.shape[0],
        "n_features": X.shape[1],
        "outlier_ratio": f"{(y == 1).sum() * 100 / len(y):.2f}%",
        "type": type_
    })

### Numerical

In [4]:
NUMERICAL_ROOT = Path("../data/numerical")

# One summary row per entry of NUMERICAL_DATA_NAMES, loaded through
# get_npz_dataset from a path under NUMERICAL_ROOT.
for dataset_file in NUMERICAL_DATA_NAMES:
    append_desc(NUMERICAL_ROOT / dataset_file, get_npz_dataset, "numerical")

### Categorical

In [5]:
CATEGORICAL_ROOT = Path("../data/categorical")

# One summary row per entry of CATEGORICAL_DATA_NAMES, loaded through
# get_categorical_dataset from a path under CATEGORICAL_ROOT.
for dataset_file in CATEGORICAL_DATA_NAMES:
    append_desc(CATEGORICAL_ROOT / dataset_file, get_categorical_dataset, "categorical")
    

### GRAPH

In [6]:
GRAPH_PATH = "../data/graph/"

# Graph datasets are summarised by hand instead of through append_desc
# because the feature count is recorded as the constant 1 here.
for graph_name in GRAPH_DATA_NAMES:
    graph_data = get_glocalkd_dataset(GRAPH_PATH, graph_name)
    labels = graph_data["y"]
    outlier_pct = (labels == 1).sum() * 100 / len(labels)

    datasets_description.append(dict(
        name=graph_data["name"],
        n_instances=graph_data["X"].shape[0],
        n_features=1,
        outlier_ratio=f"{outlier_pct:.2f}%",
        type="graph",
    ))

### TIME SERIES

In [7]:
TIMESERIES_PATH = "../data/timeseries"

# Build each row with the shared append_desc helper instead of repeating its
# record-building code. get_timeseries is called with the dataset name as a
# second positional argument, so a small lambda adapts it to the
# single-argument loader call that append_desc makes.
for name in TIMESERIES_DATA_NAMES:
    append_desc(
        TIMESERIES_PATH,
        lambda root, dataset=name: get_timeseries(root, dataset),
        "timeseries",
    )

### NLP and CV

In [8]:
# Dedicated constant for this directory: assigning it to NUMERICAL_ROOT would
# silently repoint the numerical-data root used by an earlier cell.
CV_ROOT = Path("../data/adBench/CV_by_ResNet18")

# One summary row per entry of CV_DATA_NAMES, loaded through get_npz_dataset.
for dataset_file in CV_DATA_NAMES:
    append_desc(CV_ROOT / dataset_file, get_npz_dataset, "CV embedding")

In [9]:
# Dedicated constant for this directory: assigning it to NUMERICAL_ROOT would
# silently repoint the numerical-data root used by an earlier cell.
NLP_ROOT = Path("../data/adBench/NLP_by_RoBERTa")

# One summary row per entry of NLP_DATA_NAMES, loaded through get_npz_dataset.
for dataset_file in NLP_DATA_NAMES:
    append_desc(NLP_ROOT / dataset_file, get_npz_dataset, "NLP embedding")

#### Multiomics

In [10]:
MULTIOMICS_ROOT = Path("../data/mixed")

# The root directory is the same for every dataset; which dataset is loaded
# is chosen by the "data_name" keyword forwarded to get_multiomics_data.
for omics_name in MULTIOMICS_DATA_NAMES:
    loader_kwargs = {"data_name": omics_name, "for_risf": False}
    append_desc(MULTIOMICS_ROOT, get_multiomics_data, "multiomics", loader_kwargs)

### Sequences of sets

In [11]:
SET_ROOT = Path("../data/mixed")

# The root directory is the same for every dataset; which dataset is loaded
# is chosen by the "data_name" keyword forwarded to get_sets_data.
for set_name in SET_DATA_NAMES:
    loader_kwargs = {"data_name": set_name, "for_risf": False}
    append_desc(SET_ROOT, get_sets_data, "set", loader_kwargs)

### description

In [12]:
# Render the collected summary rows as a table (one row per dataset).
pd.DataFrame(data=datasets_description)

Unnamed: 0,name,n_instances,n_features,outlier_ratio,type
0,14_glass,214,7,4.21%,numerical
1,20_letter,1600,32,6.25%,numerical
2,21_Lymphography,148,18,4.05%,numerical
3,25_musk,3062,166,3.17%,numerical
4,26_optdigits,5216,64,2.88%,numerical
5,2_annthyroid,7200,6,7.42%,numerical
6,31_satimage-2,5803,36,1.22%,numerical
7,36_speech,3686,400,1.65%,numerical
8,38_thyroid,3772,6,2.47%,numerical
9,40_vowels,1456,12,3.43%,numerical
