In [24]:
import collections
%matplotlib inline
import matplotlib
import numpy as np
import openml
import pandas as pd
import time
# Show which OpenML API endpoint the client is configured to query.
print(openml.config.server)

https://www.openml.org/api/v1/xml


In [25]:
# OpenML dataset ids that form the AutoML benchmark reference set. The cells
# below exclude these ids from the candidate pool.
automlbench_dids = [
    41021, 42225, 42571, 4549, 42572, 42570, 42705, 42728, 550, 546, 541,
    507, 505, 287, 216, 41540, 42688, 422, 416, 42724, 42727, 42729,
    42726, 42730, 201, 41980, 42731, 531, 42563, 574, 3050, 3277, 43071,
]

In [36]:
# Fetch the dataset listing once. `all_datasets` is kept untouched as the
# reference listing; `to_consider` is the working pool the filter cells shrink.
all_datasets = openml.datasets.list_datasets()
# Shallow copy instead of a second server request: only keys are ever deleted
# from `to_consider`, and a second request could return a different listing,
# which would invalidate the length checks below.
to_consider = dict(all_datasets)
TOTAL = len(all_datasets)
# all_tasks = openml.tasks.list_tasks()  # only used by the commented-out task cells
# Ids of every dataset excluded so far; filled by the filter cells below.
to_remove = set()

In [37]:
# Drop every dataset that is itself part of the AutoML benchmark
# (close relatives are handled manually in later cells).
print("From:", len(to_consider))

# Exact id matches against the benchmark list.
to_remove.update(did for did in all_datasets if did in automlbench_dids)

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
assert len(all_datasets) - len(automlbench_dids) == len(to_consider)

print("To:", len(to_consider))

From: 5012
To: 4979


In [38]:
# Size-based filter: drop datasets that are too small, too large, or that
# report more than one class.
# NOTE(review): the `NumberOfClasses > 1` rule removes every multi-class
# dataset -- presumably to keep regression-style data; confirm this is intended.
print("From:", len(to_consider))

for did, meta in all_datasets.items():
    try:
        n_instances = meta['NumberOfInstances']
        n_features = meta['NumberOfFeatures']
        n_classes = meta['NumberOfClasses']
    except KeyError:
        # A listing entry without these metafeatures is excluded outright.
        to_remove.add(did)
        continue

    n_cells = n_instances * n_features
    if (
        n_cells < 1_000
        or n_instances >= 500_000
        or n_instances < 20
        or n_features < 2
        or n_features > 2500
        or n_classes > 1
        or n_cells > 10_000_000
    ):
        to_remove.add(did)

for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 4979
To: 1546


In [7]:
# Tabular view of the full listing: one row per dataset id (the dict keys),
# one column per listing field.
all_datasets_as_frame = pd.DataFrame.from_dict(all_datasets, orient='index')

In [9]:
# Remove all remaining datasets whose name starts with "BNG"
print("From:", len(to_consider))

to_remove.update(
    did for did, meta in to_consider.items() if meta['name'].startswith("BNG")
)

for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 1536
To: 1527


In [10]:
# check for similar names and manually check overlaps
def return_triplets(name):
    """Return the set of all lowercased 3-character substrings of ``name``.

    Names shorter than three characters yield an empty set.
    """
    lowered = name.lower()
    return {lowered[start:start + 3] for start in range(len(lowered) - 2)}

# The triplet sets of the benchmark names do not depend on the candidate, so
# build them once instead of once per (candidate, benchmark) pair.
automl_triplets_by_did = {
    did2: return_triplets(all_datasets[did2]['name']) for did2 in automlbench_dids
}

# Rows: [n_hits, ratio, candidate name, benchmark name, candidate id, benchmark id]
manually_look_at = []
for did in to_consider:
    other_triplets = return_triplets(all_datasets[did]['name'])
    for did2, automl_triplets in automl_triplets_by_did.items():
        if not automl_triplets:
            # A benchmark name shorter than three characters has no triplets;
            # skipping it avoids a ZeroDivisionError in the ratio below.
            continue
        # Number of benchmark triplets that also occur in the candidate name.
        n_hits = len(automl_triplets & other_triplets)
        ratio = n_hits / len(automl_triplets)
        if n_hits > 2:
            manually_look_at.append([n_hits, ratio, all_datasets[did]['name'], all_datasets[did2]['name'], did, did2])

df = pd.DataFrame(manually_look_at)


In [14]:
# Dataset ids collected by hand; the list is maintained in
# https://docs.google.com/spreadsheets/d/1XH1mQFq7-1fs28F-CQJo6i6Wg7IlpDS0aDTAJBoaD7k/edit#gid=0
print("From:", len(to_consider))
to_remove.update([
    14, 16, 18, 20, 22, 36, 150, 179, 180, 958, 962, 971, 978, 994, 995,
    1020, 1022, 1112, 1113, 1114, 1119, 1242, 1558, 40979, 40997, 40998,
    40999, 41000, 41001, 41002, 41003, 41004, 41005, 41006, 41007, 43900,
    43947, 44096, 44097, 44098,
])

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1527
To: 1488


In [15]:
# Compare simple metafeatures of every candidate with every benchmark dataset
# and collect close matches for manual inspection.
metafeature_names = [
    'MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize', 'NumberOfClasses',
    'NumberOfFeatures', 'NumberOfInstances', 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
    'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures',
]
size_columns = ['NumberOfFeatures', 'NumberOfInstances']

simple_metafeatures_to_consider = pd.DataFrame(
    {did: meta for did, meta in all_datasets.items() if did in to_consider}
).transpose()[metafeature_names]
simple_metafeatures_automl = pd.DataFrame(
    {did: meta for did, meta in all_datasets.items() if did in automlbench_dids}
).transpose()[metafeature_names]

# A pair is flagged when more than half of the metafeatures are equal, or when
# both the feature count and the instance count match.
manually_look_at = []
for did1, mf1 in simple_metafeatures_to_consider.iterrows():
    for did2, mf2 in simple_metafeatures_automl.iterrows():
        ratio = np.sum(mf1 == mf2) / len(mf1)
        n_hits = np.sum(mf1[size_columns] == mf2[size_columns])
        if not (ratio > 0.5 or n_hits == 2):
            continue
        # Interleave the values: candidate, benchmark, candidate, benchmark, ...
        paired_values = np.array([[mf1[n], mf2[n]] for n in metafeature_names]).flatten()
        manually_look_at.append(
            [ratio, n_hits, all_datasets[did1]['name'], all_datasets[did2]['name'], did1, did2,
             *paired_values]
        )

header = ["ratio", "n_hits", "name", "automl name", "did", "automl did"]
# Every metafeature name appears twice, matching the interleaved value pairs.
mfs = list(np.repeat(metafeature_names, 2))
df = pd.DataFrame(manually_look_at, columns=header + mfs)
with open("ManualMetafeatures.csv", "w") as fh:
    fh.write(df.to_csv())

In [16]:
# Dataset ids collected by hand; the list is maintained in
# https://docs.google.com/spreadsheets/d/1SWEBsFTykdUMq-wBcx9NHchhdolB632EJYaAmQwwQtM/edit#gid=0
print("From:", len(to_consider))
to_remove.update([44153, 44234])

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))
# TODO: check whether we can work usefully with creditcard (1597)

From: 1488
To: 1486


In [17]:
# Show name and basic size statistics for every dataset still in the pool.
# Lifting the row limit makes pandas render the whole table.
pd.set_option('display.max_rows', None)
columns = ['name', 'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances', ]

to_consider_stats = pd.DataFrame(
    {did: meta for did, meta in all_datasets.items() if did in to_consider}
).T[columns]

to_consider_stats

Unnamed: 0,name,NumberOfClasses,NumberOfFeatures,NumberOfInstances
2,anneal,5.0,39.0,898.0
5,arrhythmia,13.0,280.0,452.0
6,letter,26.0,17.0,20000.0
11,balance-scale,3.0,5.0,625.0
13,breast-cancer,2.0,10.0,286.0
15,breast-w,2.0,10.0,699.0
23,cmc,3.0,10.0,1473.0
24,mushroom,2.0,23.0,8124.0
25,colic,2.0,27.0,368.0
26,nursery,5.0,9.0,12960.0


In [18]:
# remove sparse datasets
# print("From:", len(to_consider))
# start = time.time()
# for i, did in enumerate(to_consider):
#     #print(did)
#     if (i+1) % 25 == 0:
#         print("%4.2fsec: %d/%d" % (time.time() - start, i+1, len(to_consider)))
#     d = openml.datasets.get_dataset(int(did), download_data=False)
#     #print(
#     #    d.format.lower(), 
#     #    'sparse' in d.format.lower(), 
#     #    d.description and 'CLASSINDEX: none specific' in d.description
#     #)
#     if 'sparse' in d.format.lower():
#         to_remove.add(did)
#     if d.description and 'CLASSINDEX: none specific' in d.description:
#         to_remove.add(did)
#     for f in d.features.values():
#         if f.data_type == 'string':
#             to_remove.add(did)

# for did in to_remove:
#     if did in to_consider:
#         del to_consider[did]
# print("To:", len(to_consider)) # 599

In [19]:
# remove identical/similar datasets (to reduce the load computing the meta-data)
# check for similar names and manually check overlaps
def return_triplets(name):
    """Collect every run of three consecutive characters of the lowercased name.

    Returns a set of 3-character strings; it is empty when the name has fewer
    than three characters.
    """
    text = name.lower()
    # Zipping the string with itself shifted by one and two positions yields
    # each window of three consecutive characters exactly once.
    return {"".join(window) for window in zip(text, text[1:], text[2:])}

# Compute the triplet set of every remaining name once. The pair loop below is
# quadratic in the pool size, so rebuilding the sets per pair is wasted work.
candidate_dids = list(to_consider)
triplets_by_did = {
    did: return_triplets(all_datasets[did]['name']) for did in candidate_dids
}

# Rows: [n_hits, ratio, name, name 2, id, id 2]; each unordered pair is
# visited once, in pool order.
manually_look_at = []
for i, did in enumerate(candidate_dids):
    other_triplets = triplets_by_did[did]
    for did2 in candidate_dids[i + 1:]:
        other2_triplets = triplets_by_did[did2]
        if not other2_triplets:
            # Names shorter than three characters have no triplets; the ratio
            # below would divide by zero.
            continue
        # Number of triplets of the second name that also occur in the first.
        n_hits = len(other_triplets & other2_triplets)
        ratio = n_hits / len(other2_triplets)
        if n_hits > 2:
            manually_look_at.append([n_hits, ratio, all_datasets[did]['name'], all_datasets[did2]['name'], did, did2])

df = pd.DataFrame(manually_look_at)


In [20]:
# Dataset ids collected by hand; the list is maintained in
# https://docs.google.com/spreadsheets/d/12Tmtgrz_92fuOKoIIIb6ROGkd65nv7R3g7EZAYwtTos/edit#gid=0
print("From:", len(to_consider))
to_remove.update([
    989, 977, 1222, 997, 1568, 980, 1021, 1019, 1023,
    953, 1000, 40474, 40475, 40476, 40477, 40478,
    40479, 979, 720, 1557, 990, 41966, 1016, 954,
    40597, 976, 1004, 966, 970, 1014, 741, 774, 795,
    827, 931, 843, 853, 959, 987, 1037, 1038, 1040,
    1560, 1467, 1476, 1566, 1492, 1493, 1525, 1526,
    40687, 40926, 41945, 41946, 42140, 42141, 42192,
])

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1486
To: 1430


In [21]:
# Subsample from 'topics', where there are dozens of super-similar
# datasets from one source
dataset_names = {did: value['name'] for did, value in all_datasets.items()
                 if did in to_consider}
# Seeded generator so that the same datasets are dropped on every run.
# All sampling below must go through `rng`; the global `np.random` functions
# would ignore this seed.
rng = np.random.RandomState(1)
# Friedman datasets
friedman_datasets = [
    did for did, name in dataset_names.items() if name.startswith('fri_c')
]
# ova & ap datasets
ova_datasets = [
    did for did, name in dataset_names.items()
    if name.startswith('OVA_') or name.startswith('AP_')
]
# volcanoes datasets
volcanoes_datasets = [
    did for did, name in dataset_names.items()
    if name.startswith('volcanoes-')
]
# AutoUniv
auto_univ_datasets = [
    did for did, name in dataset_names.items()
    if name.startswith('autoUniv-')
]
for dataset_ids_to_sample_from, num_keep in (
    (friedman_datasets, 10),
    (ova_datasets, 7),
    (volcanoes_datasets, 10),
    (auto_univ_datasets, 4),
):
    num_drop = len(dataset_ids_to_sample_from) - num_keep
    if num_drop <= 0:
        # The family already has at most `num_keep` members; nothing to drop
        # (a negative sample size would make `choice` raise).
        continue
    choices_to_drop = rng.choice(
        dataset_ids_to_sample_from,
        replace=False,
        size=num_drop)
    for choice in choices_to_drop:
        if choice in to_consider:
            del to_consider[choice]
print("To:", len(to_consider))

To: 1337


In [22]:
# Delete a few other datasets, selected by name prefix:
# - Click prediction datasets consist almost exclusively of IDs
# - Forex datasets are time series dataset with a time stamp;
#   regularly shuffled tasks won't work here
for prefix in ('Click_prediction', 'FOREX'):
    dataset_names = {did: meta['name'] for did, meta in all_datasets.items()
                     if did in to_consider
                     and meta['name'].startswith(prefix)}
    for did in dataset_names:
        to_consider.pop(did, None)
print("To:", len(to_consider))

To: 1134


In [23]:
# Individually reviewed datasets that are excluded; the reason is noted per id.
print("From:", len(to_consider))
for did in (
    312,  # multilabel
    313,  # hierarchical classification
    316,  # multilabel classification
    378,  # unclear target
    381,  # unclear target
    382,  # unclear target
    1178,  # multilabel
    1179,  # multilabel
    1472,  # regression
    1477,  # not a regular classification dataset (stream)
    40588,  # multilabel
    40589,  # multilabel
    40590,  # multilabel
    40591,  # multilabel
    40592,  # multilabel
    40593,  # multilabel
    40594,  # multilabel
    40595,  # multilabel
    40596,  # multilabel
    40597,  # multilabel
    40686,  # multilabel
    40687,  # multilabel
    40702,  # multilabel
    40910,  # stream dataset
    41103,  # description says 'CIFAR-10 dataset but with some modifications'
    41526,  # is named test_dataset
):
    to_remove.add(did)

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 1134
To: 1112


In [24]:
# check for similar metafeatures and manually check overlaps
metafeature_names = [
    'MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize', 'NumberOfClasses',
    'NumberOfFeatures', 'NumberOfInstances', 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
    'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures',
]

simple_metafeatures_to_consider = pd.DataFrame(
    {k: v for k, v in all_datasets.items() if k in to_consider}
).transpose()[metafeature_names]

# Comparing every pair of rows with two nested `iterrows` loops is quadratic in
# slow pandas operations. Instead, each row is compared against the whole
# matrix in one numpy operation.
# Assumes all metafeature values are numeric (missing ones are NaN) -- TODO confirm.
dids = simple_metafeatures_to_consider.index.tolist()
did_array = np.asarray(dids)
values = simple_metafeatures_to_consider.to_numpy(dtype=float)
size_idx = [
    metafeature_names.index('NumberOfFeatures'),
    metafeature_names.index('NumberOfInstances'),
]

# Checking the hamming distance of the datasets
manually_look_at = []
for row, did1 in enumerate(dids):
    # Element-wise equality of this row with every row; NaN never equals NaN,
    # so missing values do not count as hits.
    equal = values == values[row]
    ratios = equal.sum(axis=1) / values.shape[1]
    size_hits = equal[:, size_idx].sum(axis=1)
    # Keep each unordered pair once (larger id second). A pair is flagged when
    # more than half of the metafeatures match or both size columns match.
    flagged = (did_array > did1) & ((ratios > 0.5) | (size_hits == 2))
    for other in np.flatnonzero(flagged):
        did2 = dids[other]
        entry = [ratios[other], size_hits[other], all_datasets[did1]['name'], all_datasets[did2]['name'], did1, did2]
        # Interleave the metafeature values: first dataset, second dataset, ...
        entry.extend(np.column_stack((values[row], values[other])).flatten())
        manually_look_at.append(entry)

# NOTE(review): the "automl" column labels are kept from the benchmark
# comparison; here both datasets of a pair come from the candidate pool.
header = ["ratio", "n_hits", "name", "automl name", "did", "automl did"]
mfs = list(np.array([[m, m] for m in metafeature_names]).flatten())
df = pd.DataFrame(manually_look_at, columns = header + mfs)
#with open("/tmp/ManualMetafeatures.csv", "w") as fh:
#    fh.write(df.to_csv())

KeyboardInterrupt: 

In [25]:
# Dataset ids collected by hand; the list is maintained in
# https://docs.google.com/spreadsheets/d/1qVqH_ZVWJ3Eu0B6vXF0q2eQDMwiU5LXToN1L9ZfgLIs/edit#gid=0
print("From:", len(to_consider))
to_remove.update([
    983, 38, 40707, 40708, 40713, 40690, 454, 41156, 40678, 41964,
])

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1112
To: 1102


In [49]:
dataset_names = {did: meta['name'] for did, meta in all_datasets.items()
                 if did in to_consider}

# Individually reviewed datasets that are excluded; the reason is noted per id.
to_remove.update([
    914,  # Balloon dataset, has only a single features
    993,  # not a classification dataset
    1002,  # not a classification dataset
    1018,  # not a classification dataset
    40497,  # regular thyroid dataset
    40517,  # artificial drift detection dataset
    40666,  # derived from the musk dataset (1116) we use
    41158,  # derived from MNIST
    41960,  # appears to not be a classification dataset
    42344,  # appears to not be a classification dataset
])

# recreated samples from automl benchmark datasets
to_remove.update([44593, 44498, 44557, 44618, 44780, 44698, 44729, 44535])

# Apply all exclusions collected so far to the working pool.
for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

To: 1086


In [50]:
# Summary: datasets still in the pool, ids collected in `to_remove`, and the
# size of the original listing.
print(len(to_consider), len(to_remove), TOTAL)

1086 3631 5012


In [51]:
# Listing entries of all datasets that survived the filters, one row per
# dataset id.
final = pd.DataFrame(
    {did: meta for did, meta in all_datasets.items() if did in to_consider}
).T

In [52]:
# Keep one randomly chosen dataset per (NumberOfClasses, NumberOfInstances)
# combination. The fixed random_state makes the pick reproducible across runs.
# NOTE(review): groupby drops rows whose key is NaN by default -- confirm that
# no remaining dataset has a missing NumberOfClasses or NumberOfInstances.
final_filtered = final.groupby(['NumberOfClasses', 'NumberOfInstances']).sample(n=1, random_state=1)

In [None]:
# dataset_ids = list(to_consider)
# datasets_to_tasks = collections.defaultdict(list)
# for task_id, task in all_tasks.items():
#     if task['ttid'] != 1:
#         continue
#     if task['estimation_procedure'] != '33% Holdout set':
#         continue
#     if task['did'] not in to_consider:
#         continue
#     datasets_to_tasks[task['did']].append(task_id)

In [None]:
# Datasets which do not yet have a 33% holdout task associated!
# datasets_without_task = sorted(list(set(to_consider) - set(datasets_to_tasks)))
# for dataset_wo_task in datasets_without_task:  
#     try:
#         d = openml.datasets.get_dataset(int(dataset_wo_task), download_data=False)            
#     except ValueError as e:
#         print(e)
#         continue
#     print(dataset_wo_task)

In [None]:
# # Convert the 10-fold CV OpenML CC18 tasks into holdout tasks
# cc18_did_to_task_id = dict()
# for did in automlbench_dids:
#     if did not in cc18_did_to_task_id:
#         cc18_did_to_task_id[did] = list()
#     for task_id, task in all_tasks.items():
#         if task['ttid'] != 1:
#             continue
#         if task['estimation_procedure'] != '33% Holdout set':
#             continue
#         if task['did'] != did:
#             continue
#         #if 'evaluation_measures' in task:
#         #    continue
#         cc18_did_to_task_id[task['did']].append(task_id)
# print(len(cc18_did_to_task_id))
# #print(cc18_did_to_task_id)
# print([l[0] for l in list(cc18_did_to_task_id.values())])

39
[233, 242, 261, 283, 75099, 75105, 75127, 75193, 126025, 126026, 126029, 146587, 146589, 75227, 75097, 167083, 167104, 75101, 146680, 168792, 168793, 168794, 168795, 168796, 168797, 168798, 189860, 189861, 189862, 189865, 189866, 189871, 189872, 189873, 189874, 168786, 168787, 168789, 168790]
