In [1]:
import numpy as np
import openml
import pandas as pd
import time
from pathlib import Path

In [2]:
# The 39 OpenML dataset ids used for the AutoML benchmark study.
# The original listing order is preserved.
automlbench_dids = [
    3, 12, 31, 54, 1067,
    1111, 1169, 1596, 1590, 1486,
    1461, 1464, 1468, 1489, 4135,
    23517, 40981, 23512, 40668, 41168,
    41166, 41165, 40685, 41159, 41161,
    41150, 41138, 41142, 41143, 41146,
    41147, 41163, 41164, 41167, 41169,
    40975, 40984, 40996, 41027,
]


In [2]:
# Fetch the OpenML dataset listing once. `all_datasets` stays untouched as the
# reference view; `to_consider` is the working copy that the filtering cells
# shrink step by step.
all_datasets = openml.datasets.list_datasets()
# Shallow copy instead of a second request: both views are then guaranteed to
# describe the same snapshot of the server, and only keys of `to_consider` are
# ever deleted, so sharing the per-dataset metadata dicts is safe.
to_consider = dict(all_datasets)
TOTAL = len(all_datasets)

# Ids marked for exclusion; filled cumulatively by the cells below.
to_remove = set()


In [3]:
# Display the listing entry stored under key 3 to see which metadata fields an entry carries.
all_datasets[3]

{'did': 3,
 'name': 'kr-vs-kp',
 'version': 1,
 'uploader': '1',
 'status': 'active',
 'format': 'ARFF',
 'MajorityClassSize': 1669.0,
 'MaxNominalAttDistinctValues': 3.0,
 'MinorityClassSize': 1527.0,
 'NumberOfClasses': 2.0,
 'NumberOfFeatures': 37.0,
 'NumberOfInstances': 3196.0,
 'NumberOfInstancesWithMissingValues': 0.0,
 'NumberOfMissingValues': 0.0,
 'NumberOfNumericFeatures': 0.0,
 'NumberOfSymbolicFeatures': 37.0}

In [8]:
# Benchmark datasets for which the listing reports exactly two classes.
binary_bench_dids = [
    bench_did
    for bench_did in automlbench_dids
    if all_datasets[bench_did]["NumberOfClasses"] == 2
]

In [9]:
# Apply the size limits to the binary benchmark datasets.
#
# Rejected ids are collected in a cell-local set instead of the shared
# `to_remove`: a later cell adds every benchmark id to `to_remove`, so filtering
# against the shared set would empty this list whenever this cell is (re-)run
# after that step.
rejected_bench_dids = set()
for did in binary_bench_dids:
    try:
        meta = all_datasets[did]
        n_instances = meta['NumberOfInstances']
        n_features = meta['NumberOfFeatures']
        n_classes = meta['NumberOfClasses']
    except KeyError:
        # Without the required metadata the limits cannot be checked; reject.
        rejected_bench_dids.add(did)
        continue
    if (
        n_instances < 250
        or n_instances >= 500_000
        or n_features > 2_500
        or n_features < 2
        or n_classes < 2
        or n_instances * n_features > 10_000_000
    ):
        rejected_bench_dids.add(did)

binary_bench_dids = [did for did in binary_bench_dids if did not in rejected_bench_dids]

In [4]:
# Drop the datasets that are themselves part of the AutoML benchmark.
print("From:", len(to_consider))

# Exact id matches against the benchmark list.
to_remove.update(did for did in all_datasets if did in automlbench_dids)

for did in to_remove:
    to_consider.pop(did, None)
# Exactly the 39 benchmark datasets must have disappeared from the working copy.
assert len(all_datasets) - len(to_consider) == 39

print("To:", len(to_consider))

From: 5296
To: 5257


In [5]:
# Exclude datasets with too few/many instances, features or classes, as well as
# datasets whose listing entry lacks one of the fields needed to decide.
print("From:", len(to_consider))

for did, meta in all_datasets.items():
    try:
        n_rows = meta['NumberOfInstances']
        n_cols = meta['NumberOfFeatures']
        n_labels = meta['NumberOfClasses']
    except KeyError:
        to_remove.add(did)
        continue

    outside_limits = (
        n_rows < 250
        or n_rows >= 500_000
        or n_cols > 2_500
        or n_cols < 2
        or n_labels < 2
        or n_rows * n_cols > 10_000_000
    )
    if outside_limits:
        to_remove.add(did)

for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 5257
To: 1527


In [6]:
# Tabular view of the full listing: one row per key of `all_datasets`, one column per metadata field.
# Cells further down patch "NumberOfClasses" in this frame and select rows from it by "did".
all_datasets_as_frame = pd.DataFrame.from_dict(all_datasets, orient='index')

In [7]:
# Remove every remaining dataset whose name starts with "BNG".
print("From:", len(to_consider))

to_remove.update(
    did for did, meta in to_consider.items() if meta['name'].startswith("BNG")
)

for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 1527
To: 1518


In [8]:
# check for similar names and manually check overlaps
# def return_triplets(name):
#     triplets = set()
#     name = name.lower()
#     for i in range(len(name) - 2):
#         triplets.add(name[i: i+3])
#     return triplets

# manually_look_at = []
# for did in to_consider:
#     for did2 in automlbench_dids:
#         automl_triplets = return_triplets(all_datasets[did2]['name'])
#         other_triplets = return_triplets(all_datasets[did]['name'])
#         n_hits = sum([trip in other_triplets for trip in automl_triplets])
#         ratio = n_hits / len(automl_triplets)
#         if n_hits > 2:
#             manually_look_at.append([n_hits, ratio, all_datasets[did]['name'], all_datasets[did2]['name'], did, did2])

# df = pd.DataFrame(manually_look_at)


In [9]:
# Manually collected dataset ids, as recorded in
# https://docs.google.com/spreadsheets/d/1XH1mQFq7-1fs28F-CQJo6i6Wg7IlpDS0aDTAJBoaD7k/edit#gid=0
print("From:", len(to_consider))
to_remove.update((
    14, 16, 18, 20, 22, 36, 150, 179, 180, 958,
    962, 971, 978, 994, 995, 1020, 1022, 1112, 1113, 1114,
    1119, 1242, 1558, 40979, 40997, 40998, 40999, 41000, 41001, 41002,
    41003, 41004, 41005, 41006, 41007, 43900, 43947, 44096, 44097, 44098,
))

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1518
To: 1482


In [10]:
# check for similar metafeatures and manually check overlaps
# metafeature_names = [
#     'MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize', 'NumberOfClasses',
#     'NumberOfFeatures', 'NumberOfInstances', 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
#     'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures',
# ]

# simple_metafeatures_to_consider = pd.DataFrame(
#     {k: v for k, v in all_datasets.items() if k in to_consider}
# ).transpose()[metafeature_names]
# simple_metafeatures_automl = pd.DataFrame(
#     {k: v for k, v in all_datasets.items() if k in automlbench_dids}
# ).transpose()[metafeature_names]

# # Checking the hamming distance of the datasets
# manually_look_at = []
# for did1, mf1 in simple_metafeatures_to_consider.iterrows():
#     for did2, mf2 in simple_metafeatures_automl.iterrows():
#         n_hits = np.sum(mf1 == mf2)
#         ratio = n_hits / len(mf1)
#         n_hits = np.sum(
#             mf1[['NumberOfFeatures', 'NumberOfInstances']] == mf2[['NumberOfFeatures', 'NumberOfInstances']]
#         )
#         if ratio > 0.5 or n_hits == 2:
#             entry = [ratio, n_hits, all_datasets[did1]['name'], all_datasets[did2]['name'], did1, did2]
#             mfs = np.array([[mf1[n], mf2[n]] for n in metafeature_names]).flatten()
#             entry.extend(mfs)
#             manually_look_at.append(entry)

# header = ["ratio", "n_hits", "name", "automl name", "did", "automl did"] 
# mfs = list(np.array([[m, m] for m in metafeature_names]).flatten())
# df = pd.DataFrame(manually_look_at, columns = header + mfs)
# with open("ManualMetafeatures.csv", "w") as fh:
#    fh.write(df.to_csv())            

In [11]:
# Manually collected dataset ids, as recorded in
# https://docs.google.com/spreadsheets/d/1SWEBsFTykdUMq-wBcx9NHchhdolB632EJYaAmQwwQtM/edit#gid=0
print("From:", len(to_consider))
to_remove.update((44153, 44234))

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))
# TODO: check whether we can work usefully with creditcard (1597)

From: 1482
To: 1480


In [12]:
columns = ['name', 'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances']

# One row per dataset still under consideration, restricted to `columns`.
remaining_metadata = {
    did: meta for did, meta in all_datasets.items() if did in to_consider
}
to_consider_stats = pd.DataFrame(remaining_metadata).T[columns]


In [13]:
# Remove datasets that have at least one feature whose data type is 'string'.
# `get_dataset` is called with download_data=False; only `features` is read
# from the returned object.
print("From:", len(to_consider))
start = time.time()
for position, did in enumerate(to_consider, start=1):
    # Progress line every 25 datasets.
    if position % 25 == 0:
        print("%4.2fsec: %d/%d" % (time.time() - start, position, len(to_consider)))
    dataset = openml.datasets.get_dataset(int(did), download_data=False)
    if any(feature.data_type == 'string' for feature in dataset.features.values()):
        to_remove.add(did)

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1480
0.04sec: 25/1480
0.07sec: 50/1480
0.11sec: 75/1480
0.14sec: 100/1480
0.16sec: 125/1480
0.19sec: 150/1480
0.23sec: 175/1480
0.26sec: 200/1480
0.29sec: 225/1480
0.31sec: 250/1480
0.35sec: 275/1480
0.37sec: 300/1480
0.40sec: 325/1480
0.43sec: 350/1480
0.46sec: 375/1480
0.50sec: 400/1480
0.54sec: 425/1480
0.57sec: 450/1480
0.61sec: 475/1480
0.64sec: 500/1480
0.66sec: 525/1480
0.69sec: 550/1480
0.71sec: 575/1480
0.74sec: 600/1480
0.76sec: 625/1480
0.79sec: 650/1480
0.82sec: 675/1480
0.87sec: 700/1480
0.90sec: 725/1480
0.93sec: 750/1480
0.96sec: 775/1480
0.98sec: 800/1480
1.01sec: 825/1480
1.03sec: 850/1480
1.06sec: 875/1480
1.08sec: 900/1480
1.11sec: 925/1480
1.13sec: 950/1480
1.15sec: 975/1480
1.18sec: 1000/1480
1.20sec: 1025/1480
1.25sec: 1050/1480
1.27sec: 1075/1480
1.30sec: 1100/1480
2.10sec: 1125/1480
2.12sec: 1150/1480
2.15sec: 1175/1480
2.17sec: 1200/1480
2.20sec: 1225/1480
2.23sec: 1250/1480
2.26sec: 1275/1480
2.29sec: 1300/1480
2.32sec: 1325/1480
2.46sec: 1350/1480
2.49s

In [14]:
# remove identical/similar datasets (to reduce the load computing the meta-data)
# check for similar names and manually check overlaps
# def return_triplets(name):
#     triplets = set()
#     name = name.lower()
#     for i in range(len(name) - 2):
#         triplets.add(name[i: i+3])
#     return triplets

# manually_look_at = []
# for i, did in enumerate(to_consider):
#     for j, did2 in enumerate(to_consider):
#         if j <= i:
#             continue
#         if len(all_datasets[did2]['name']) < 3:
#             continue
#         other2_triplets = return_triplets(all_datasets[did2]['name'])
#         other_triplets = return_triplets(all_datasets[did]['name'])
#         n_hits = sum([trip in other_triplets for trip in other2_triplets])
#         ratio = n_hits / len(other2_triplets)
#         if n_hits > 2:
#             manually_look_at.append([n_hits, ratio, all_datasets[did]['name'], all_datasets[did2]['name'], did, did2])

# df = pd.DataFrame(manually_look_at)


In [15]:
# Hand-picked ids to drop.
# NOTE(review): presumably the outcome of the name-similarity check kept as
# commented-out code in the preceding cell -- confirm.
print("From:", len(to_consider))
to_remove.update((
    989, 977, 1222, 997, 1568, 980, 1021, 1019, 1023,
    953, 1000, 40474, 40475, 40476, 40477, 40478,
    40479, 979, 720, 1557, 990, 41966, 1016, 954,
    40597, 976, 1004, 966, 970, 1014, 741, 774, 795,
    827, 931, 843, 853, 959, 987, 1037, 1038, 1040,
    1560, 1467, 1476, 1566, 1492, 1493, 1525, 1526,
    40687, 40926, 41945, 41946, 42140, 42141, 42192,
))

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 1227
To: 1174


In [16]:
# Subsample from 'topics' where there are dozens of super-similar
# datasets from one source.
dataset_names = {did: value['name'] for did, value in all_datasets.items()
                 if did in to_consider}

# Seeded generator; every draw below must go through `rng` (not the global
# `np.random` state) so that the subsample is reproducible across runs.
rng = np.random.RandomState(1)

# Friedman datasets
friedman_datasets = [
    did for did, name in dataset_names.items() if name.startswith('fri_c')
]
print(friedman_datasets)

# volcanoes datasets
volcanoes_datasets = [
    did for did, name in dataset_names.items()
    if name.startswith('volcanoes-')
]
print(volcanoes_datasets)

# AutoUniv
auto_univ_datasets = [
    did for did, name in dataset_names.items()
    if name.startswith('autoUniv-')
]
print(auto_univ_datasets)

for dataset_ids_to_sample_from, num_keep in (
    (friedman_datasets, 10),
    # (ova_datasets, 7),
    (volcanoes_datasets, 10),
    (auto_univ_datasets, 4),
):
    num_drop = len(dataset_ids_to_sample_from) - num_keep
    if num_drop <= 0:
        # The group already has at most `num_keep` members; a negative sample
        # size would make `choice` raise.
        continue
    choices_to_drop = rng.choice(
        dataset_ids_to_sample_from,
        replace=False,
        size=num_drop,
    )
    # `.tolist()` yields plain Python ints for the dictionary lookups.
    for choice in choices_to_drop.tolist():
        to_consider.pop(choice, None)
print("To:", len(to_consider))

[715, 718, 723, 730, 732, 740, 742, 743, 744, 746, 749, 751, 763, 766, 769, 773, 776, 779, 792, 793, 794, 797, 799, 805, 806, 813, 824, 830, 832, 834, 837, 838, 845, 849, 855, 863, 866, 869, 870, 873, 877, 879, 884, 888, 896, 903, 904, 910, 911, 912, 913, 917, 918, 920, 926, 933, 935, 936, 937, 943]
[1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546]
[1547, 1548, 1549, 1551, 1552, 1553, 1554, 1555]
To: 1110


In [17]:
# Delete a few other dataset families, identified by their name prefix:
#   - 'Click_prediction': these datasets consist almost exclusively of IDs
#   - 'FOREX': time series datasets with a time stamp; regularly shuffled
#     tasks won't work here
for name_prefix in ('Click_prediction', 'FOREX'):
    dataset_names = {
        did: value['name']
        for did, value in all_datasets.items()
        if did in to_consider and value['name'].startswith(name_prefix)
    }
    for did in dataset_names:
        to_consider.pop(did, None)
print("To:", len(to_consider))

To: 907


In [18]:
# Datasets excluded after inspection; the reason is noted next to each id.
print("From:", len(to_consider))
to_remove |= {
    312,    # multilabel
    313,    # hierarchical classification
    316,    # multilabel classification
    378,    # unclear target
    381,    # unclear target
    382,    # unclear target
    1178,   # multilabel
    1179,   # multilabel
    1472,   # regression
    1477,   # not a regular classification dataset (stream)
    40588,  # multilabel
    40589,  # multilabel
    40590,  # multilabel
    40591,  # multilabel
    40592,  # multilabel
    40593,  # multilabel
    40594,  # multilabel
    40595,  # multilabel
    40596,  # multilabel
    40597,  # multilabel
    40686,  # multilabel
    40687,  # multilabel
    40702,  # multilabel
    40910,  # stream dataset
    41103,  # description says 'CIFAR-10 dataset but with some modifications'
    41526,  # is named test_dataset
}

for did in to_remove:
    to_consider.pop(did, None)

print("To:", len(to_consider))

From: 907
To: 886


In [19]:
# check for similar metafeatures and manually check overlaps
# metafeature_names = [
#     'MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize', 'NumberOfClasses',
#     'NumberOfFeatures', 'NumberOfInstances', 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
#     'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures',
# ]

# simple_metafeatures_to_consider = pd.DataFrame(
#     {k: v for k, v in all_datasets.items() if k in to_consider}
# ).transpose()[metafeature_names]

# # Checking the hamming distance of the datasets
# manually_look_at = []
# for did1, mf1 in simple_metafeatures_to_consider.iterrows():
#     for did2, mf2 in simple_metafeatures_to_consider.iterrows():
#         if did2 <= did1:
#             continue
#         n_hits = np.sum(mf1 == mf2)
#         ratio = n_hits / len(mf1)
#         n_hits = np.sum(
#             mf1[['NumberOfFeatures', 'NumberOfInstances']] == mf2[['NumberOfFeatures', 'NumberOfInstances']]
#         )
#         if ratio > 0.5 or n_hits == 2:
#             entry = [ratio, n_hits, all_datasets[did1]['name'], all_datasets[did2]['name'], did1, did2]
#             mfs = np.array([[mf1[n], mf2[n]] for n in metafeature_names]).flatten()
#             entry.extend(mfs)
#             manually_look_at.append(entry)

# header = ["ratio", "n_hits", "name", "automl name", "did", "automl did"] 
# mfs = list(np.array([[m, m] for m in metafeature_names]).flatten())
# df = pd.DataFrame(manually_look_at, columns = header + mfs)
# #with open("/tmp/ManualMetafeatures.csv", "w") as fh:
#    fh.write(df.to_csv()) 

In [20]:
# Hand-picked ids to drop.
# NOTE(review): presumably the outcome of the metafeature-similarity check kept
# as commented-out code in the preceding cell -- confirm.
print("From:", len(to_consider))
to_remove.update((
    983, 38, 40707, 40708, 40713, 40690, 454, 41156,
    40678, 41964,
))

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

From: 886
To: 876


In [21]:
dataset_names = {did: value['name'] for did, value in all_datasets.items()
                 if did in to_consider}

# Datasets rejected one by one, each with its reason.
individually_rejected = [
    914,    # Balloon dataset, has only a single feature
    993,    # not a classification dataset
    1002,   # not a classification dataset
    1018,   # not a classification dataset
    40497,  # regular thyroid dataset
    40517,  # artificial drift detection dataset
    40666,  # derived from the musk dataset (1116) we use
    41158,  # derived from MNIST
    41960,  # appears to not be a classification dataset
    42344,  # appears to not be a classification dataset
    42931,  # string dataset
    183,    # too many gama errors, can't evaluate pipelines
    4552,   # too many gama errors, can't evaluate pipelines
    44186,  # dataset version copy
    45019,  # dataset version copy
    40700,  # dataset version copy
    43901,  # only ids as features
    41463,  # 1 feature with tweets, earlier filtering failed
    4340,   # too imbalanced
    44533,  # too many classes, problems with cross validation
    45103,  # too many classes, problems with cross validation
    44534,  # too imbalanced
    45102,  # too imbalanced
]

# Recreated samples from automl benchmark datasets.
benchmark_resamples = [44593, 44498, 44557, 44618, 44780, 44698, 44729, 44535]

# Not accessible due to errors while accessing the data from openml.
inaccessible = [41949, 43148, 43147, 42716]

to_remove.update(individually_rejected, benchmark_resamples, inaccessible)

for did in to_remove:
    to_consider.pop(did, None)
print("To:", len(to_consider))

To: 851


In [26]:
# Classification datasets with NumberOfClasses == 0 in the metafeatures.
clf_wrong_metafeatures = {
    231, 298, 301, 516, 524, 703, 1028, 1097, 1228,
    1430, 1432, 1433, 1571, 1572, 1574, 1575, 1579, 1589,
    1591, 1593, 4532, 23395, 41943, 42175, 42176, 42464, 42636,
}

# Replacement class counts, one value per id above (27 in total).
# NOTE(review): presumably ordered to match the ids in ascending order, which is
# how the set literal is written -- confirm before relying on the pairing.
classes = [
    2, 2, 2, 3, 3, 3, 4, 3, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 2, 2, 2, 2, 2, 2, 2,
]

In [27]:
# Metadata of every dataset that survived the filtering, one row per dataset id.
surviving_metadata = {
    did: meta for did, meta in all_datasets.items() if did in to_consider
}
final = pd.DataFrame(surviving_metadata).T

In [28]:
# Keep a single dataset per (NumberOfClasses, NumberOfInstances) combination.
# The fixed `random_state` makes the pick reproducible; without it a different
# dataset could be selected from each group on every run.
final_filtered = final.groupby(['NumberOfClasses', 'NumberOfInstances']).sample(n=1, random_state=1)

In [29]:
# Patch the class counts of the datasets listed in `clf_wrong_metafeatures`.
# Values are looked up by dataset id instead of being assigned positionally, so
# the result depends neither on the row order of the frame nor on every id
# being present in the listing.
# NOTE(review): assumes `classes` is ordered like the ids in ascending order
# (the order in which the set literal is written) -- confirm.
assert len(classes) == len(clf_wrong_metafeatures)
class_count_by_did = dict(zip(sorted(clf_wrong_metafeatures), classes))
needs_patch = all_datasets_as_frame["did"].isin(clf_wrong_metafeatures)
all_datasets_as_frame.loc[needs_patch, "NumberOfClasses"] = (
    all_datasets_as_frame.loc[needs_patch, "did"].map(class_count_by_did)
)


In [30]:
# Append the rows of the datasets listed in `clf_wrong_metafeatures` to the
# one-per-group selection.
rows_with_fixed_classes = all_datasets_as_frame[
    all_datasets_as_frame["did"].isin(clf_wrong_metafeatures)
]
final_frame = pd.concat([final_filtered, rows_with_fixed_classes])

In [31]:
# Split the selection into binary (exactly 2 classes) and multiclass (more than 2).
is_binary = final_frame["NumberOfClasses"] == 2
is_multiclass = final_frame["NumberOfClasses"] > 2
final_binary = final_frame.loc[is_binary]
final_multi = final_frame.loc[is_multiclass]

In [44]:
# Uncomment to overwrite ids

# final_multi["did"].to_csv("dataset_ids/multiclass_dids.csv", index = False)
# final_binary["did"].to_csv("dataset_ids/binary_dids.csv", index = False)