In [10]:
%matplotlib inline
# Dependencies and session-wide configuration for the dataset selection notebook.
import openml as oml
import numpy as np
import pandas as pd
import sys
import math
from scipy.stats import norm
from matplotlib import pyplot
import sklearn.tree
import sklearn.preprocessing
# NOTE(review): `sklearn.externals.joblib` only exists in older scikit-learn
# releases -- confirm the pinned version before upgrading.
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
# Put the relative 'Python' directory first on the import search path; presumably
# this is where the project-local `openmlstudy14` package lives -- TODO confirm.
# Must run before the `openmlstudy14` import below.
sys.path.insert(0,'Python')
from openmlstudy14.preprocessing import ConditionalImputer
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Step 0: Get all datasets from OpenML
- Only classification datasets
- Only active (verified) datasets

In [2]:
# Fetch the catalogue of OpenML datasets that have the requested status.
# The status filter is applied by the listing call itself, so no separate
# `datalist.status == 'active'` filter is needed afterwards.
status_type = 'active'
openml_list = oml.datasets.list_datasets(status=status_type)  # returns a dict

# One row per entry of the returned dict.
datalist = pd.DataFrame.from_dict(openml_list, orient='index')

# Keep classification datasets only, i.e. those reporting at least two classes.
datalist = datalist.loc[datalist.NumberOfClasses >= 2]
print("{} {} classification datasets".format(len(datalist), status_type))

1030 active classification datasets


In [3]:
# Bookkeeping structures shared by all later steps.
# data_names:    'did' column value -> 'name' column value
# data_status:   index label -> 'OK', or the reason the dataset was removed
# datalist_full: unfiltered copy of the candidate list, kept for the duplicate check
data_names = dict(datalist[['did','name']].values)
data_status = dict.fromkeys(datalist.index, 'OK')
datalist_full = datalist.copy()

## Step 1: Apply simple preconditions
- At least 500 observations (meaningful evaluations)
- At most 100000 observations (keep runtime manageable)
- At most 5000 features (keep runtime manageable)
- The ratio of the minority class size to the majority class size is at least 0.05 (severely imbalanced datasets complicate analysis)
- Number of values for categorical features must not exceed 100 (severely slows down some algorithms); note that this check is currently commented out in the code below
- Data is not in sparse format (sparse ARFF requires special data readers)

In [4]:
# Apply preconditions.
# Each entry pairs a removal reason with a boolean mask over `datalist`.
# Entries are applied in order, so when a dataset fails several checks the
# reason listed last is the one that is recorded.
precondition_failures = [
    ('Too small', datalist.NumberOfInstances < 500),
    ('Too large', datalist.NumberOfInstances > 100000),
    ('High-dimensional', datalist.NumberOfFeatures > 5000),
    ('Extreme imbalance', datalist.MinorityClassSize / datalist.MajorityClassSize < 0.05),
    # Disabled check:
    # ('Too many categories', datalist.MaxNominalAttDistinctValues > 100),
    ('Sparse format', datalist.format == 'Sparse_ARFF'),
]
for reason, failed in precondition_failures:
    data_status.update(dict.fromkeys(datalist.index[failed], reason))

# Filter dataset list: keep only the rows whose status is still 'OK'
is_ok = pd.Series({did: status == 'OK' for did, status in data_status.items()})
datalist = datalist[is_ok]

# Status update: number of datasets per status
statuses = list(data_status.values())
[[status, statuses.count(status)] for status in set(statuses)]

[['High-dimensional', 65],
 ['OK', 322],
 ['Too large', 111],
 ['Extreme imbalance', 171],
 ['Sparse format', 30],
 ['Too small', 331]]

## Step 2: Filter out special datasets
- Artificial datasets (may bias the results)
- Time series datasets (cannot use random sampling for evaluation)
- Text data (contains string features which need additional preprocessing)
- Multi-label data (multiple targets need to be predicted)
- Derived versions of datasets (with additional preprocessing)
- Datasets where the intended classification target is unclear
- Binarized regression problems
- Unknown origin (no description of how the data was collected or what the problem is)
- Grouped data (instances form groups (blocks) and cannot be randomly sampled)
- Datasets tagged as having label leakage
- Datasets tagged as predictable from a single feature

In [5]:
# Get lists of special datasets
def _ids_tagged(tag):
    """Return the set of dataset ids listed by OpenML for `tag` and `status_type`."""
    return set(oml.datasets.list_datasets(tag=tag, status=status_type).keys())

artificial_set = _ids_tagged("artificial")
timeseries_set = _ids_tagged("time_series")
text_set = _ids_tagged("text_data")
multilabel_set = _ids_tagged("multi_label")
derived_set = _ids_tagged("derived")
unspecified_set = _ids_tagged("unspecified_target_feature")
binarized_set = _ids_tagged("binarized_regression_problem")
unknown_set = _ids_tagged("origin_unknown")
grouped_set = _ids_tagged("grouped_data")
label_leakage = _ids_tagged("label_leakage")
single_feature = _ids_tagged("single_feature_predictable")

# Applied in this order. `update` overwrites whatever status an id already had
# (including the reasons recorded by the precondition step) and also inserts
# ids that were not present in `data_status` before.
special_reasons = [
    ('Artificial data', artificial_set),
    ('Time series data', timeseries_set),
    ('Text data', text_set),
    ('Multi-label data', multilabel_set),
    ('Derived (non-original) data', derived_set),
    ('Unspecified target feature', unspecified_set),
    ('Binarized regression problem', binarized_set),
    ('Unknown origin', unknown_set),
    ('Grouped data', grouped_set),
    ('Label leakage', label_leakage),
    ('Single feature', single_feature),
    # Disabled:
    # ('OpenML100', openml100_set),
]
for reason, tagged_ids in special_reasons:
    data_status.update(dict.fromkeys(tagged_ids, reason))

# Filter dataset list: keep only the rows whose status is still 'OK'
is_ok = pd.Series({did: status == 'OK' for did, status in data_status.items()})
datalist = datalist[is_ok]

# Status update: number of datasets per status
statuses = list(data_status.values())
[[status, statuses.count(status)] for status in set(statuses)]

[['High-dimensional', 65],
 ['Unknown origin', 6],
 ['Derived (non-original) data', 42],
 ['Time series data', 7],
 ['OK', 137],
 ['Too large', 25],
 ['Extreme imbalance', 105],
 ['Artificial data', 207],
 ['Multi-label data', 7],
 ['Sparse format', 30],
 ['Grouped data', 2],
 ['Binarized regression problem', 88],
 ['Unspecified target feature', 5],
 ['Label leakage', 2],
 ['Too small', 330],
 ['Text data', 2]]

## Step 3: Remove alternative versions of datasets
- Remove binarized versions of multi-class datasets
- Check other possible duplicates

In [6]:
# Sort by name, and within a name by descending number of classes, so that a
# multi-class version of a dataset is seen before its 2-class versions.
# The full list is used because there may be binarized versions of datasets
# that were already removed in an earlier step.
datalist_full = datalist_full.sort_values(by=['name','NumberOfClasses'], ascending=[True, False])

# Manually verified datasets: correct id -> ids of its known duplicates
checked_datasets = {
    40979: [1022, 20], # Correct version of mfeat-pixel
    40984: [958, 36], # Correct version of segment
    40994: [40990, 40989, 1467], # Correct version of climate-model-simulation-crashes
    1590: [179], # Correct version of adult
    40983: [1570], # Correct version of wilt
    40945: [40704], # Correct version of Titanic
    772: [948], # Correct version of classification version of the quake dataset
    40966: [40965, 40964], # Correct version of MiceProtein (needed when including in_preparation datasets)
    40982: [40973, 1504], # Correct version of steel-plates-fault (needed when including in_preparation datasets)
    23380: [1024, 473], # Correct version of cjs (needed when including in_preparation datasets)
    40597: [40733], # Correct version of yeast (needed when including in_preparation datasets)
    1046: [40829], # Correct version of mozilla4 (needed when including in_preparation datasets)
    # : [40958], # Correct version of Bankdata (needed when including in_preparation datasets)
}

# Inverse mapping: duplicate id -> id of the correct version
duplicates_of = {
    dup_id: correct_id
    for correct_id, dup_ids in checked_datasets.items()
    for dup_id in dup_ids
}

# Statuses that this step is allowed to overwrite
still_selectable = ('OK', 'Possible duplicate')

data_unique = {}  # dataset name -> first row seen with that name
for _, row in datalist_full.iterrows():
    did = row['did']
    name = row['name']

    # Known duplicate of a verified dataset: always marked, whatever its status was.
    if did in duplicates_of:
        data_status[did] = 'Duplicate of %d' % duplicates_of[did]
        continue

    # Verified correct version that is still selectable: keep it as 'OK'.
    # It is deliberately not registered in data_unique.
    if did in checked_datasets and data_status[did] in still_selectable:
        data_status[did] = 'OK'
        continue

    # First dataset seen with this name becomes the reference for that name.
    if name not in data_unique:
        data_unique[name] = row
        continue

    # Same name as an earlier row: either a binarized version or a possible duplicate.
    previous = data_unique[name]
    if previous['NumberOfClasses'] > 2 and row['NumberOfClasses'] == 2:
        data_status[did] = 'Binarized version of multiclass dataset'
    elif data_status[did] in still_selectable:
        data_status[did] = 'Possible duplicate'

# Filter dataset list: keep only the rows whose status is still 'OK'
is_ok = pd.Series({did: status == 'OK' for did, status in data_status.items()})
datalist = datalist[is_ok]

# Status update: number of datasets per status
statuses = list(data_status.values())
[[status, statuses.count(status)] for status in set(statuses)]

[['Duplicate of 1590', 1],
 ['OK', 101],
 ['Too large', 25],
 ['Duplicate of 40994', 1],
 ['Unspecified target feature', 5],
 ['Unknown origin', 6],
 ['Duplicate of 40945', 1],
 ['Extreme imbalance', 103],
 ['Multi-label data', 6],
 ['Too small', 291],
 ['High-dimensional', 65],
 ['Time series data', 7],
 ['Duplicate of 40984', 2],
 ['Grouped data', 2],
 ['Binarized regression problem', 86],
 ['Label leakage', 2],
 ['Text data', 2],
 ['Binarized version of multiclass dataset', 81],
 ['Derived (non-original) data', 35],
 ['Artificial data', 206],
 ['Duplicate of 40982', 1],
 ['Sparse format', 29],
 ['Duplicate of 40979', 2]]

In [7]:
# These need to be checked
[k for k,v in data_status.items() if v=='Possible duplicate']

[]

## Step 4: Remove trivial datasets
- See if a model (e.g. random forest) based on 1 feature can get perfect CV performance. (JvR: this code was removed, but the Irish and cjs datasets were flagged by this check and will be tagged.)
- See if a cross-validated decision tree (flow id 7777) or a cross-validated logistic regression (flow id 7778) achieves a perfect predictive accuracy

In [8]:
# Fetch the predictive-accuracy evaluations of the two reference flows once each
# and reuse them below (7777: CV decision tree, 7778: CV logistic regression).
dt_evaluations = oml.evaluations.list_evaluations('predictive_accuracy', flow=[7777])
lr_evaluations = oml.evaluations.list_evaluations('predictive_accuracy', flow=[7778])

# Dataset ids on which the flow reached a predictive accuracy of exactly 1
perfect_scores_dt = {int(ev.data_id) for ev in dt_evaluations.values() if ev.value == 1}
perfect_scores_lr = {int(ev.data_id) for ev in lr_evaluations.values() if ev.value == 1}

# The evaluation listings are not restricted to the ids tracked in data_status,
# so look statuses up with .get(): an untracked id is skipped instead of
# raising KeyError.
for did in perfect_scores_dt:
    if data_status.get(did) == 'OK':
        data_status[did] = 'Too easy (perfect score decision tree)'

for did in perfect_scores_lr:
    if data_status.get(did) == 'OK':
        data_status[did] = 'Too easy (perfect score logistic regression)'

# Datasets that are still 'OK' but have no decision tree evaluation at all
# cannot be declared non-trivial, so they are flagged as unchecked.
checked_datasets = {int(ev.data_id) for ev in dt_evaluations.values()}
for did in data_status:
    if data_status[did] == 'OK' and did not in checked_datasets:
        data_status[did] = 'Dataset not checked for triviality by DT!'

# The equivalent check for logistic regression is currently disabled; only the
# set of LR-evaluated dataset ids is computed.
checked_datasets = {int(ev.data_id) for ev in lr_evaluations.values()}
# for did in data_status:
#     if data_status[did] == 'OK' and did not in checked_datasets:
#         data_status[did] = 'Dataset not checked for triviality by LR!'

# Status update: number of datasets per status
statuses = list(data_status.values())
[[status, statuses.count(status)] for status in set(statuses)]

[['Too easy (perfect score decision tree)', 2],
 ['Duplicate of 1590', 1],
 ['OK', 79],
 ['Too large', 25],
 ['Duplicate of 40994', 1],
 ['Unspecified target feature', 5],
 ['Unknown origin', 6],
 ['Duplicate of 40945', 1],
 ['Extreme imbalance', 103],
 ['Multi-label data', 6],
 ['Too small', 291],
 ['High-dimensional', 65],
 ['Dataset not checked for triviality by DT!', 20],
 ['Time series data', 7],
 ['Duplicate of 40984', 2],
 ['Grouped data', 2],
 ['Binarized regression problem', 86],
 ['Label leakage', 2],
 ['Text data', 2],
 ['Binarized version of multiclass dataset', 81],
 ['Derived (non-original) data', 35],
 ['Artificial data', 206],
 ['Duplicate of 40982', 1],
 ['Sparse format', 29],
 ['Duplicate of 40979', 2]]

In [18]:
from tqdm import tqdm, tqdm_notebook

# Snapshot of the ids that are still selected; data_status is modified in the loop.
datasets = [did for did, status in data_status.items() if status == 'OK']

# dataset id -> {'score': training accuracy of a decision stump, 'name': dataset name}
max_score_per_dataset = {}
for dataset_id in tqdm_notebook(datasets):
    try:
        dataset = oml.datasets.get_dataset(dataset_id)
        target_name = dataset.default_target_attribute
        if target_name is None:
            data_status[dataset_id] = 'No target specified'
            print('No target')
            continue
        X, y = dataset.get_data(target=target_name)

        # One-hot encode the nominal features to find the true number of
        # features the learners would see.
        categorical_indices = dataset.get_features_by_type('nominal', [target_name])
        imputer = ConditionalImputer(
            strategy='median',
            strategy_nominal='mean',
            categorical_features=categorical_indices,
            fill_empty=-1,
        )
        encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=categorical_indices)
        clf = sklearn.pipeline.Pipeline(steps=[('imputer', imputer), ('encoder', encoder)])
        hotencoded = clf.fit_transform(X)
        n_encoded_features = hotencoded.shape[1]
        if n_encoded_features > 5000:
            data_status[dataset_id] = 'High-dimensional (after one hot encoding.)'
            print(dataset.name, 'too high one-hot-encoded dimensionality', n_encoded_features)
            continue

        # Decision stump (depth-1 tree), fitted and scored on the same data.
        clf = sklearn.pipeline.Pipeline(
            steps=[
                ('imputer', sklearn.preprocessing.Imputer(strategy='median')),
                ('classifier', sklearn.tree.DecisionTreeClassifier(max_depth=1)),
            ]
        )
        _ = clf.fit(X, y)
        score = clf.score(X, y)

        # TODO: obtain the cv task

        max_score_per_dataset[dataset_id] = {
            'score': score,
            'name': dataset.name
        }
    except ValueError as e:
        data_status[dataset_id] = 'Python ValueError'
        print(dataset_id, e)
    except Exception as e:
        # Any other failure is recorded as the dataset's status and the loop moves on.
        data_status[dataset_id] = 'Python Exception'
        print(dataset_id, e)
    else:
        # Reached only when the stump was scored (no exception, no `continue` above).
        if max_score_per_dataset[dataset_id]["score"] == 1.00:
            data_status[dataset_id] = 'Too easy (decisionstump on trainset)'
            print("Dataset ", dataset.name, "is too easy.")

# One row per scored dataset
results = pd.DataFrame(max_score_per_dataset).transpose()




In [19]:
# Show the decision stump results ordered by training score, lowest first
results.sort_values('score')

Unnamed: 0,name,score
1493,one-hundred-plants-texture,0.0193871
1492,one-hundred-plants-shape,0.02
1491,one-hundred-plants-margin,0.02
40923,Devnagari-Script,0.0401522
6,letter,0.0718
300,isolet,0.0766962
40971,collins,0.109
307,vowel,0.167677
40499,texture,0.180545
1515,micro-mass,0.182137


## Step 5: Remove datasets for other reasons

In [20]:
# Manual exclusion; this overwrites whatever status dataset 40705 had before.
data_status.update({40705: 'Missing description.'})

## Step 6a: Final selection
Final list of selected datasets:

In [21]:
# Ids of all datasets that survived every filter
final_datasets = [did for did in data_status if data_status[did] == 'OK']
print(len(final_datasets), 'datasets selected')
# id -> name of the selected datasets (only ids that have an entry in data_names)
{did: name for did, name in data_names.items() if did in final_datasets}

74 datasets selected


{3: 'kr-vs-kp',
 6: 'letter',
 12: 'mfeat-factors',
 14: 'mfeat-fourier',
 15: 'breast-w',
 16: 'mfeat-karhunen',
 18: 'mfeat-morphological',
 22: 'mfeat-zernike',
 23: 'cmc',
 28: 'optdigits',
 29: 'credit-approval',
 31: 'credit-g',
 32: 'pendigits',
 37: 'diabetes',
 38: 'sick',
 42: 'soybean',
 44: 'spambase',
 46: 'splice',
 50: 'tic-tac-toe',
 54: 'vehicle',
 151: 'electricity',
 182: 'satimage',
 188: 'eucalyptus',
 300: 'isolet',
 307: 'vowel',
 469: 'analcatdata_dmft',
 554: 'mnist_784',
 1049: 'pc4',
 1050: 'pc3',
 1053: 'jm1',
 1063: 'kc2',
 1067: 'kc1',
 1068: 'pc1',
 1461: 'bank-marketing',
 1462: 'banknote-authentication',
 1464: 'blood-transfusion-service-center',
 1468: 'cnae-9',
 1475: 'first-order-theorem-proving',
 1478: 'har',
 1480: 'ilpd',
 1485: 'madelon',
 1486: 'nomao',
 1487: 'ozone-level-8hr',
 1489: 'phoneme',
 1491: 'one-hundred-plants-margin',
 1492: 'one-hundred-plants-shape',
 1493: 'one-hundred-plants-texture',
 1494: 'qsar-biodeg',
 1497: 'wall-robot-n

## Step 6b: Check tags
Passed all tests, but not in CC-18:

In [27]:
# Ids of all datasets carrying the OpenML-CC18 tag (no status filter here)
CC18_set = set(oml.datasets.list_datasets(tag="OpenML-CC18"))

# Datasets that passed every check but are not tagged as CC18
new_datasets = [did for did in data_status if data_status[did] == 'OK' and did not in CC18_set]
{did: name for did, name in data_names.items() if did in new_datasets}

{42: 'soybean',
 1491: 'one-hundred-plants-margin',
 1492: 'one-hundred-plants-shape',
 1493: 'one-hundred-plants-texture',
 1515: 'micro-mass',
 40536: 'SpeedDating',
 40971: 'collins'}

Datasets tagged with CC18 that did not pass all tests:

In [28]:
# CC18-tagged datasets whose status is anything other than 'OK'.
# NOTE(review): the name `new_datasets` is reused from the previous cell even
# though these are not new datasets; only ids present in data_status are considered.
new_datasets = [did for did in data_status if did in CC18_set and data_status[did] != 'OK']
{did: name for did, name in data_names.items() if did in new_datasets}

{11: 'balance-scale',
 458: 'analcatdata_authorship',
 40927: 'CIFAR_10',
 40975: 'car',
 40978: 'Internet-Advertisements'}

## Step 6c: Overview of exclusions
Reasons to exclude datasets:

In [24]:
# Group the dataset ids by status, ids in ascending order within each status
ids_by_status = {}
for did in sorted(data_status):
    ids_by_status.setdefault(data_status[did], []).append(did)
# Render each id list as a string so that it displays on a single line
{status: str(ids) for status, ids in ids_by_status.items()}

{'Artificial data': '[11, 60, 70, 71, 72, 73, 74, 75, 76, 77, 78, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 146, 147, 148, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 271, 272, 333, 334, 335, 581, 1120, 1177, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1459, 1460, 1479, 149