We use the Airbnb dataset from Homework 3 to illustrate how different AutoML frameworks work: each framework does model selection on the training set, and the selected model is then evaluated on the test set. The error metric we use is the balanced error rate: for each class, we take the average of the false positive rate and the false negative rate, and then we average those per-class values across classes.

In [1]:
import sys
import pandas as pd
import os
import time
from datetime import datetime
import numpy as np
import multiprocessing as mp

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import autosklearn.classification
from autosklearn.metrics import balanced_accuracy

In [2]:
# Add the local automl directory to the module search path so the two
# imports below can be resolved by bare module name (presumably they live
# inside the Oboe checkout at this path — TODO confirm).
automl_path = 'oboe/automl/'
sys.path.append(automl_path)
from auto_learner import AutoLearner
import util

# disable warnings
import warnings
# NOTE: with no category given, 'ignore' silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

Prepare the Airbnb dataset.

In [3]:
airbnb_dataset_size = 3000 # number of points to keep in subsampling

# Load the raw listings; the first row of the CSV is the header.
df_airbnb = pd.read_csv("airbnb.csv", index_col=None, header=0)
# Drop listings whose price is missing. NaN never compares equal to anything
# (not even itself), so a `price == np.nan` mask selects no rows at all;
# dropna is the correct way to remove them.
df_airbnb.dropna(subset=["price"], inplace=True)
features_real = [
  "host_listings_count",
  "host_total_listings_count",
  "accommodates",
  "bathrooms",
  "bedrooms",
  "guests_included",
  "extra_people",
  "minimum_nights",
  "maximum_nights",
  "availability_30",
  "availability_60",
  "availability_90",
  "availability_365",
  "number_of_reviews",
  "review_scores_rating",
  "review_scores_accuracy",
  "review_scores_cleanliness",
  "review_scores_checkin",
  "review_scores_communication",
  "review_scores_location",
  "price"
]

# The prediction target; the single-column frame is flattened to a 1-D label vector.
label = ["review_scores_value"]
x = df_airbnb[features_real].values
y = df_airbnb[label].values.flatten()

# Subsample without replacement, with a fixed seed so the subset is reproducible.
# The sample size is capped at the number of rows left after dropping missing
# prices, because sampling without replacement cannot exceed the population.
np.random.seed(0)
n_to_keep = min(airbnb_dataset_size, y.shape[0])
idx_to_keep = np.random.choice(np.arange(y.shape[0]), size=n_to_keep, replace=False)
x = x[idx_to_keep]
y = y[idx_to_keep]

# Hold out 20% of the subsample as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Part I: auto-sklearn

We may restrict the estimator search space to only search for a good classifier among these models.

In [4]:
# Names of the classifier families the auto-sklearn search is restricted to.
include_estimators = [
    "adaboost",
    "gaussian_nb",
    "extra_trees",
    "gradient_boosting",
    "liblinear_svc",
    "libsvm_svc",
    "random_forest",
    "k_nearest_neighbors",
    "decision_tree",
]

We may also specify a running time limit.

In [5]:
runtime_limit = 120  # running time limit (presumably in seconds — TODO confirm)

In [6]:
# A wrapper function that builds and fits an auto-sklearn classifier.
def AutoSklearn(total_runtime, train_features, train_labels):
    """Fit an auto-sklearn classifier within a time budget and return it.

    The search optimizes ``balanced_accuracy`` and only considers the
    classifiers named in the module-level ``include_estimators`` list.

    Args:
        total_runtime: passed to auto-sklearn as ``time_left_for_this_task``.
        train_features: training features handed to ``clf.fit``.
        train_labels: training labels handed to ``clf.fit``.

    Returns:
        The fitted ``AutoSklearnClassifier``.
    """
    # Take a single timestamp so the tmp and output folders share one suffix,
    # and format it without spaces or colons (which str(datetime.now())
    # contains) so the folder names are valid on every filesystem.
    # Microseconds are kept so that successive runs still get distinct folders.
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    clf = autosklearn.classification.AutoSklearnClassifier(
            time_left_for_this_task=total_runtime,
            tmp_folder='tmp/autosklearn_tmp_' + run_stamp,
            output_folder='tmp/autosklearn_output_' + run_stamp,
            metric=balanced_accuracy,
            include_estimators=include_estimators,
    )

    clf.fit(train_features, train_labels)
    return clf

Run auto-sklearn for 120 seconds.

In [7]:
# Reuse the running time limit configured earlier instead of repeating the literal 120,
# so the budget is defined in exactly one place.
runtime = runtime_limit
clf = AutoSklearn(runtime, x_train, y_train)

Get predicted training and test labels.

In [8]:
# Predict labels for the training split with the fitted auto-sklearn model.
y_train_pred_autosklearn = clf.predict(x_train)

In [9]:
# Predict labels for the held-out test split with the fitted auto-sklearn model.
y_test_pred_autosklearn = clf.predict(x_test)

Show which models the learner has picked.

In [10]:
# Display the model(s) the fitted auto-sklearn learner ended up with.
clf.show_models()

"[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'polynomial', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.21794354428393548, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 2, 'classifier:random_forest:min_samples_split': 16, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_p

Show the error on the training and test datasets.

In [11]:
#auto-sklearn training error
util.error(y_train, y_train_pred_autosklearn, 'classification')

0.10065950071453614

In [12]:
#auto-sklearn test error
util.error(y_test, y_test_pred_autosklearn, 'classification')

0.18850923114927096

# Part II: TPOT

TPOT is an AutoML tool that optimizes machine learning pipelines by genetic programming.

In [13]:
from tpot import TPOTClassifier

Run TPOT for 30 seconds (`max_time_mins=0.5`).

In [14]:
# Configure the TPOT search; the overall time cap is expressed in minutes.
# NOTE(review): no `scoring` is passed, so TPOT optimizes its default metric
# rather than the balanced accuracy used for auto-sklearn — confirm this is intended.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    max_time_mins=0.5,
)
tpot.fit(x_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=20.0, style=ProgressStyle(des…


0.58 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: DecisionTreeClassifier(SelectFwe(input_matrix, alpha=0.042), criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=7)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               log_file=<ipykernel.iostream.OutStream object at 0x7f26b2870780>,
               max_eval_time_mins=5, max_time_mins=0.5, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [15]:
# Predict labels for both splits with the pipeline TPOT settled on.
y_train_pred_tpot = tpot.predict(x_train)
y_test_pred_tpot = tpot.predict(x_test)

Show the error on the training and test datasets.

In [16]:
#TPOT training error, computed by util.error in 'classification' mode
util.error(y_train, y_train_pred_tpot, 'classification')

0.3397069433952885

In [18]:
#TPOT test error, computed by util.error in 'classification' mode
util.error(y_test, y_test_pred_tpot, 'classification')

0.3557644506500882

# Part III: Oboe (still under development)

## Oboe Example 1: build an ensemble of models

In [19]:
#experimental settings for the ensemble example
VERBOSE = False #whether to print out information indicating current fitting progress
N_CORES = 1 #number of cores
RUNTIME_BUDGET = 30 #passed to AutoLearner as runtime_limit (presumably seconds — TODO confirm)

In [20]:
#optional: restrict the search to these algorithm types
s = [
    'AB',
    'ExtraTrees',
    'GNB',
    'KNN',
    'RF',
    'DT',
]

In [21]:
#keyword arguments for AutoLearner; this configuration builds an ensemble
autolearner_kwargs = dict(
    p_type='classification',
    runtime_limit=RUNTIME_BUDGET,
    verbose=VERBOSE,
    selection_method='min_variance',
    algorithms=s,
    stacking_alg='greedy',
    n_cores=N_CORES,
    build_ensemble=True,
)

In [22]:
#initialize the autolearner class with the arguments defined above
m = AutoLearner(**autolearner_kwargs)

In [23]:
# fit autolearner on training set and record runtime
# perf_counter is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start

In [24]:
# predict on the test set with the fitted autolearner, then report the
# ensemble's error, the fitting time, and the per-model values
y_predicted = m.predict(x_test)
ensemble_error = util.error(y_test, y_predicted, 'classification')
print("prediction error: {}".format(ensemble_error))
print("elapsed time: {}".format(elapsed_time))
per_model_scores = m.get_model_accuracy(y_test)
print("individual accuracies of selected models: {}".format(per_model_scores))

prediction error: 0.3139487158460067
elapsed time: 27.216959714889526
individual accuracies of selected models: [0.31651432260061413, 0.32036420549246775, 0.26796126609153437, 0.42171443806106124, 0.31651432260061413]


In [25]:
# get the ensemble method and the selected base learners with their hyperparameter settings
m.get_models()

{'ensemble method': 'greedy selection',
 'base learners': {'DT': [{'min_samples_split': 0.0001},
   {'min_samples_split': 4},
   {'min_samples_split': 1024},
   {'min_samples_split': 1e-05}],
  'GNB': [{}]}}

## Oboe Example 2: just select a collection of promising models without building an ensemble afterwards

In [26]:
#experimental settings for the no-ensemble example
VERBOSE = False #whether to print out information indicating current fitting progress
N_CORES = 1 #number of cores
RUNTIME_BUDGET = 30 #passed to AutoLearner as runtime_limit (presumably seconds — TODO confirm)

In [27]:
#optional: restrict the search to these algorithm types
s = [
    'AB',
    'ExtraTrees',
    'GNB',
    'KNN',
    'RF',
    'DT',
]

In [28]:
#keyword arguments for AutoLearner; this configuration skips building an ensemble
autolearner_kwargs = dict(
    p_type='classification',
    runtime_limit=RUNTIME_BUDGET,
    verbose=VERBOSE,
    selection_method='min_variance',
    algorithms=s,
    stacking_alg='greedy',
    n_cores=N_CORES,
    build_ensemble=False,
)

In [29]:
#initialize the autolearner class with the arguments defined above
m = AutoLearner(**autolearner_kwargs)

In [30]:
# fit autolearner on training set and record runtime
# perf_counter is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
m.fit(x_train, y_train)
elapsed_time = time.perf_counter() - start

In [31]:
# predict on the test set with the fitted autolearner, then report the
# fitting time and the per-model values
y_predicted = m.predict(x_test)

print("elapsed time: {}".format(elapsed_time))
selected_model_scores = m.get_model_accuracy(y_test)
print("accuracies of selected models: {}".format(selected_model_scores))

elapsed time: 10.797972679138184
accuracies of selected models: [0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.3124772208748183, 0.32036420549246775, 0.35449564337367473, 0.3495441004342578, 0.39048041301086595, 0.26796126609153437, 0.42171443806106124, 0.3001999674360773, 0.3515055898002219, 0.2644854823131578, 0.32215416955204695, 0.31565880270077473, 0.377873248684861, 0.297898906174363]


Note that if we do not build an ensemble, we do not get a single accuracy value; instead, we just have a collection of fitted models, each with its own reported accuracy.

The following shows which models we have picked.

In [32]:
# get the selected models, grouped by algorithm, with their hyperparameter settings
m.get_models()

{'DT': [{'min_samples_split': 1e-05},
  {'min_samples_split': 1e-05},
  {'min_samples_split': 0.0001},
  {'min_samples_split': 2},
  {'min_samples_split': 0.001},
  {'min_samples_split': 4},
  {'min_samples_split': 64},
  {'min_samples_split': 128},
  {'min_samples_split': 256},
  {'min_samples_split': 1024},
  {'min_samples_split': 8},
  {'min_samples_split': 16},
  {'min_samples_split': 32},
  {'min_samples_split': 0.01}],
 'GNB': [{}],
 'AB': [{'n_estimators': 50, 'learning_rate': 1},
  {'n_estimators': 50, 'learning_rate': 1.5},
  {'n_estimators': 100, 'learning_rate': 1}]}