# **Auto-Sklearn**

Requirements:
setuptools
pytest>=4.6
Cython

numpy>=1.9.0
pandas<1.0
scipy>=0.14.1

scikit-learn>=0.22.0,<0.23

lockfile
joblib
psutil
pyyaml
liac-arff


ConfigSpace>=0.4.0,<0.5
pynisher>=0.4.2
pyrfr>=0.7,<0.9
smac>=0.12

In [1]:
!apt-get install swig -y
!pip install Cython numpy
!pip install auto-sklearn

Reading package lists... Done
Building dependency tree       
Reading state information... Done
swig is already the newest version (3.0.12-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
import sklearn.datasets
import sklearn.metrics
import autosklearn.classification as classifier
import autosklearn.regression as regressor
from sklearn.model_selection import train_test_split

# **Regression Problem using Autosklearn.regression**

In [3]:
# Load the Boston housing data as a (features, target) pair.
# NOTE(review): load_boston was removed from later scikit-learn releases; it is
# available in the scikit-learn>=0.22,<0.23 range listed in this notebook's requirements.
X, y = sklearn.datasets.load_boston(return_X_y=True)

# Split the dataset into training and test data; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [4]:
# Configure the auto-sklearn regressor with a bounded search budget.
automl = regressor.AutoSklearnRegressor(
    time_left_for_this_task=120,  # run auto-sklearn for at most 2 min in total
    per_run_time_limit=30,        # spend at most 30 sec on each model training
)

# Train model(s): pass the training data to auto-sklearn.
automl.fit(X_train, y_train)

# Evaluate the trained model on the test data.
y_hat = automl.predict(X_test)
# NOTE: despite its name, test_acc holds an R2 score here, not an accuracy.
test_acc = sklearn.metrics.r2_score(y_true=y_test, y_pred=y_hat)
print("Test R2 score {0}".format(test_acc))

Test R2 score 0.7445258026388097


In [5]:
# Print auto-sklearn's description of the models it kept for the regression task.
regression_models_report = automl.show_models()
print(regression_models_report)

[(0.300000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'minmax', 'feature_preprocessor:__choice__': 'no_preprocessing', 'regressor:__choice__': 'adaboost', 'regressor:adaboost:learning_rate': 0.4819561000898008, 'regressor:adaboost:loss': 'exponential', 'regressor:adaboost:max_depth': 9, 'regressor:adaboost:n_estimators': 477},
dataset_properties={
  'task': 4,
  'sparse': False,
  'multioutput': False,
  'target_type': 'regression',
  'signed': False})),
(0.280000, SimpleRegressionPipeline({'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense'

# Classification Problem using Autosklearn.classification

In [6]:
# Load the breast-cancer data and split it into training and test data
# (random_state=1 fixes the split).
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Configure the classifier search: 120 s overall budget, 30 s per model run,
# using the 'holdout' resampling strategy with train_size=0.80.
automl = classifier.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.80},
)

# Pass the training data to auto-sklearn.
automl.fit(X_train, y_train)

# Evaluate the trained model on the test data.
y_hat = automl.predict(X_test)
test_acc = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat)
print("Test Accuracy score: {0}".format(test_acc))

Test Accuracy score: 0.965034965034965


In [7]:
# Load the breast-cancer data as a (features, target) pair.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)

# Split the dataset into training and test data; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
# Configure auto-sklearn with a bounded search budget.
automl = classifier.AutoSklearnClassifier(
    time_left_for_this_task=120,  # run auto-sklearn for at most 2 min in total
    per_run_time_limit=30,        # spend at most 30 sec on each model training
)

# Train model(s): pass the training data to auto-sklearn.
automl.fit(X_train, y_train)

# Evaluate the trained model on the test data.
y_hat = automl.predict(X_test)
test_acc = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat)
print("Test Accuracy score {0}".format(test_acc))

Test Accuracy score 0.972027972027972


Auto-sklearn internally held out part of the training set to estimate the quality of the trained models. Based on this holdout validation set, auto-sklearn reports a validation score.



In [9]:
# Print auto-sklearn's textual summary of the search run.
search_summary = automl.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: 528ab82dffffb5039325fc43a2c64979
  Metric: accuracy
  Best validation score: 0.978723
  Number of target algorithm runs: 13
  Number of successful target algorithm runs: 9
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 4
  Number of target algorithms that exceeded the memory limit: 0



In [10]:
# Print auto-sklearn's description of the models it kept for the classification task.
classification_models_report = automl.show_models()
print(classification_models_report)

[(0.420000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'sgd', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'minmax', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:sgd:alpha': 0.00039927077813935847, 'classifier:sgd:average': 'True', 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:learning_rate': 'constant', 'classifier:sgd:loss': 'log', 'classifier:sgd:penalty': 'l1', 'classifier:sgd:tol': 2.3026724800524452e-05, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0005751185552832477, 'classifier:sgd:eta0': 0.0002226431182528295},
dataset_properties={
  'task': 1,
  'sparse'

In [11]:
# Confusion matrix of the test labels against the predictions.
conf_matrix = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(conf_matrix)

[[51  2]
 [ 2 88]]


In [12]:
# Precision of the predictions on the test split.
test_precision = sklearn.metrics.precision_score(y_true=y_test, y_pred=y_hat)
print(test_precision)

0.9777777777777777


In [13]:
# Recall of the predictions on the test split.
test_recall = sklearn.metrics.recall_score(y_true=y_test, y_pred=y_hat)
print(test_recall)

0.9777777777777777
