
# Classification

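The following example shows how to fit a simple classification model with
*auto-sklearn*. The search is restricted to MLP classifiers, and a custom
pass-through data preprocessor replaces auto-sklearn's built-in data preprocessing.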


In [1]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
import timeit
from sklearn.model_selection import train_test_split

## Custom preprocessor and data loading



In [2]:
from typing import Optional
from pprint import pprint

import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace

from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    """Pass-through data preprocessor: ``transform`` returns its input unchanged."""

    def __init__(self, **kwargs):
        """Store every keyword argument as an instance attribute of the same name."""
        # Per the original author's note, internal checks expect the
        # constructor parameters to be set on the instance.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    def fit(self, X, Y=None):
        """Learn nothing from ``X``/``Y`` and return this instance."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as it was received."""
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        """Describe this component: its names, supported tasks and data formats.

        ``dataset_properties`` is accepted but not used.
        """
        supported_tasks = (
            "handles_regression",
            "handles_classification",
            "handles_multiclass",
            "handles_multilabel",
            "handles_multioutput",
        )
        properties = {"shortname": "NoPreprocessing", "name": "NoPreprocessing"}
        # Every task flag is enabled.
        properties.update(dict.fromkeys(supported_tasks, True))
        properties["is_deterministic"] = True
        properties["input"] = (SPARSE, DENSE, UNSIGNED_DATA)
        properties["output"] = (INPUT,)
        return properties

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        """Return an empty configuration space: there are no hyperparameters to tune.

        Both arguments are accepted but not used.
        """
        empty_space = ConfigurationSpace()
        return empty_space


# Register the NoPreprocessing component with auto-sklearn's data preprocessors
# so it can be selected by name ("NoPreprocessing") in the classifier's `include` dict.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

In [4]:

# Read the cleaned dataset from a path relative to the working directory.
file_path = '../../data/old/clean_data_rf.csv'
df = pd.read_csv(file_path)

# 'Rating' is the prediction target; every other column is used as a feature.
X = df.drop(columns='Rating')
y = df['Rating']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Build and fit a classifier



In [5]:
# Start the wall-clock timer before the classifier is built and fitted.
start_time = timeit.default_timer()

# Restrict the search to MLP classifiers, with no feature preprocessing and the
# custom "NoPreprocessing" data preprocessor.
search_space = {
    'classifier': ["mlp"],
    'feature_preprocessor': ["no_preprocessing"],
    'data_preprocessor': ["NoPreprocessing"],
}
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    include=search_space,
    tmp_folder="tmp/autosklearn_classification_example_tmp1",
)
automl.fit(X_train, y_train, dataset_name="airbnb")

# Look up the configuration behind the first entry of the run history.
runhistory = automl.automl_.runhistory_
run_key = list(runhistory.data.keys())[0]
run_value = runhistory.data[run_key]
config = runhistory.ids_config[run_key.config_id]
print(config)

# The elapsed time covers building, fitting and the configuration lookup above.
end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

  if is_sparse(X[column]):


MLPP INITTT


Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:48<00:00,  1.11it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'mlp',
  'classifier:mlp:activation': 'relu',
  'classifier:mlp:alpha': 0.0001,
  'classifier:mlp:batch_size': 'auto',
  'classifier:mlp:beta_1': 0.9,
  'classifier:mlp:beta_2': 0.999,
  'classifier:mlp:early_stopping': 'valid',
  'classifier:mlp:epsilon': 1e-08,
  'classifier:mlp:hidden_layer_depth': 5,
  'classifier:mlp:learning_rate_init': 0.001,
  'classifier:mlp:n_iter_no_change': 32,
  'classifier:mlp:num_nodes_per_layer': 256,
  'classifier:mlp:shuffle': 'True',
  'classifier:mlp:solver': 'adam',
  'classifier:mlp:tol': 0.0001,
  'classifier:mlp:validation_fraction': 0.1,
  'data_preprocessor:__choice__': 'NoPreprocessing',
  'feature_preprocessor:__choice__': 'no_preprocessing',
})

Execution time: 113.56619377600146 seconds


## View the models found by auto-sklearn



In [6]:
# Tabulate the fitted models (rank, ensemble weight, type, cost, duration).
leaderboard = automl.leaderboard()
print(leaderboard)

          rank  ensemble_weight type      cost  duration
model_id                                                
46           1             0.48  mlp  0.169811  1.670621
36           3             0.10  mlp  0.179245  1.566730
40           2             0.02  mlp  0.179245  0.474872
2            4             0.02  mlp  0.188679  0.490236
42           6             0.02  mlp  0.198113  0.556938
53           5             0.02  mlp  0.198113  1.624462
26           7             0.02  mlp  0.207547  0.698384
37           8             0.10  mlp  0.207547  0.925607
24          10             0.02  mlp  0.216981  0.655036
43           9             0.04  mlp  0.216981  1.257969
68          11             0.12  mlp  0.216981  1.218740
17          12             0.02  mlp  0.226415  0.731372
44          13             0.02  mlp  0.226415  1.118220


## Print the final ensemble constructed by auto-sklearn



In [7]:
# Pretty-print the per-model details returned by show_models(), keyed by model id.
ensemble_models = automl.show_models()
pprint(ensemble_models, indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f5879ab02e0>,
           'cost': 0.18867924528301883,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f5879ab0160>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f5879ab01c0>,
           'model_id': 2,
           'rank': 7,
           'sklearn_classifier': MLPClassifier(beta_1=0.999, beta_2=0.9, early_stopping=True,
              hidden_layer_sizes=(64, 64, 64), max_iter=32, n_iter_no_change=32,
              random_state=1, verbose=0, warm_start=True)},
    17: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f587a27eaf0>,
    

## Get the Score of the final ensemble



In [8]:
# Predict labels for the held-out rows.
predictions = automl.predict(X_test)
print(predictions)

# Show which distinct labels were actually predicted.
print(set(predictions))

# Compare the predictions with the true held-out labels.
accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
print("Accuracy score:", accuracy)

[0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 0.]
{0.0, 1.0}
Accuracy score: 0.7


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
