
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [9]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import timeit
import pandas as pd
from sklearn.model_selection import train_test_split

## Data Loading



In [10]:

# Read the Titanic CSV and discard rows whose target ('Survived') is missing,
# since they cannot be used for supervised training or evaluation.
file_path = '../data/titanic_dirty_data.csv'
df = pd.read_csv(file_path).dropna(subset=['Survived'])

# Separate the label column from the feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Build and fit a classifier



In [11]:
start_time = timeit.default_timer()

# Restrict the search to MLP classifiers with no feature preprocessing and
# give the whole search a 120-second budget.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    include={
        'classifier': ["mlp"],
        'feature_preprocessor': ["no_preprocessing"],
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp2",
)

# Label the run with the dataset that is actually loaded (the Titanic CSV);
# the previous "airbnb" name was a leftover that mislabelled the run.
automl.fit(X_train, y_train, dataset_name="titanic")

# Stop the clock right after fitting so the reported time does not include
# the run-history inspection and printing below.
end_time = timeit.default_timer()
execution_time = end_time - start_time

# Get the configuration for one model/run: the first key stored in the run
# history. Nothing here selects the best run -- it is simply the first entry.
run_key = next(iter(automl.automl_.runhistory_.data))
# Kept available for interactive inspection of that run's recorded result.
run_value = automl.automl_.runhistory_.data[run_key]
config = automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)

print(f"Execution time: {execution_time} seconds")


  if is_sparse(X[column]):
Fitting to the training data:   0%|[32m          [0m| 0/120 [00:00<?, ?it/s, The total time budget for this task is 0:02:00]

MLPP INITTT


Fitting to the training data:   2%|[32m▏         [0m| 2/120 [00:02<01:58,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:   2%|[32m▎         [0m| 3/120 [00:03<01:57,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:50<00:00,  1.09it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'mlp',
  'classifier:mlp:activation': 'relu',
  'classifier:mlp:alpha': 0.0001,
  'classifier:mlp:batch_size': 'auto',
  'classifier:mlp:beta_1': 0.9,
  'classifier:mlp:beta_2': 0.999,
  'classifier:mlp:early_stopping': 'valid',
  'classifier:mlp:epsilon': 1e-08,
  'classifier:mlp:hidden_layer_depth': 5,
  'classifier:mlp:learning_rate_init': 0.001,
  'classifier:mlp:n_iter_no_change': 32,
  'classifier:mlp:num_nodes_per_layer': 256,
  'classifier:mlp:shuffle': 'True',
  'classifier:mlp:solver': 'adam',
  'classifier:mlp:tol': 0.0001,
  'classifier:mlp:validation_fraction': 0.1,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'dat

## View the models found by auto-sklearn



In [4]:
# Leave the leaderboard as the cell's last expression so Jupyter renders the
# returned table with its rich display instead of plain printed text.
automl.leaderboard()

          rank  ensemble_weight type      cost  duration
model_id                                                
2            1              0.8  mlp  0.157576  1.999788
3            2              0.2  mlp  0.169697  2.131531


In [6]:
# Pretty-print the per-run search results stored on the fitted estimator; the
# output includes budgets, fit times, test scores, and the sampled parameters.
pprint(automl.cv_results_)

{'budgets': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'mean_fit_time': array([1.99978781, 2.13153148, 0.84018517, 1.12973261, 0.8757143 ,
       1.08627224, 1.09673429, 1.13344479]),
 'mean_test_score': array([0.84242424, 0.83030303, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ]),
 'model_ids': [2, 3],
 'param_balancing:strategy': masked_array(data=['none', 'weighting', 'none', 'weighting', 'none',
                   'weighting', 'weighting', 'weighting'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='N/A',
            dtype='<U9'),
 'param_classifier:__choice__': masked_array(data=['mlp', 'mlp', 'mlp', 'mlp', 'mlp', 'mlp', 'mlp', 'mlp'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='N/A',
            dtype='<U3'),
 'param_classifier:mlp:activation': masked_array(data=['relu', 'relu', 'relu', 'tanh', 'relu', 'tanh', 'relu',
                   'relu'],
    

## Print the final ensemble constructed by auto-sklearn



In [7]:
# Pretty-print the dictionary returned by show_models(), keyed by model id;
# each entry lists the pipeline steps plus rank, cost, and ensemble weight.
pprint(automl.show_models(), indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fa9e8279eb0>,
           'cost': 0.1575757575757576,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fa9e83b6fd0>,
           'ensemble_weight': 0.8,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fa9e8279af0>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': MLPClassifier(beta_1=0.999, beta_2=0.9, early_stopping=True,
              hidden_layer_sizes=(64, 64, 64), max_iter=32, n_iter_no_change=32,
              random_state=1, verbose=0, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fa9e83898b0>,
        

## Get the Score of the final ensemble



In [8]:
# Time only the prediction step on the held-out split.
start_time = timeit.default_timer()
predictions = automl.predict(X_test)
end_time = timeit.default_timer()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

# Report each metric for the held-out labels against the ensemble's predictions.
for label, metric in (
    ("Accuracy score:", sklearn.metrics.accuracy_score),
    ("Precision", sklearn.metrics.precision_score),
    ("Recall", sklearn.metrics.recall_score),
):
    print(label, metric(y_test, predictions))

Execution time: 0.06419034300051862 seconds
Accuracy score: 0.76
Precision 0.7209302325581395
Recall 0.6326530612244898


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
