
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [6]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import timeit
import pandas as pd
from sklearn.model_selection import train_test_split

## Data Loading



In [7]:

file_path = '../data/titanic_dirty_data.csv'
df = pd.read_csv(file_path)
df = df.dropna(subset=['Survived'])
y = df['Survived']
X = df.drop('Survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Build and fit a classifier



In [21]:
start_time = timeit.default_timer()

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=300,
    # include = {
    #     'classifier': ["mlp"],
    #     'feature_preprocessor': ["no_preprocessing"]
    # },
    delete_tmp_folder_after_terminate=True,
    tmp_folder="tmp/autosklearn_classification_example_tmp2",
)

automl.fit(X_train, y_train, dataset_name="titanic")
## get configuration for a model/run
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)


end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")


  if is_sparse(X[column]):
Fitting to the training data:   1%|[32m          [0m| 3/300 [00:03<05:02,  1.02s/it, The total time budget for this task is 0:05:00]



Fitting to the training data: 100%|[32m██████████[0m| 300/300 [04:56<00:00,  1.01it/s, The total time budget for this task is 0:05:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'True',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.5,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 1,
  'classifier:random_forest:min_samples_split': 2,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'data_preprocessor:feature_type:text_transformer:text_encoding:tfidf_encoding:analyzer': 'char',
  'data_prepr

## View the models found by auto-sklearn



In [22]:
print(automl.leaderboard())

          rank  ensemble_weight           type      cost  duration
model_id                                                          
19           1             0.06            mlp  0.127273  4.255093
2            2             0.02  random_forest  0.157576  4.423953
5            4             0.04  random_forest  0.157576  5.146343
11           3             0.14  liblinear_svc  0.157576  2.123326
7            6             0.04    extra_trees  0.163636  7.010848
10           5             0.04            mlp  0.163636  2.769067
38           7             0.04            mlp  0.175758  3.678078
13           8             0.06    extra_trees  0.187879  2.839729
18           9             0.08  random_forest  0.193939  4.384611
4           10             0.04            lda  0.200000  2.252302
23          12             0.06       adaboost  0.206061  3.809381
46          11             0.18       adaboost  0.206061  1.850987
35          14             0.10  random_forest  0.212121  3.79

## Print the final ensemble constructed by auto-sklearn



In [23]:
pprint(automl.show_models(), indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f873bb62130>,
           'cost': 0.1575757575757576,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f87578f7c70>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f873bb62040>,
           'model_id': 2,
           'rank': 2,
           'sklearn_classifier': RandomForestClassifier(max_features=10, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    4: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f873afcc700>,
           'cost': 0.19999999999999996,
           'data_preprocessor': <autosklearn.pipeline.components.

## Get the Score of the final ensemble



In [24]:
start_time = timeit.default_timer()

predictions = automl.predict(X_test)

end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print("Precision", sklearn.metrics.precision_score(y_test, predictions))
print("Recall", sklearn.metrics.recall_score(y_test, predictions))


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


Execution time: 0.9595692930015502 seconds
Accuracy score: 0.752
Precision 0.8
Recall 0.4897959183673469


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


In [25]:
pprint(automl.cv_results_)

{'budgets': [0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
             0.0,
          