
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [8]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn

## Data Loading



In [9]:

file_path = '../data/titanic_dirty_data.csv'
# file_path = '../data/airbnb.csv'


df = pd.read_csv(file_path)
print(len(df))
df = df.dropna(subset=['Survived'])
y = df['Survived']
X = df.drop('Survived', axis=1)

# df = df.dropna(subset=['Rating'])
# y = df['Rating']
# X = df.drop('Rating', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


624


## Build and fit a classifier



In [3]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=10,
    include = {
        'classifier': ["mlp"],
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp5",
)

automl.fit(X_train, y_train)
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)

  if is_sparse(X[column]):
Fitting to the training data:   0%|[32m          [0m| 0/120 [00:00<?, ?it/s, The total time budget for this task is 0:02:00]

MLPP INITTT


Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:51<00:00,  1.07it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'mlp',
  'classifier:mlp:activation': 'relu',
  'classifier:mlp:alpha': 0.0001,
  'classifier:mlp:batch_size': 'auto',
  'classifier:mlp:beta_1': 0.9,
  'classifier:mlp:beta_2': 0.999,
  'classifier:mlp:early_stopping': 'valid',
  'classifier:mlp:epsilon': 1e-08,
  'classifier:mlp:hidden_layer_depth': 5,
  'classifier:mlp:learning_rate_init': 0.001,
  'classifier:mlp:n_iter_no_change': 32,
  'classifier:mlp:num_nodes_per_layer': 256,
  'classifier:mlp:shuffle': 'True',
  'classifier:mlp:solver': 'adam',
  'classifier:mlp:tol': 0.0001,
  'classifier:mlp:validation_fraction': 0.1,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'dat

## View the models found by auto-sklearn



In [4]:
print(automl.leaderboard())

          rank  ensemble_weight type      cost  duration
model_id                                                
4            1             0.12  mlp  0.115152  0.947709
48           2             0.04  mlp  0.121212  1.002826
37           3             0.08  mlp  0.145455  1.086634
57           4             0.04  mlp  0.151515  0.898793
13           5             0.06  mlp  0.157576  0.754961
14           6             0.02  mlp  0.169697  1.530150
20           7             0.04  mlp  0.169697  0.999433
9            8             0.08  mlp  0.175758  1.377708
44          10             0.08  mlp  0.175758  1.068163
62           9             0.02  mlp  0.175758  1.146705
28          11             0.06  mlp  0.181818  1.001655
33          12             0.10  mlp  0.181818  1.751342
5           14             0.10  mlp  0.224242  0.692738
54          13             0.02  mlp  0.224242  1.528497
55          15             0.02  mlp  0.260606  1.816128
29          16             0.02

## Print the final ensemble constructed by auto-sklearn



In [5]:
pprint(automl.show_models(), indent=4)

{   4: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fa8d51f11c0>,
           'cost': 0.11515151515151512,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fa8d51e5b20>,
           'ensemble_weight': 0.12,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fa8d51f1d60>,
           'model_id': 4,
           'rank': 1,
           'sklearn_classifier': MLPClassifier(alpha=4.2841884333778574e-06, beta_1=0.999, beta_2=0.9,
              hidden_layer_sizes=(64, 64, 64),
              learning_rate_init=0.0011804284312897009, max_iter=128,
              n_iter_no_change=32, random_state=1, validation_fraction=0.0,
              verbose=0, warm_start=True)},
    5: {   'balancing': Balancing(random_state=1),
           'classi

## Get the Score of the final ensemble



In [6]:
predictions = automl.predict(X_test)
print(set(predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print("Precision", sklearn.metrics.precision_score(y_test, predictions))
print("Recall", sklearn.metrics.recall_score(y_test, predictions))


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


{0, 1}
{0, 1}
Accuracy score: 0.792
Precision 0.8709677419354839
Recall 0.5510204081632653


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


In [7]:

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

file_path = '../data/clean_titanic_data_rf.csv'
# file_path = '../data/clean_airbnb_data_gb.csv'


df = pd.read_csv(file_path)

# df = df.dropna(subset=['Survived'])
# y = df['Survived']
# X = df.drop('Survived', axis=1)

df = df.dropna(subset=['Rating'])
y = df['Rating']
X = df.drop('Rating', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

importances = forest.feature_importances_
print(len(importances))
sorted_indices = np.argsort(importances)[::-1]
print(sorted_indices)


106
[  6   7  14   5   9  74  12  91   1  13  81  76  10  96  93  79  57  51
  39  83  65  73  52  19  97  86  77  31  34  24  29  82   8  23  54  85
  27  64  11  87  44  25   2  59  20  90  88  56  41  99  80  48  21  15
  58 101  22  38  89  92  35  63  69  45  18  61  60   0  66  84  98  55
  37  50  47  36  40 100  70  53  68  17  43  32  30  49  71  46  94 102
  33  67  42  26  16 103  28  78  75 105  72  62  95 104   3   4]


  array.dtypes.apply(is_sparse).any()):


In [10]:
rf_predictions = forest.predict(X_test)
print(set(rf_predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, rf_predictions))
print("Precision", sklearn.metrics.precision_score(y_test, rf_predictions))
print("Recall", sklearn.metrics.recall_score(y_test, rf_predictions))

{0.0, 1.0}
{0.0, 1.0}
Accuracy score: 0.848
Precision 0.7906976744186046
Recall 0.7727272727272727


  array.dtypes.apply(is_sparse).any()):
