
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [2]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn

## Data Loading



In [3]:

# Data set and label column for the auto-sklearn run.
# To run on the Titanic data instead, point `file_path` at
# '../data/titanic_dirty_data.csv' and set `target_column` to 'Survived'.
file_path = '../data/airbnb.csv'
target_column = 'Rating'

df = pd.read_csv(file_path)
print(len(df))  # row count before rows with a missing label are dropped

# Rows whose label is missing are dropped before features and label are separated.
df = df.dropna(subset=[target_column])
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


23202


## Build and fit a classifier



In [4]:
# Restrict the search to gradient boosting, with a total budget of 120
# (two minutes, per the progress bar) and a limit of 10 per individual run.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=10,
    include={"classifier": ["gradient_boosting"]},
    tmp_folder="tmp/autosklearn_classification_example_tmp5",
)
automl.fit(X_train, y_train)

# Print the configuration of the first entry in the run history.
# Note: this is the first run recorded, not necessarily the best-scoring one.
runhistory = automl.automl_.runhistory_
run_key = list(runhistory.data.keys())[0]
run_value = runhistory.data[run_key]
config = runhistory.ids_config[run_key.config_id]
print(config)

  if is_sparse(X[column]):
Fitting to the training data:  15%|[32m█▌        [0m| 18/120 [00:18<01:42,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  18%|[32m█▊        [0m| 21/120 [00:21<01:39,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  18%|[32m█▊        [0m| 22/120 [00:22<01:38,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  28%|[32m██▊       [0m| 33/120 [00:33<01:27,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  30%|[32m███       [0m| 36/120 [00:36<01:24,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  31%|[32m███       [0m| 37/120 [00:37<01:23,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  32%|[32m███▏      [0m| 38/120 [00:38<01:22,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:50<00:00,  1.09it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'gradient_boosting',
  'classifier:gradient_boosting:early_stop': 'off',
  'classifier:gradient_boosting:l2_regularization': 1e-10,
  'classifier:gradient_boosting:learning_rate': 0.1,
  'classifier:gradient_boosting:loss': 'auto',
  'classifier:gradient_boosting:max_bins': 255,
  'classifier:gradient_boosting:max_depth': 'None',
  'classifier:gradient_boosting:max_leaf_nodes': 31,
  'classifier:gradient_boosting:min_samples_leaf': 20,
  'classifier:gradient_boosting:scoring': 'loss',
  'classifier:gradient_boosting:tol': 1e-07,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'data_preprocessor:feature_type:text_transformer:text_e

## View the models found by auto-sklearn



In [5]:
# Tabulate the models auto-sklearn reports on its leaderboard.
leaderboard = automl.leaderboard()
print(leaderboard)

          rank  ensemble_weight               type      cost  duration
model_id                                                              
12           1             0.84  gradient_boosting  0.300849  7.471197
18           2             0.16  gradient_boosting  0.304603  9.523919


## Print the final ensemble constructed by auto-sklearn



In [6]:
# Pretty-print the per-model details returned by show_models().
ensemble_models = automl.show_models()
pprint(ensemble_models, indent=4)

{   12: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f354c038b50>,
            'cost': 0.3008488410055501,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f35473235e0>,
            'ensemble_weight': 0.84,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f354c0385e0>,
            'model_id': 12,
            'rank': 1,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=3.609412172481434e-10,
                               learning_rate=0.05972079854295879, max_iter=512,
                               max_leaf_nodes=4, min_samples_leaf=2,
                               n_iter_no_change=14, random_state=1,
                               validation_fracti

## Get the score of the final ensemble



In [7]:
predictions = automl.predict(X_test)

# Show which labels occur in the predictions and in the held-out targets.
print(set(predictions))
print(set(y_test))

# Precision and recall are computed with 'Y' as the positive class.
automl_accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
automl_precision = sklearn.metrics.precision_score(y_test, predictions, pos_label='Y')
automl_recall = sklearn.metrics.recall_score(y_test, predictions, pos_label='Y')

print("Accuracy score:", automl_accuracy)
print("Precision", automl_precision)
print("Recall", automl_recall)


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


{'Y', 'N'}
{'Y', 'N'}
Accuracy score: 0.6901529842706313
Precision 0.7097013083189336
Recall 0.9164807140580172


In [8]:

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# Data set and label column for the random-forest model.
# For the Titanic variant use '../data/clean_titanic_data_rf.csv' with
# `target_column = 'Survived'`.
file_path = '../data/clean_airbnb_data_gb.csv'
target_column = 'Rating'

# NOTE: this cell rebinds df, X, y and the train/test splits that the
# auto-sklearn cells earlier in the notebook created from '../data/airbnb.csv'.
df = pd.read_csv(file_path)

# Rows whose label is missing are dropped before features and label are separated.
df = df.dropna(subset=[target_column])
y = df[target_column]
X = df.drop(columns=[target_column])

# Stratified split using train_test_split's default test size. The earlier
# auto-sklearn split used test_size=0.2 without stratification, so the two
# models are not evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Positional placeholder labels ("feature 0", "feature 1", ...), one per column of X.
# NOTE(review): X.columns would give the real column names - confirm which
# labels any later plots should use.
feature_names = [f"feature {i}" for i in range(X.shape[1])]

forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

# One importance value per feature, as reported by the fitted forest.
importances = forest.feature_importances_
print(len(importances))

# Column indices ordered from the largest importance to the smallest.
sorted_indices = np.argsort(importances)[::-1]
print(sorted_indices)


  array.dtypes.apply(is_sparse).any()):


138
[  6   5   7   4   3   2   1   0  33   8  35  29  12  13  30  31  15  22
  32  34  10  24  16  19  27  20  25  37  36  21  11  23  26  17   9  28
  18  14 116  84 102 135 136 130  72  74 137 129 118  87 132  73  64 133
  94 126 122  90  75  43 131 125 107 134 128 110 117 121  93  78  97 113
  58  55  98 127  68  57  59 100 111 123  40  54 120 124 109 106  86  99
  85  82  38 108  77  92  91 119 112  51  88  47  39  62  81  48 115  66
  53 101 114  61 103 105  49  45  44  96 104  50  41  52  69  63  56  42
  80  70  95  79  60  46  71  89  83  76  67  65]


In [11]:
rf_predictions = forest.predict(X_test)

# Show which labels occur in the predictions and in the held-out targets.
print(set(rf_predictions))
print(set(y_test))

# Precision and recall use scikit-learn's default positive label here.
rf_accuracy = sklearn.metrics.accuracy_score(y_test, rf_predictions)
rf_precision = sklearn.metrics.precision_score(y_test, rf_predictions)
rf_recall = sklearn.metrics.recall_score(y_test, rf_predictions)

print("Accuracy score:", rf_accuracy)
print("Precision", rf_precision)
print("Recall", rf_recall)

{0.0, 1.0}
{0.0, 1.0}
Accuracy score: 0.7239819004524887
Precision 0.7612273361227336
Recall 0.8652504755865568


  array.dtypes.apply(is_sparse).any()):
