
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [1]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn

## Data Loading



In [2]:

# file_path = '../data/titanic_dirty_data.csv'
file_path = '../data/credit.csv'


df = pd.read_csv(file_path)
print(len(df))

# df = df.dropna(subset=['Survived'])
# y = df['Survived']
# X = df.drop('Survived', axis=1)

df = df.dropna(subset=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']
X = df.drop('SeriousDlqin2yrs', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


45000


## Build and fit a classifier



In [3]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=10,
    include = {
        'classifier': ["gradient_boosting"],
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp5",
)

automl.fit(X_train, y_train)
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)

  if is_sparse(X[column]):
Fitting to the training data:   0%|[32m          [0m| 0/120 [00:00<?, ?it/s, The total time budget for this task is 0:02:00]

Fitting to the training data:   6%|[32m▌         [0m| 7/120 [00:07<01:53,  1.00s/it, The total time budget for this task is 0:02:00]

	Models besides current dummy model: 0
	Dummy models: 1


Fitting to the training data:   7%|[32m▋         [0m| 8/120 [00:08<01:52,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  13%|[32m█▎        [0m| 16/120 [00:16<01:44,  1.00s/it, The total time budget for this task is 0:02:00]

	Models besides current dummy model: 0
	Dummy models: 1


Fitting to the training data:  22%|[32m██▎       [0m| 27/120 [00:27<01:33,  1.00s/it, The total time budget for this task is 0:02:00]

	Models besides current dummy model: 0
	Dummy models: 1


Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:50<00:00,  1.09it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'gradient_boosting',
  'classifier:gradient_boosting:early_stop': 'off',
  'classifier:gradient_boosting:l2_regularization': 1e-10,
  'classifier:gradient_boosting:learning_rate': 0.1,
  'classifier:gradient_boosting:loss': 'auto',
  'classifier:gradient_boosting:max_bins': 255,
  'classifier:gradient_boosting:max_depth': 'None',
  'classifier:gradient_boosting:max_leaf_nodes': 31,
  'classifier:gradient_boosting:min_samples_leaf': 20,
  'classifier:gradient_boosting:scoring': 'loss',
  'classifier:gradient_boosting:tol': 1e-07,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'feature_preprocessor:__choice__': 'no_preprocessing',
})



## View the models found by auto-sklearn



In [4]:
print(automl.leaderboard())

          rank  ensemble_weight               type      cost  duration
model_id                                                              
22           1             0.32  gradient_boosting  0.063468  4.675313
6            2             0.08  gradient_boosting  0.064141  7.537183
16           3             0.04  gradient_boosting  0.064899  4.240039
11           4             0.28  gradient_boosting  0.064983  3.739567
15           5             0.28  gradient_boosting  0.065657  9.356932


## Print the final ensemble constructed by auto-sklearn



In [5]:
pprint(automl.show_models(), indent=4)

{   6: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f57a243e130>,
           'cost': 0.06414141414141417,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f57a21eebb0>,
           'ensemble_weight': 0.08,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f57a21eef70>,
           'model_id': 6,
           'rank': 2,
           'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=1.7108930238344161e-10,
                               learning_rate=0.010827728124541558, max_iter=512,
                               max_leaf_nodes=25, min_samples_leaf=4,
                               n_iter_no_change=19, random_state=1,
                               validation_fraction=0.1

## Get the Score of the final ensemble



In [7]:
predictions = automl.predict(X_test)
print(set(predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print("Precision", sklearn.metrics.precision_score(y_test, predictions))
print("Recall", sklearn.metrics.recall_score(y_test, predictions))


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


{0, 1}
{0, 1}
Accuracy score: 0.9368888888888889
Precision 0.5462962962962963
Recall 0.20068027210884354


In [8]:

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# file_path = '../data/clean_titanic_data_rf.csv'
file_path = '../data/clean_credit_data_gb.csv'


df = pd.read_csv(file_path)

# df = df.dropna(subset=['Survived'])
# y = df['Survived']
# X = df.drop('Survived', axis=1)

df = df.dropna(subset=['Rating'])
y = df['Rating']
X = df.drop('Rating', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

importances = forest.feature_importances_
print(len(importances))
sorted_indices = np.argsort(importances)[::-1]
print(sorted_indices)


  array.dtypes.apply(is_sparse).any()):


10
[0 3 4 1 6 5 2 8 9 7]


In [9]:
rf_predictions = forest.predict(X_test)
print(set(rf_predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, rf_predictions))
print("Precision", sklearn.metrics.precision_score(y_test, rf_predictions))
print("Recall", sklearn.metrics.recall_score(y_test, rf_predictions))

{0.0, 1.0}
{0.0, 1.0}
Accuracy score: 0.9327777777777778
Precision 0.48663101604278075
Recall 0.15166666666666667


  array.dtypes.apply(is_sparse).any()):
