
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [1]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn

## Data Loading



In [2]:

# file_path = '../data/titanic_dirty_data.csv'
file_path = '../data/airbnb.csv'


df = pd.read_csv(file_path)

# df = df.dropna(subset=['Survived'])
# y = df['Survived']
# X = df.drop('Survived', axis=1)

df = df.dropna(subset=['Rating'])
y = df['Rating']
X = df.drop('Rating', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Build and fit a classifier



In [3]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=10,
    include = {
        'classifier': ["random_forest"],
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp5",
)

automl.fit(X_train, y_train)
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)

  if is_sparse(X[column]):


RANDOM FOREST INTI


Fitting to the training data:  13%|[32m█▎        [0m| 16/120 [00:16<01:44,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  15%|[32m█▌        [0m| 18/120 [00:18<01:42,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data:  24%|[32m██▍       [0m| 29/120 [00:29<01:31,  1.00s/it, The total time budget for this task is 0:02:00]



Fitting to the training data: 100%|[32m██████████[0m| 120/120 [01:51<00:00,  1.08it/s, The total time budget for this task is 0:02:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'True',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.5,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 1,
  'classifier:random_forest:min_samples_split': 2,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'data_preprocessor:feature_type:text_transformer:text_encoding:tfidf_encoding:analyzer': 'char',
  'data_prepr

## View the models found by auto-sklearn



In [4]:
print(automl.leaderboard())

          rank  ensemble_weight           type      cost  duration
model_id                                                          
9            1             0.92  random_forest  0.318479  6.814975
5            2             0.08  random_forest  0.326314  5.641768


## Print the final ensemble constructed by auto-sklearn



In [5]:
pprint(automl.show_models(), indent=4)

{   5: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7efc67f65d60>,
           'cost': 0.3263140711720536,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7efc67f102b0>,
           'ensemble_weight': 0.08,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7efc67f12100>,
           'model_id': 5,
           'rank': 2,
           'sklearn_classifier': RandomForestClassifier(max_features=1, min_samples_leaf=2, min_samples_split=20,
                       n_estimators=512, n_jobs=1, random_state=1,
                       warm_start=True)},
    9: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7efc67f7ba00>,
           'cost': 0.3184786157362063,
  

## Get the Score of the final ensemble



In [6]:
predictions = automl.predict(X_test)
print(set(predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print("Precision", sklearn.metrics.precision_score(y_test, predictions,pos_label='Y'))
print("Recall", sklearn.metrics.recall_score(y_test, predictions,pos_label='Y'))


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


{'Y', 'N'}
{'Y', 'N'}
Accuracy score: 0.6726998491704375
Precision 0.6819613135402609
Recall 0.9665285304430985


In [7]:

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# file_path = '../data/clean_titanic_data_rf.csv'
file_path = '../data/clean_airbnb_data_rf.csv'


df = pd.read_csv(file_path)

# df = df.dropna(subset=['Survived'])
# y = df['Survived']
# X = df.drop('Survived', axis=1)

df = df.dropna(subset=['Rating'])
y = df['Rating']
X = df.drop('Rating', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

importances = forest.feature_importances_
print(len(importances))
sorted_indices = np.argsort(importances)[::-1]  
print(sorted_indices)


  array.dtypes.apply(is_sparse).any()):


138
[  6   7   5   4   3   2   1   0  33   8  29  35  12  15  13  31  30  32
  22  34  24  37  25  36  27  10  26  20   9  19  16  23  17  21  11  28
  18  14 137  72 120  84 102 131  74 116  87 129 127  64 117  75 132 118
 136 135 122 133  93 130  73  78 125 128  90 134  94 107 121  98  97  40
 124 108 110 126  58  85  43 100  59  55  86  54 109  57  99  38 106 111
 113  82 119  88 123  68  77  92  91  81  47  51  39  62  49  48  66 112
  45  61  53 114 101 115 104  63 105 103  56  52  69  42  46  50  95  80
  96  70  41  89  60  79  44  83  71  65  76  67]


In [8]:
rf_predictions = forest.predict(X_test)
print(set(rf_predictions))
print(set(y_test))
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, rf_predictions))
print("Precision", sklearn.metrics.precision_score(y_test, rf_predictions))
print("Recall", sklearn.metrics.recall_score(y_test, rf_predictions))

{0.0, 1.0}
{0.0, 1.0}
Accuracy score: 0.7231200172376643
Precision 0.7592233009708738
Recall 0.8677869372225745


  array.dtypes.apply(is_sparse).any()):
