
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [1]:
from pprint import pprint

import sklearn.datasets
import sklearn.metrics

import autosklearn.classification

## Data Loading



In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn
file_path_x = '/home/preethi/projects/CS8803-MDS-Data-Preprocessing-Transferability/autosklearn-pipeline/tr_file_labels.csv'
file_path_airbnb = '/home/preethi/projects/CS8803-MDS-Data-Preprocessing-Transferability/data/airbnb.csv'
df = pd.read_csv(file_path_x)
# df_y = pd.read_csv(file_path_airbnb)
df = df.dropna(subset=['Label'])
y = df['Label']

X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# y_train
df['Label'].isnull().sum()

0

In [5]:
from typing import Optional
from pprint import pprint

import autosklearn.classification
import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace

from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        """This preprocessors does not change the data"""
        # Some internal checks makes sure parameters are set
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "NoPreprocessing",
            "name": "NoPreprocessing",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        return ConfigurationSpace()  # Return an empty configuration as there is None


# Add NoPreprocessing component to auto-sklearn.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

## Build and fit a classifier



In [8]:
# from autosklearn.experimental.askl2 import AutoSklearn2Classifier

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    include = {
        'classifier': ["random_forest"],
        'feature_preprocessor': ["no_preprocessing"],
        'data_preprocessor': ["NoPreprocessing"]
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp6",
)

automl.fit(X_train, y_train, dataset_name="airbnb")
## get configuration for a model/run
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)



INITT
helloo


TypeError: '<' not supported between instances of 'float' and 'str'

In [10]:
y_train.isnull

<bound method Series.isnull of 4526       N
5965       Y
9783       Y
2245     NaN
3208     NaN
        ... 
11964      N
5191       N
5390       Y
860        Y
7270     NaN
Name: Label, Length: 9948, dtype: object>

## View the models found by auto-sklearn



In [4]:
print(automl.leaderboard())

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1             0.70  random_forest  0.277179  17.512100
3            2             0.08  random_forest  0.291707  20.362909
10           3             0.22  random_forest  0.317662  13.545451


## Print the final ensemble constructed by auto-sklearn



In [5]:
pprint(automl.show_models(), indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcd369b5580>,
           'cost': 0.277179236043095,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcd370bb430>,
           'ensemble_weight': 0.7000000000000001,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcd369b5940>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=11, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcd3708bb80>,
           'cost': 0.29170747633039507,
           'data_preprocessor'

## Get the Score of the final ensemble



In [6]:
predictions = automl.predict(X_test)
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))

[]
DFFFF
       0         1    2    3      4          5         6          7        8   \
0     2.0  2.000000  2.0  4.0    9.0  179.00000  32.75659 -117.11892  92116.0   
1     1.0  1.474143  1.0  2.0   27.0  129.00000  38.90513  -77.05231  20037.0   
2     1.0  1.000000  1.0  2.0  172.0  112.00000  37.58423 -122.35999  94134.0   
3     1.0  2.000000  5.0  6.0   69.0  100.00000  40.72976  -73.95189  11222.0   
4     1.0  1.000000  1.0  3.0  112.0  130.00000  40.67026  -73.94970  11225.0   
...   ...       ...  ...  ...    ...        ...       ...        ...      ...   
4636  1.0  1.000000  1.0  2.0  154.0   99.00000  40.76717  -73.95532  10021.0   
4637  1.0  2.000000  3.0  6.0   15.0   90.00000  38.93348  -77.03006  20010.0   
4638  1.0  2.000000  2.0  5.0   79.0  118.41095  32.74007 -117.12474  92104.0   
4639  1.0  1.000000  4.0  2.0   40.0   19.00000  29.97211  -90.06534  70116.0   
4640  2.0  3.000000  3.0  5.0   17.0  110.00000  33.73939  -84.38542  30312.0   

           9   ...