
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [9]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import timeit

## Data Loading



In [4]:

import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn
file_path = '../data/airbnb.csv'

df = pd.read_csv(file_path)
df = df.sample(n = 1000, random_state=16)
print("Original df len", len(df))
print("Checking if y has nulls", df["Rating"].isnull().sum())
# df = df.dropna(subset=['Rating'])
# print("Drop empty rating cols df len", len(df))
y = df['Rating']
print("Confirming y doesnt have nulls", y.isnull().sum())
X = df.drop('Rating', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.isnull().sum()
print("xlen", (X_train.shape[0]))
print("ylen", len(y_train))

Original df len 1000
Checking if y has nulls 0
Confirming y doesnt have nulls 0
xlen 800
ylen 800


In [11]:
df['Rating'].isnull().sum()
# y_train
# df

0

## Build and fit a classifier



In [12]:
start_time = timeit.default_timer()

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    include = {
        'classifier': ["random_forest"],
        'feature_preprocessor': ["no_preprocessing"],
    },
    tmp_folder="tmp/autosklearn_classification_example_tmp4",
)

automl.fit(X_train, y_train, dataset_name="airbnb")
## get configuration for a model/run
run_key = list(automl.automl_.runhistory_.data.keys())[0]
run_value = automl.automl_.runhistory_.data[run_key]
config=automl.automl_.runhistory_.ids_config[run_key.config_id]
print(config)


end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

INITT
helloo


In [7]:
import pandas as pd
print("xlen", len(X_train))
print("ylen", len(y_train))
file_path_X_train = "/home/preethi/projects/CS8803-MDS-Data-Preprocessing-Transferability/autosklearn-pipeline/tr_file.csv"
file_path_X_train_w_labels = "/home/preethi/projects/CS8803-MDS-Data-Preprocessing-Transferability/autosklearn-pipeline/tr_file_labels.csv"
X_train_without_labels = pd.read_csv(file_path_X_train)
print('len xtrain cleaned csv', X_train_without_labels.shape[0])
# X_train_without_labels['Label'] = y_train 
# print(y_train.isnull().sum())
# X_train_without_labels.to_csv(file_path_X_train_w_labels, index=False)

xlen 800
ylen 800
len xtrain cleaned csv 536


In [31]:
file_path_X_train['1'].isnull().sum()

TypeError: string indices must be integers

## View the models found by auto-sklearn



In [13]:
print(automl.leaderboard())

          rank  ensemble_weight           type      cost   duration
model_id                                                           
2            1             0.82  random_forest  0.277016  23.999457
3            2             0.18  random_forest  0.289259  27.258831


## Print the final ensemble constructed by auto-sklearn



In [14]:
pprint(automl.show_models(), indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f9652f29df0>,
           'cost': 0.2770159973881815,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f9652f32640>,
           'ensemble_weight': 0.82,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f9652f29730>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=11, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f9652eacd60>,
           'cost': 0.2892588965066928,
           'data_preprocessor': <autosklearn

## Get the Score of the final ensemble



In [15]:
start_time = timeit.default_timer()

predictions = automl.predict(X_test)

end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))

[]
DFFFF
       0         1    2    3      4          5         6          7        8   \
0     2.0  2.000000  2.0  4.0    9.0  179.00000  32.75659 -117.11892  92116.0   
1     1.0  1.474143  1.0  2.0   27.0  129.00000  38.90513  -77.05231  20037.0   
2     1.0  1.000000  1.0  2.0  172.0  112.00000  37.58423 -122.35999  94134.0   
3     1.0  2.000000  5.0  6.0   69.0  100.00000  40.72976  -73.95189  11222.0   
4     1.0  1.000000  1.0  3.0  112.0  130.00000  40.67026  -73.94970  11225.0   
...   ...       ...  ...  ...    ...        ...       ...        ...      ...   
4636  1.0  1.000000  1.0  2.0  154.0   99.00000  40.76717  -73.95532  10021.0   
4637  1.0  2.000000  3.0  6.0   15.0   90.00000  38.93348  -77.03006  20010.0   
4638  1.0  2.000000  2.0  5.0   79.0  118.41095  32.74007 -117.12474  92104.0   
4639  1.0  1.000000  4.0  2.0   40.0   19.00000  29.97211  -90.06534  70116.0   
4640  2.0  3.000000  3.0  5.0   17.0  110.00000  33.73939  -84.38542  30312.0   

           9   ...