<a href="https://colab.research.google.com/github/Rishav-hub/Auto-sklearn/blob/main/03_Auto_Sklearn_for_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install auto-sklearn==0.14.7


In [1]:
# print autosklearn version
import autosklearn
print('autosklearn: %s' % autosklearn.__version__)

autosklearn: 0.14.7


In [2]:
import pandas as pd
import sklearn.metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
# autosklearn
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import (accuracy,
                                 f1,
                                 roc_auc,
                                 precision,
                                 average_precision,
                                 recall,
                                 log_loss)

## Load the dataset



In [16]:
df = pd.read_csv("https://raw.githubusercontent.com/Rishav-hub/Auto-sklearn/main/bank-additional-full.csv", sep= ";")

In [17]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Prepare the data
Auto-Sklearn requires us to identify is a column is numerical categorical either in the pandas dataframe or we can do it later in the fit function. Lets convert it now.

In [18]:
num_cols = ['duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
df[num_cols] = df[num_cols].apply(pd.to_numeric)
df[cat_cols] = df[cat_cols].apply(pd.Categorical)

In [19]:
y = df.pop('y')
X = df.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1, stratify=y)

In [7]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

## Instantiate the classifier



In [None]:
skf = StratifiedKFold(n_splits=5)
  
clf = AutoSklearnClassifier(time_left_for_this_task=600,
                            max_models_on_disc=5,
                            memory_limit = 10240,
                            resampling_strategy=skf,
                            ensemble_size = 3,
                            metric = average_precision,
                            scoring_functions=[roc_auc, average_precision, accuracy, f1, precision, recall, log_loss])

## Fit the classifier



In [9]:
clf.fit(X = X_train, y = y_train)

AutoSklearnClassifier(ensemble_size=3, max_models_on_disc=5, memory_limit=10240,
                      metric=average_precision, per_run_time_limit=60,
                      resampling_strategy=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                      scoring_functions=[roc_auc, average_precision, accuracy,
                                         f1, precision, recall, log_loss],
                      time_left_for_this_task=600)

In [None]:
df_cv_results = pd.DataFrame(clf.cv_results_).sort_values(by = 'mean_test_score', ascending = False)
df_cv_results

Unnamed: 0,mean_test_score,metric_roc_auc,metric_average_precision,metric_accuracy,metric_f1,metric_precision,metric_recall,metric_log_loss,mean_fit_time,params,...,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min,param_feature_preprocessor:fast_ica:n_components,param_feature_preprocessor:kernel_pca:coef0,param_feature_preprocessor:kernel_pca:degree,param_feature_preprocessor:kernel_pca:gamma,param_feature_preprocessor:nystroem_sampler:coef0,param_feature_preprocessor:nystroem_sampler:degree,param_feature_preprocessor:nystroem_sampler:gamma,param_feature_preprocessor:select_rates_classification:mode
7,0.628876,0.936279,0.628876,0.909803,0.500591,0.656462,0.420272,0.193774,50.189104,"{'balancing:strategy': 'weighting', 'classifie...",...,,,,,,,,,,
0,0.0,,,,,,,,60.013599,"{'balancing:strategy': 'none', 'classifier:__c...",...,,,,,,,,,,
1,0.0,,,,,,,,60.065836,"{'balancing:strategy': 'weighting', 'classifie...",...,,,,,,,,,,
2,0.0,,,,,,,,60.01557,"{'balancing:strategy': 'none', 'classifier:__c...",...,,,,,,,,,,
3,0.0,,,,,,,,60.012574,"{'balancing:strategy': 'weighting', 'classifie...",...,,,,,,,,,,
4,0.0,,,,,,,,60.024678,"{'balancing:strategy': 'none', 'classifier:__c...",...,0.768284,0.241008,,,,,,,,
5,0.0,,,,,,,,60.065937,"{'balancing:strategy': 'none', 'classifier:__c...",...,,,,,,,,,,
6,0.0,,,,,,,,60.050643,"{'balancing:strategy': 'weighting', 'classifie...",...,,,,,,,,,,
8,0.0,,,,,,,,60.01835,"{'balancing:strategy': 'none', 'classifier:__c...",...,0.747942,0.187013,18.0,,,,,,,
9,0.0,,,,,,,,48.020764,"{'balancing:strategy': 'none', 'classifier:__c...",...,,,,,,,,,,


In [11]:
clf.leaderboard(detailed = True, ensemble_only=False)

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,config_id,train_loss,seed,start_time,end_time,budget,status,data_preprocessors,feature_preprocessors,balancing_strategy,config_origin
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9,1,1.0,mlp,0.371124,49.493079,8,0.342127,0,1655049000.0,1655049000.0,0.0,StatusType.SUCCESS,[],[feature_agglomeration],weighting,Initial design


In [12]:
clf.get_models_with_weights()

[(1.0,
  SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'mlp', 'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'feature_agglomeration', 'classifier:mlp:activation': 'relu', 'classifier:mlp:alpha': 0.010532545646916008, 'classifier:mlp:batch_size': 'auto', 'classifier:mlp:beta_1': 0.9, 'classifier:mlp:beta_2': 0.999, 'classifier:mlp:early_stopping': 'train', 'classifier:mlp:epsilon': 1e-08, 'classifier:mlp:hidden_layer_depth': 1, 'classifier:mlp:learning_rate_init': 0.0003663690531776363, 'classifier:mlp:n_iter_no_change': 32, 'classifier:mlp:num_nodes_per_layer': 147, 'classifier:mlp:shuffle': 'True', 'classifier:mlp:solver': 'adam', 'classifier:mlp:tol': 0.0001, 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessor:feature_type:numer

In [14]:
# additional trainings statistics
clf.sprint_statistics()

'auto-sklearn results:\n  Dataset name: 5283f2d4-ea66-11ec-80dd-0242ac1c0002\n  Metric: average_precision\n  Best validation score: 0.628876\n  Number of target algorithm runs: 10\n  Number of successful target algorithm runs: 1\n  Number of crashed target algorithm runs: 0\n  Number of target algorithms that exceeded the time limit: 9\n  Number of target algorithms that exceeded the memory limit: 0\n'

## Refit with all the training data
we need to call the refit method to fit the models pipeline found during cross validation with all the training data

In [15]:
clf.refit(X = X_train, y = y_train)

AutoSklearnClassifier(ensemble_size=3, max_models_on_disc=5, memory_limit=10240,
                      metric=average_precision, per_run_time_limit=60,
                      resampling_strategy=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                      scoring_functions=[roc_auc, average_precision, accuracy,
                                         f1, precision, recall, log_loss],
                      time_left_for_this_task=600)

## Save Model

In [21]:
import joblib
joblib.dump(clf, 'model.joblib')

['model.joblib']

## Load Model and Predict

In [22]:
from sklearn.metrics import accuracy_score
model = joblib.load("model.joblib")
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Accuracy: %.3f" % acc)

Accuracy: 0.914
