### Step 2 : AutoML

In this second section, we discuss on the utilisation of AutoMl tools, such as auto-sklearn.
If you are using colab or don't have auto-sklearn installed, you may need to run the following cell at first in order to install auto-sklearn. This will require you to restart the runtime (a prompt will invite you to).

Restarting the runtime will clear all your variables and imported libraries, so you will need to import them again.

In [4]:
# 1. uninstall all affected packages
!pip uninstall -y Cython scipy pyparsing scikit_learn imbalanced-learn mlxtend yellowbrick

[0m

In [1]:
# 2. install packages to be downgraded
!pip install Cython==0.29.36 scipy==1.9 pyparsing==2.4



In [2]:
# 3. install older scikit-learn disregarding its dependencies
!pip install scikit-learn==0.24.2 --no-build-isolation

Collecting scikit-learn==0.24.2
  Using cached scikit-learn-0.24.2.tar.gz (7.5 MB)
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-learn
  Building wheel for scikit-learn (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-learn: filename=scikit_learn-0.24.2-cp310-cp310-linux_x86_64.whl size=22231958 sha256=55b8706cd1ab64f6511b77a577db34968826a818a5feb51058f973dc59cb05c0
  Stored in directory: /root/.cache/pip/wheels/13/a4/68/4e78865652fa14db4a162b491e5138565f97646f9e1f2ab8cc
Successfully built scikit-learn
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.13.0

In [3]:
# 4. finally install auto-sklearn
!pip install auto-sklearn

Collecting auto-sklearn
  Downloading auto-sklearn-0.15.0.tar.gz (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting liac-arff (from auto-sklearn)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ConfigSpace<0.5,>=0.4.21 (from auto-sklearn)
  Downloading ConfigSpace-0.4.21-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pynisher<0.7,>=0.6.3 (from auto-sklearn)
  Downloading pynisher-0.6.4.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyrfr<0.9,>=0.8.1 (from auto-sklearn)
  Downloadi

In [6]:
# 5. then, try loading the package repeatedly until trash in its dependencies are clean
import autosklearn

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
X, y = fetch_california_housing(return_X_y = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import os, shutil
from sklearn.metrics import mean_squared_error, mean_absolute_error

automl = autosklearn.regression.AutoSklearnRegressor(
    include = {'regressor': ["libsvm_svr", "sgd"]},
    time_left_for_this_task=120,
    per_run_time_limit=30,
    tmp_folder='/tmp/california_housing_tmp',
)
automl.fit(X_train, y_train, dataset_name='California_Housing')

print(automl.leaderboard())

y_pred = automl.predict(X_test, y_test)
print("MSE = ", mean_squared_error(y_test, y_pred))
print("MRE = ", mean_absolute_error(y_test, y_pred))

          rank  ensemble_weight        type      cost   duration
model_id                                                        
2            1              1.0  libsvm_svr  0.276724  16.678274
MSE =  0.3761413445696991
MRE =  0.41119474886431245


In [9]:
from pprint import pprint
pprint(automl.show_models(), indent=4)

{   2: {   'cost': 0.2767239824457497,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7c03291b3af0>,
           'ensemble_weight': 1.0,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7c03291b1060>,
           'model_id': 2,
           'rank': 1,
           'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7c03291b1e70>,
           'sklearn_regressor': SVR(cache_size=1933.5260416666667, gamma=0.1, verbose=0)}}


**Question 8**

What are the evaluated models by autoML ?
Which model obtain the best performance ?
What are the parameters of the best model ?

In [18]:
# Use dir() to inspect available attributes and methods
dir(automl)


['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_n_features',
 '_estimator_type',
 '_get_automl_class',
 '_get_param_names',
 '_get_tags',
 '_leaderboard_columns',
 '_more_tags',
 '_n_jobs',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_resolve_ensemble_class',
 '_validate_data',
 'allow_string_features',
 'automl_',
 'build_automl',
 'cv_results_',
 'dask_client',
 'dataset_compression',
 'delete_tmp_folder_after_terminate',
 'disable_evaluator_output',
 'ensemble_class',
 'ensemble_kwargs',
 'ensemble_nbest',
 'ensemble_size',
 'exclude',
 'fANOVA_input_',
 'fit',
 'fit_ensemble',
 'fit_pipeline',
 'get_configu

In [19]:
# Access the AutoML leaderboard
leaderboard = automl.leaderboard()

# Check if the leaderboard is not empty
if leaderboard is not None:
    for rank, model_info in enumerate(leaderboard, start=1):
        print(f"Rank {rank} - Model Info:")
        pprint(model_info, indent=4)
        print("\n" + "="*50 + "\n")
else:
    print("No leaderboard information found.")


Rank 1 - Model Info:
'rank'


Rank 2 - Model Info:
'ensemble_weight'


Rank 3 - Model Info:
'type'


Rank 4 - Model Info:
'cost'


Rank 5 - Model Info:
'duration'




In [22]:
# Print information about the cross-validation results
pprint(automl.cv_results_, indent=4)

{   'budgets': [0.0, 0.0, 0.0, 0.0],
    'mean_fit_time': array([16.67827415, 30.03382158, 30.03564048, 26.03270388]),
    'mean_test_score': array([0.72327602, 0.        , 0.        , 0.        ]),
    'param_data_preprocessor:__choice__': masked_array(data=['feature_type', 'feature_type', 'feature_type',
                   'feature_type'],
             mask=[False, False, False, False],
       fill_value='N/A',
            dtype='<U12'),
    'param_data_preprocessor:feature_type:numerical_transformer:imputation:strategy': masked_array(data=['mean', 'median', 'mean', 'most_frequent'],
             mask=[False, False, False, False],
       fill_value='N/A',
            dtype='<U13'),
    'param_data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': masked_array(data=['standardize', 'minmax', 'none', 'minmax'],
             mask=[False, False, False, False],
       fill_value='N/A',
            dtype='<U11'),
    'param_data_preprocessor:feature_type:numerical_trans

**The top-performing model (Rank 1) is a support vector regression (SVR) with the following parameters:**


{
  
    'data_preprocessor:__choice__': 'feature_type',
    'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
    'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
    'feature_preprocessor:__choice__': 'no_preprocessing',
    'regressor:__choice__': 'libsvm_svr',
    'regressor:libsvm_svr:C': 1.0,
    'regressor:libsvm_svr:epsilon': 0.1,
    'regressor:libsvm_svr:gamma': 0.1,
    'regressor:libsvm_svr:kernel': 'rbf',
    'regressor:libsvm_svr:max_iter': -1,
    'regressor:libsvm_svr:shrinking': 'True',
    'regressor:libsvm_svr:tol': 0.001
}


**To code 2.2**

With the help of the previous code, use autoML for the classification task on MNIST, by limiting the exploration to KNN and Adaboost.

In [28]:
import autosklearn.classification
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score

# Load MNIST dataset
mnist = fetch_openml('mnist_784')
X, y = mnist.data.astype(float), mnist.target.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
# Create and fit the Auto-sklearn classifier with increased time limit
automl_classifier = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=300,  # Set a higher time limit for exploration (in seconds)
    per_run_time_limit=30,  # Set the time limit for each model
    ensemble_kwargs={'ensemble_size': 1},  # Disable ensembling for simplicity
    include={'classifier': ['k_nearest_neighbors', 'adaboost']},  # Limit to KNN and AdaBoost
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.8},
    seed=43,  # Change the seed value
    n_jobs=-1,  # Use all available CPU cores
    memory_limit=2048,  # Increase memory limit to 2GB per model
    delete_tmp_folder_after_terminate=False  # Keep the output folder
)


In [60]:
# Fit the Auto-sklearn classifier
automl_classifier.fit(X_train, y_train)

# Get the predictions on the test set
y_pred = automl_classifier.predict(X_test,y_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("MSE = ", mean_squared_error(y_test, y_pred))
print("MRE = ", mean_absolute_error(y_test, y_pred))
print(f'Accuracy: {accuracy}')

MSE =  28.452357142857142
MRE =  4.481214285714286
Accuracy: 0.09592857142857143


**Question 9**

What are the evaluated models by autoML ?
Which model obtain the best performance ?
What are the parameters of the best model ?

In [61]:
models_with_weights = automl.get_models_with_weights()
pprint(models_with_weights, indent=4)

[   (   1.0,
        SimpleRegressionPipeline({'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'no_preprocessing', 'regressor:__choice__': 'libsvm_svr', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize', 'regressor:libsvm_svr:C': 1.0, 'regressor:libsvm_svr:epsilon': 0.1, 'regressor:libsvm_svr:kernel': 'rbf', 'regressor:libsvm_svr:max_iter': -1, 'regressor:libsvm_svr:shrinking': 'True', 'regressor:libsvm_svr:tol': 0.001, 'regressor:libsvm_svr:gamma': 0.1},
dataset_properties={
  'task': 4,
  'sparse': False,
  'multioutput': False,
  'target_type': 'regression',
  'signed': False}))]


[   

    (   
        1.0,
        SimpleRegressionPipeline({
            'data_preprocessor:__choice__': 'feature_type',
            'feature_preprocessor:__choice__': 'no_preprocessing',
            'regressor:__choice__': 'libsvm_svr',
            'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
            'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
            'regressor:libsvm_svr:C': 1.0,
            'regressor:libsvm_svr:epsilon': 0.1,
            'regressor:libsvm_svr:kernel': 'rbf',
            'regressor:libsvm_svr:max_iter': -1,
            'regressor:libsvm_svr:shrinking': 'True',
            'regressor:libsvm_svr:tol': 0.001,
            'regressor:libsvm_svr:gamma': 0.1
        },
        dataset_properties={
            'task': 4,
            'sparse': False,
            'multioutput': False,
            'target_type': 'regression',
            'signed': False
        })
    )
]

This output indicates that the autoML framework has chosen a model with a weight of 1.0. The model is a **libsvm_svr** (Support Vector Regressor with a radial basis function kernel). The hyperparameters of this model include C, epsilon, kernel type, maximum iterations, shrinking heuristic, tolerance, and gamma.

### Bonus step

As a bonus step, have fun and remove a maximum of constraints of your autoML model. Which model obtain the best performances ? Describe the parameters of this model. You can do it for either for regression or classification or both.

In [62]:
# Create and fit the Auto-sklearn classifier with relaxed constraints
automl_classifier_relaxed = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=600,  # Increase time limit for exploration
    per_run_time_limit=60,  # Set the time limit for each model
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.8},
    seed=42,
    n_jobs=-1,  # Use all available CPU cores
    memory_limit=4096,  # Increase memory limit to 4GB per model
    delete_tmp_folder_after_terminate=False  # Keep the output folder
)

# Fit the Auto-sklearn classifier with relaxed constraints
automl_classifier_relaxed.fit(X_train, y_train)

print(automl_classifier_relaxed.leaderboard())

y_pred = automl_classifier_relaxed.predict(X_test, y_test)
print("MSE = ", mean_squared_error(y_test, y_pred))
print("MRE = ", mean_absolute_error(y_test, y_pred))


          rank  ensemble_weight  type  cost duration
model_id                                            
1            1              1.0  <NA>  <NA>     <NA>
MSE =  28.452357142857142
MRE =  4.481214285714286


In [63]:
from autosklearn.regression import AutoSklearnRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Create and fit the Auto-sklearn regressor with relaxed constraints
automl_regressor_relaxed = AutoSklearnRegressor(
    time_left_for_this_task=600,  # Increase time limit for exploration
    per_run_time_limit=60,  # Set the time limit for each model
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.8},
    seed=42,
    n_jobs=-1,  # Use all available CPU cores
    memory_limit=4096,  # Increase memory limit to 4GB per model
    delete_tmp_folder_after_terminate=False  # Keep the output folder
)

# Fit the Auto-sklearn regressor with relaxed constraints
automl_regressor_relaxed.fit(X_train, y_train)

# Display the leaderboard
print(automl_regressor_relaxed.show_models())

# Make predictions on the test set
y_pred = automl_regressor_relaxed.predict(X_test)

# Evaluate performance metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')


{22: {'model_id': 22, 'rank': 1, 'cost': 0.19687620045755239, 'ensemble_weight': 1.0, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7c02f52cbfd0>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7c02f52c2bc0>, 'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7c02f52c0c40>, 'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=2.208787572338781e-05,
                              learning_rate=0.036087332404571744, max_iter=512,
                              max_leaf_nodes=64, min_samples_leaf=3,
                              n_iter_no_change=18, random_state=42,
                              validation_fraction=None, warm_start=True)}}
Mean Squared Error (MSE): 1.6634154478249343
Mean Absolute Error (MAE): 0.8322434691619128
