# AtoML and Hyperparameter Tuning

Task: Use automated machine learning tools or perform a manual model hyperparameter search to find suitable models and settings for your problem.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
import xarray as xr
import pandas as pd
import deep_snow.dataset
from sklearn.preprocessing import MinMaxScaler
import os
from tqdm import tqdm

In [2]:
df = pd.read_parquet('/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/classic_ml_val_v1.parquet')
scaler = MinMaxScaler()
cols_to_normalize = [col for col in df.columns if col != 'aso_sd']
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])

In [3]:
y = df.iloc[:,0].values
data = df.iloc[:, 1:].values
from sklearn.model_selection import train_test_split
# Split data into 80% train and 20% test subsets
print(f"There are {data.shape[0]} data samples")
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, shuffle=True)

There are 5763835 data samples


### Let's do some manual hyperparameter optimization on our Voting Regression model from our regression_analysis notebook

Using pycaret as seen in class would be interesting, but due to the lack of support past Python 3.10 (and given our environment is 3.12) and the size of our data, the computational resource constraints would be a bit grueling

In [4]:
# here is our baseline model found from the previous notebook
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import VotingRegressor

# Ridge Regressor
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)
ridge_prediction = ridge_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=ridge_prediction))

# Dummy Regressor (Baseline)
dummy_reg = DummyRegressor(strategy="median")
dummy_reg.fit(X_train, y_train)
dummy_prediction = dummy_reg.predict(X_test)
print("Dummy Regressor Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=dummy_prediction))

# Voting Regressor (Dummy and Ridge)
voting_reg = VotingRegressor(estimators=[('dummy', dummy_reg), ('ridge', ridge_reg)])
voting_reg.fit(X_train, y_train)
voting_prediction = voting_reg.predict(X_test)
print("Voting Regressor (Dummy + Ridge) Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=voting_prediction))

Ridge Mean Absolute Error: 0.4801591243341718
Dummy Regressor Mean Absolute Error: 0.46853125
Voting Regressor (Dummy + Ridge) Mean Absolute Error: 0.44280018300508045


In [None]:
# we're only going to optimize the hyperparameters on the voting regressor as explained in our regression analysis notebook
# the voting regressor model preformed much better than the elastic net and lasso regressor, the lone dummy regressor
# and the lone ridge regressor have closer mean absolute errors to the voting regressor, but since the voting regressor is the combination
# of the two, we'll *somewhat* in essence be optimizing the hyperparameters on both the ridge and dummy regressor as well

In [5]:
voting_reg.get_params()

{'estimators': [('dummy', DummyRegressor(strategy='median')),
  ('ridge', Ridge())],
 'n_jobs': None,
 'verbose': False,
 'weights': None,
 'dummy': DummyRegressor(strategy='median'),
 'ridge': Ridge(),
 'dummy__constant': None,
 'dummy__quantile': None,
 'dummy__strategy': 'median',
 'ridge__alpha': 1.0,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': None,
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.0001}

In [None]:
# let's start with a gridsearch
# ~4 minutes to run
from sklearn.model_selection import GridSearchCV
param_grid = {
    'ridge__alpha': [0.1, 1.0, 10.0],            # Regularization strength for Ridge
    'ridge__fit_intercept': [True, False],        # Whether to calculate the intercept for Ridge
    'dummy__strategy': ['mean', 'median'],        # Strategy for filling missing values in the Dummy Regressor
    'weights': [[1, 1], [0.5, 1.5], [2, 1]],      # Weights to apply to each regressor in the Voting Regressor
}

grid_search = GridSearchCV(voting_reg, param_grid, cv=5, verbose=3, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best Mean Absolute Error:", -grid_search.best_score_)

In [9]:
# collapsed the output of the above cell for notebook display purposes
print("Best parameters found:", grid_search.best_params_)
print("Best Mean Absolute Error:", -grid_search.best_score_)

Best parameters found: {'dummy__strategy': 'median', 'ridge__alpha': 0.1, 'ridge__fit_intercept': True, 'weights': [1, 1]}
Best Mean Absolute Error: 0.44285820530527464


In [None]:
# now let's do a random search
# ~7 minutes to run
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
param_distributions = {
    'ridge__alpha': uniform(0.1, 10),
    'ridge__fit_intercept': [True, False],
    'dummy__strategy': ['mean', 'median'],
    'weights': [[1, 1], [0.5, 1.5], [2, 1]],
}
random_search = RandomizedSearchCV(
    voting_reg,
    param_distributions=param_distributions,
    n_iter=100,
    random_state=0,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=3
)
random_search.fit(X_train, y_train)
print("Best parameters found:", random_search.best_params_)
print("Best Mean Absolute Error:", -random_search.best_score_)

In [None]:
# collapsed the output of the above cell for notebook display purposes
print("Best parameters found:", random_search.best_params_)
print("Best Mean Absolute Error:", -random_search.best_score_)

Best parameters found: {'dummy__strategy': 'median', 'ridge__alpha': 0.19356704856532617, 'ridge__fit_intercept': True, 'weights': [1, 1]}
Best Mean Absolute Error: 0.44285839048646203


In [None]:
# we see above that the grid search params provide an ever so slightly better mean absolute error than the random search params
# though this difference is neglegent. this is understandable as the only difference between the two is the ridge__alpha parameter
# which is still quite similar between the two

In [None]:
# for the sake of this assignment, we can safely say that this model is the best in terms of computational resources and time
# (and performs the best out of the models we tested)
# for the classic ml models we were interested in testing, their computational intake was far too inefficient for even running on
# a small subset of the small subset of the data of interest. doing any sort of hyperparameter optimization on those models
# trained and tested on the entire dataset would be infeasible and i can't even give an estimate on how many hours
# and given the point of this project is to improve upon an existing deep CNN, we thought that exploring faster algorithms would
# be more interesting (and realistic)