# Model Optimization - Hyperparameter tuning

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Add the '../imports' directory to the sys.path list
import sys
sys.path.append('../imports')
from helper_functions import split_datetime
from data_preprocessing import merge_data, remove_col

## Import Data

In [None]:
# Load the raw training tables.
# No `parse_dates` is passed here, so any date handling happens later
# (presumably inside `merge_data` - TODO confirm).

DATA_DIR = "../data/"

train = pd.read_csv(f"{DATA_DIR}train.csv")
client_train = pd.read_csv(f"{DATA_DIR}client.csv")
historical_weather_train = pd.read_csv(f"{DATA_DIR}historical_weather.csv")
forecast_weather_train = pd.read_csv(f"{DATA_DIR}forecast_weather.csv")
electricity_prices_train = pd.read_csv(f"{DATA_DIR}electricity_prices.csv")
gas_prices_train = pd.read_csv(f"{DATA_DIR}gas_prices.csv")
weather_station_to_county_mapping = pd.read_csv(f"{DATA_DIR}weather_station_to_county_mapping.csv")

## Data Preprocessing

In [None]:
# Combine all raw tables into a single frame with the project helper `merge_data`.
# The tuple order is the positional argument order expected by the helper.
raw_tables = (
    train,
    client_train,
    historical_weather_train,
    forecast_weather_train,
    electricity_prices_train,
    gas_prices_train,
    weather_station_to_county_mapping,
)
merged_train_df = merge_data(*raw_tables)

In [None]:
# Drop all non needed columns (ids and timestamps)
# `remove_col` is imported from the project's data_preprocessing module; the exact
# list of columns it removes is not visible in this notebook - TODO confirm.
model_df = remove_col(merged_train_df)

## Feature Selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Work on a copy so the selection experiment cannot modify model_df.
feat_sel_df = model_df.copy()

# Separate the label from the features, then hold out 30 % of the rows.
y = feat_sel_df['target']
X = feat_sel_df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Sequential feature selection wrapped around an XGBoost regressor,
# scored by negated MAE and fitted on the training split only.
xgboost = XGBRegressor(enable_categorical=True)
sfs = SequentialFeatureSelector(estimator=xgboost, scoring='neg_mean_absolute_error')
sfs.fit(X_train, y_train)

# Boolean mask over the input columns: True where the selector kept the feature.
sfs.get_support()

array([ True,  True,  True,  True,  True,  True, False, False, False,
       False, False,  True,  True, False,  True, False,  True,  True,
       False, False, False, False,  True,  True,  True, False, False,
       False, False, False, False, False,  True, False,  True,  True,
       False,  True,  True,  True, False,  True])

In [None]:
# Names of all input columns the selector was fitted on (before any selection).
sfs.feature_names_in_

array(['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client', 'installed_capacity_client',
       'lowest_price_per_mwh_gas_prices',
       'highest_price_per_mwh_gas_prices',
       'euros_per_mwh_electricity_prices', 'temperature_hist_weather',
       'dewpoint_hist_weather', 'rain_hist_weather',
       'snowfall_hist_weather', 'surface_pressure_hist_weather',
       'cloudcover_total_hist_weather', 'cloudcover_low_hist_weather',
       'cloudcover_mid_hist_weather', 'cloudcover_high_hist_weather',
       'windspeed_10m_hist_weather', 'winddirection_10m_hist_weather',
       'shortwave_radiation_hist_weather',
       'direct_solar_radiation_hist_weather',
       'diffuse_radiation_hist_weather', 'temperature_forecast_weather',
       'dewpoint_forecast_weather', 'cloudcover_high_forecast_weather',
       'cloudcover_low_forecast_weather',
       'cloudcover_mid_forecast_weather',
       'cloudcover_total_forecast_weather',
       '10_metre_u_wind_compon

In [None]:
# Index the input names with the boolean support mask to list only the selected features.
sfs.feature_names_in_[sfs.get_support()]

array(['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client', 'installed_capacity_client',
       'rain_hist_weather', 'snowfall_hist_weather',
       'cloudcover_total_hist_weather', 'cloudcover_mid_hist_weather',
       'cloudcover_high_hist_weather', 'diffuse_radiation_hist_weather',
       'temperature_forecast_weather', 'dewpoint_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week'], dtype=object)

selected features:

['county', 'is_business', 'product_type', 'is_consumption',
       'eic_count_client', 'installed_capacity_client',
       'rain_hist_weather', 'snowfall_hist_weather',
       'cloudcover_total_hist_weather', 'cloudcover_mid_hist_weather',
       'cloudcover_high_hist_weather', 'diffuse_radiation_hist_weather',
       'temperature_forecast_weather', 'dewpoint_forecast_weather',
       'surface_solar_radiation_downwards_forecast_weather',
       'total_precipitation_forecast_weather', 'year', 'week', 'hour',
       'day_of_year', 'day_of_week']

In [None]:
import numpy as np

In [None]:
# Names of the features kept by the selector, as a plain Python list.
cols = list(sfs.feature_names_in_[sfs.get_support()])

# Display model_df restricted to those columns.
model_df.loc[:, cols]

Unnamed: 0,county,is_business,product_type,is_consumption,eic_count_client,installed_capacity_client,rain_hist_weather,snowfall_hist_weather,cloudcover_total_hist_weather,cloudcover_mid_hist_weather,...,diffuse_radiation_hist_weather,temperature_forecast_weather,dewpoint_forecast_weather,surface_solar_radiation_downwards_forecast_weather,total_precipitation_forecast_weather,year,week,hour,day_of_year,day_of_week
0,0,0,1,0,,,,,,,...,,,,,,2021,35,0,244,2
1,0,0,1,1,,,,,,,...,,,,,,2021,35,0,244,2
2,0,0,2,0,,,,,,,...,,,,,,2021,35,0,244,2
3,0,0,2,1,,,,,,,...,,,,,,2021,35,0,244,2
4,0,0,3,0,,,,,,,...,,,,,,2021,35,0,244,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018347,15,1,0,1,15.0,620.0,0.0,0.0,21.2,16.4,...,0.0,11.484033,6.748584,0.0,0.0,2023,22,23,151,2
2018348,15,1,1,0,20.0,624.5,0.0,0.0,21.2,16.4,...,0.0,11.484033,6.748584,0.0,0.0,2023,22,23,151,2
2018349,15,1,1,1,20.0,624.5,0.0,0.0,21.2,16.4,...,0.0,11.484033,6.748584,0.0,0.0,2023,22,23,151,2
2018350,15,1,3,0,55.0,2188.2,0.0,0.0,21.2,16.4,...,0.0,11.484033,6.748584,0.0,0.0,2023,22,23,151,2


In [None]:
# Re-fit the baseline XGBoost model using only the features chosen by SFS.

# Restrict a copy of model_df to the selected columns.
cols = sfs.feature_names_in_[sfs.get_support()].tolist()
df = model_df.copy()[cols]

# Features / label, then a 70/30 split with the same settings used for the selector.
X, y = df, model_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Train with default hyperparameters.
model = XGBRegressor(enable_categorical=True)
model.fit(X_train, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# MAE is the main optimisation metric; the test score is printed before the train score.
mae_test = mean_absolute_error(y_test, y_test_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
print('Mean absolute error test', mae_test)
print('Mean absolute error train', mae_train)

Mean absolute error test 49.646839153001004
Mean absolute error train 48.56717060826106


## XGBoost Parameters

What parameters can we tune?
Source: https://xgboost.readthedocs.io/en/stable/parameter.html

- `booster` [default: `gbtree`]: 
  - Description: Specifies the booster type to use.
  - Options: 
    - `gbtree`: Uses tree-based models.
    - `dart`: Similar to `gbtree`, but with dropout.
    - `gblinear`: Uses linear functions.
    
- `eta` [default: `0.3`, alias: `learning_rate`]: 
  - Description: Step size shrinkage used in update to prevent overfitting. 
  - Range: `[0, 1]`

- `max_depth` [default: `6`]: 
  - Description: Maximum depth of a tree. Increasing this value will make the model more complex and likely to overfit. 
  - Range: `[0, ∞]` (0 indicates no limit)

- `subsample` [default: `1`]: 
  - Description: Subsample ratio of the training instances to prevent overfitting. 
  - Range: `(0, 1]`

- `lambda` [default: `1`, alias: `reg_lambda`]: 
  - Description: L2 regularization term on weights. 
  - Range: `[0, ∞]`

- `alpha` [default: `0`, alias: `reg_alpha`]: 
  - Description: L1 regularization term on weights. 
  - Range: `[0, ∞]`

- `eval_metric` [default: according to objective]: 
  - Description: Evaluation metrics for validation data. 
  - Note: A default metric is assigned according to the objective (e.g., `rmse` for regression, `logloss` for classification). Users can add multiple evaluation metrics.


## GridSearch

First, search only over tree depth (`max_depth`) and learning rate (`learning_rate`):

In [None]:
# Hyperparameter tuning with an exhaustive grid search.

from sklearn.model_selection import GridSearchCV

# Every column of model_df except the label is used as a feature; 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    model_df.drop(columns='target'),
    model_df['target'],
    test_size=0.3,
    random_state=0,
)

# Candidate values: 4 depths x 4 learning rates = 16 combinations.
# Not searched yet: n_estimators (100, 200, 300, 500) and subsample (0.7, 0.8, 0.9).
param_grid = dict(
    max_depth=[6, 7, 8, 9],
    learning_rate=[0.01, 0.1, 0.3, 0.5],
)

# Base estimator whose hyperparameters are varied by the search.
xgb_reg = XGBRegressor(enable_categorical=True)

# 3-fold cross-validation on the training split, scored by negated MAE.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [None]:
# Show the parameter combination taken from grid_search.best_params_.
best_params

In [None]:
# Show the estimator taken from grid_search.best_estimator_.
best_model

In [None]:
import pickle

In [None]:
# Persist the grid search winner (best_model) as a pickle so that
# modelling_test_data.ipynb can load it for the test data.
model_path = '../models/XGBoost_first_best_model.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

**TODO**
- Re-run the grid search, but with the reduced number of columns from the feature selection (don't forget).
- Reduce overfitting? (how?)

In [None]:
# checking best model's MAE on test set
# NOTE: X_test / y_test are notebook globals; this cell uses whichever split was
# assigned most recently, so run it directly after the grid search cell.

y_pred = best_model.predict(X_test)

# Calculate the Mean Absolute Error between the actual and predicted values
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")

In [None]:
# checking best model's MAE on train set
# NOTE: this overwrites the `y_pred` and `mae` globals; X_train / y_train are
# whichever split was assigned most recently.

y_pred = best_model.predict(X_train)

# Calculate the Mean Absolute Error between the actual and predicted values
mae = mean_absolute_error(y_train, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")

## RandomizedSearch
Different parameters are tuned, and df is split into consumption/production.

In [None]:
# Randomized search with one model per target type (consumption / production),
# tuning a different set of hyperparameters than the grid search.

from sklearn.model_selection import RandomizedSearchCV

# Columns that must not be used as features: the label plus id / location / horizon columns.
# Several of these do not appear in the feature names printed by the feature-selection
# cells of this notebook, so `remove_col` may already have removed them; they are
# therefore dropped with errors='ignore' further down.
drop_columns = [
    'target',
    'hours_ahead_forecast_weather',
    'row_id',
    'data_block_id',
    'prediction_unit_id',
    'longitude_hist_weather',
    'longitude_forecast_weather',
    'latitude_hist_weather',
    'latitude_forecast_weather'
]

# max_depth 15 leads to overfitting
# `reg_lambda` is the scikit-learn wrapper's name for XGBoost's `lambda` (L2 regularisation).
params = {
    'gamma': [0, 0.1, 1, 10],
    'max_depth': [4, 6, 8],
    'min_child_weight': [0, 1, 4, 8],
    'reg_lambda': [0, 0.01, 0.1, 1],
    'num_parallel_tree': [1, 2, 3],
}


def _search_subset(is_consumption, label):
    """Run a randomized hyperparameter search on one slice of ``model_df``.

    Parameters
    ----------
    is_consumption : int
        Value of the ``is_consumption`` column that selects the rows (1 or 0).
    label : str
        Word used in the printed messages ('consumption' or 'production').

    Returns
    -------
    tuple
        ``(search, X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)``
        where ``search`` is the fitted ``RandomizedSearchCV``.
    """
    subset = model_df.query(f'is_consumption == {is_consumption}')
    # errors='ignore': tolerate drop_columns entries that are already absent from model_df.
    features = subset.drop(columns=drop_columns, errors='ignore')

    X_tr, X_te, y_tr, y_te = train_test_split(
        features,
        subset['target'],
        test_size=0.3,
        random_state=0
    )

    search = RandomizedSearchCV(
        estimator=XGBRegressor(enable_categorical=True),
        param_distributions=params,
        scoring='neg_mean_absolute_error',
        n_iter=10,
        cv=2,
        random_state=0,  # fixed seed so the same 10 candidates are sampled on every run
    )
    search.fit(X_tr, y_tr)

    pred_te = search.predict(X_te)
    pred_tr = search.predict(X_tr)
    print(f'Mean absolute error train {label}', mean_absolute_error(y_tr, pred_tr))
    print(f'Mean absolute error test {label}', mean_absolute_error(y_te, pred_te))
    return search, X_tr, X_te, y_tr, y_te, pred_tr, pred_te


# consumption model
# Each subset gets its own X_* names so the X_train / X_test produced by the
# grid search cell are not overwritten.
(bst_cons, X_train_cons, X_test_cons, y_train_cons, y_test_cons,
 y_pred_train_cons, y_pred_test_cons) = _search_subset(1, 'consumption')

# production model
(bst_prod, X_train_prod, X_test_prod, y_train_prod, y_test_prod,
 y_pred_train_prod, y_pred_test_prod) = _search_subset(0, 'production')

# overall score
# True values and predictions are paired by position (consumption rows first,
# then production rows), so plain array concatenation is used.
print(
    'Mean absolute error train overall',
    mean_absolute_error(
        np.concatenate([y_train_cons, y_train_prod]),
        np.concatenate([y_pred_train_cons, y_pred_train_prod])
    )
)
print(
    'Mean absolute error test overall',
    mean_absolute_error(
        np.concatenate([y_test_cons, y_test_prod]),
        np.concatenate([y_pred_test_cons, y_pred_test_prod])
    )
)



The MAE is quite similar for the two hyperparameter searches; the best maximum tree depth is probably somewhere between 8 and 10.
We need to validate our model on the test dataset to see how reliable it is.