# Model tuning

In previous notebooks, we conducted extensive experiments with numerous models and a wide range of features. In this notebook, we narrow our focus to a single model and a specific subset of features that are available in production. We will perform hyperparameter tuning and feature selection, utilizing techniques like cross-validation to refine the model. Our goal is to identify the most effective feature combinations and hyperparameter settings to optimize the model's accuracy for production deployment.


## Import dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
import os
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# pandas display settings: no truncation of rows, columns or cell contents,
# a very wide console, and floats rendered with thousands separators and
# two decimals in a 20-character field.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 2000
pd.options.display.float_format = '{:20,.2f}'.format

## Load data

### Load the training dataset containing only the features to be used in production + engineered features

In [None]:
# Name of the column the model is trained to predict.
TARGET_COL = 'DC Gen. Power'

# Training data lives two directory levels above this notebook, under data/.
GEN_DATA_PATH = os.path.join(
    '..', '..', 'data', '5_training_production_model_data.parquet'
)
df = pd.read_parquet(GEN_DATA_PATH)

# Quick look at the first five rows.
display(df.head())

In [None]:
# Separate the target column from the feature matrix.
y = df[TARGET_COL]
x = df.drop(columns=[TARGET_COL])

# Independent scalers, so the target can be inverse-transformed on its own.
x_scaler, y_scaler = StandardScaler(), StandardScaler()
x_scaled = x_scaler.fit_transform(x)
# StandardScaler works on 2-D input, hence the single-column reshape.
y_column = y.values.reshape(-1, 1)
y_scaled = y_scaler.fit_transform(y_column)

print('All: ', len(df))

# Training

In [None]:
# Accumulates one row per evaluated model (see calculate_metrics).
results = pd.DataFrame()


def calculate_metrics(y_pred, y_val, model_name, hyperparams, fold, features):
    """
    Compute MSE, RMSE and MAE for one evaluation and log them in ``results``.

    One row holding the metrics and the supplied metadata is appended to the
    module-level ``results`` DataFrame.

    Parameters
    ----------
    y_pred : array-like
        Predicted target values.
    y_val : array-like
        Ground-truth target values.
    model_name : str
        Stored in the ``model`` column.
    hyperparams : object
        Stored as-is in the ``hyperparams`` column.
    fold : int
        Stored in the ``fold`` column.
    features : str
        Stored in the ``features`` column.

    Returns
    -------
    tuple
        ``(mse, rmse, mae)``.
    """
    global results
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_pred)

    # Each metric is recorded once; the original listed 'mae' twice and
    # never stored the RMSE it had just computed.
    row = pd.DataFrame({
        'model': [model_name],
        'mae': [mae],
        'mse': [mse],
        'rmse': [rmse],
        'hyperparams': [hyperparams],
        'fold': [fold],
        'features': [features]
    })
    # ignore_index gives every logged row a unique index label instead of 0.
    results = pd.concat([results, row], ignore_index=True)

    return mse, rmse, mae

def plot_predictions(y_pred, y_actual, n=200):
    """
    Draw the last ``n`` actual values and the last ``n`` predictions
    on a single 20x8 figure and show it.
    """
    tail = slice(-n, None)
    plt.figure(figsize=(20, 8))
    # Actual first, then predicted, so line colours and legend order are stable.
    for series, label in ((y_actual, 'Actual'), (y_pred, 'Predicted')):
        plt.plot(series[tail], label=label)
    plt.legend()
    plt.show()


## Feature Selection

In [None]:


# Features prepended to every named subset below.
# NOTE: the comma after 'Precipitation' matters. Without it Python joins the
# two adjacent string literals into a single, non-existent column name.
essential_features = ['Temperature', 'Precipitation', 'Shortwave Radiation']

all_features = x.columns.tolist()

# Calendar features: day, then the one-hot season, month and hour columns.
seasonal_temporal_features = (
    ['day']
    + [f'season_{season}' for season in range(4)]
    + [f'month_{month}' for month in range(1, 13)]
    + [f'hour_{hour}' for hour in range(24)]
)

# Same calendar features plus the time elapsed since the last rain.
temporal_features = seasonal_temporal_features + ['Hours Since Last Rain']

lag_features = [
    'DC Gen. Power 1 Hour Lag', 'DC Gen. Power 2 Hour Lag', 'DC Gen. Power 4 Hour Lag',
    'DC Gen. Power 24 Hour Lag', 'DC Gen. Power 720 Hour Lag',
    'DC Gen. Power 24 Hour Rolling Mean', 'DC Gen. Power 24 Hour Rolling Std',
    'DC Gen. Power 24 Hour Rolling Max', 'DC Gen. Power 24 Hour Rolling EMA',
    'DC Gen. Power 48 Hour Rolling Mean', 'DC Gen. Power 48 Hour Rolling Std',
    'DC Gen. Power 48 Hour Rolling Max', 'DC Gen. Power 48 Hour Rolling EMA',
    'DC Gen. Power 720 Hour Rolling Mean', 'DC Gen. Power 720 Hour Rolling Std',
    'DC Gen. Power 720 Hour Rolling Max', 'DC Gen. Power 720 Hour Rolling EMA'
]

# Every column whose name contains 'Rolling'.
rolling_features = [col for col in df.columns if 'Rolling' in col]

domain_knowledge_features = ['Hours Since Last Rain', 'Solar Zenith Angle']

FEATURE_SETS = {
    'all': all_features,
    'temporal': essential_features + temporal_features,
    'lag': essential_features + lag_features,
    'rolling': essential_features + rolling_features,
    'seasonal_temporal': essential_features + seasonal_temporal_features,
    'domain_knowledge': essential_features + domain_knowledge_features
}

# Report names that are not columns of `x`, so a typo shows up here instead
# of quietly shrinking a feature set further down the notebook.
for _set_name, _set_features in FEATURE_SETS.items():
    _unknown = [name for name in _set_features if name not in x.columns]
    if _unknown:
        print(f"Feature set '{_set_name}' references unknown columns: {_unknown}")


## K-Fold Cross-Validation (time-series split)


In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Each of the 5 validation windows covers 12% of the rows.
validation_size = int(0.12 * len(x))
tscv = TimeSeriesSplit(n_splits=5, test_size=validation_size)

# Start from an empty results log for this tuning run.
results = pd.DataFrame()

def run_k_fold_cv(model_factory, space, max_evals=100):
    """
    Run a hyperopt search for every (fold, feature set) combination.

    For each split produced by ``tscv`` and each entry of ``FEATURE_SETS``,
    an independent TPE search of ``max_evals`` evaluations is executed.
    Every evaluation is logged through ``calculate_metrics``. The model with
    the lowest validation MAE seen so far is kept in the module-level
    ``best_model`` / ``best_loss``.

    Parameters
    ----------
    model_factory : callable
        Called as ``model_factory(**params)``; must return an unfitted model
        exposing ``fit`` and ``predict``.
    space : dict
        hyperopt search space handed to ``fmin``.
    max_evals : int, default 100
        Number of evaluations per (fold, feature set) search.

    Returns
    -------
    object or None
        The fitted model with the lowest validation MAE over all evaluations,
        or ``None`` if nothing was evaluated.

    Notes
    -----
    MAE values from different folds are compared directly when choosing
    ``best_model``.
    """
    global best_loss, best_model

    best_loss = np.inf
    best_model = None

    for fold, (train_idx, val_idx) in enumerate(tscv.split(x)):
        # Training targets stay in the space of the notebook-level y_scaler so
        # that predictions can be inverse-transformed with that same scaler.
        y_train_fold = y_scaled[train_idx]
        # train_idx / val_idx are positions, so the Series needs .iloc;
        # plain y[val_idx] would look the integers up as index labels.
        y_val_fold = y.iloc[val_idx]

        for features_name, feature_set in FEATURE_SETS.items():
            x_fold = x.drop(columns=[col for col in x.columns if col not in feature_set])

            # Fit the feature scaler on the training rows only. Fitting on the
            # whole frame would leak validation statistics into training.
            # A local scaler also leaves the global x_scaler untouched.
            fold_scaler = StandardScaler()
            x_train_fold = fold_scaler.fit_transform(x_fold.iloc[train_idx])
            x_val_fold = fold_scaler.transform(x_fold.iloc[val_idx])

            print(f"Fold {fold + 1}")
            print(f"Feature set: {features_name}")
            print(f"Train: {len(train_idx)}")
            print(f"Validation: {len(val_idx)}")

            def objective(params):
                """Fit one candidate and return its validation MAE to hyperopt."""
                global best_loss, best_model
                model = model_factory(**params)
                model.fit(x_train_fold, y_train_fold)
                y_pred = model.predict(x_val_fold)
                # Back to original units before scoring against the raw target.
                y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))
                mae = mean_absolute_error(y_val_fold, y_pred)
                model_name = model.__class__.__name__
                calculate_metrics(y_pred, y_val_fold, model_name, params, fold + 1, features_name)
                if mae < best_loss:
                    best_loss = mae
                    best_model = model

                return {'loss': mae, 'status': STATUS_OK}

            trials = Trials()
            fmin(objective, space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    return best_model




### CatBoost Regressor

Given that CatBoost models have delivered the best results in prior experiments, we will focus exclusively on fine-tuning the hyperparameters for this model.

In [None]:
from catboost import CatBoostRegressor

def model_factory(**params):
    """Build a CatBoostRegressor from the given hyperparameters with training output disabled."""
    return CatBoostRegressor(**params, verbose=False)


# hyperopt search space; each label matches the CatBoost parameter it samples.
space = dict(
    iterations=hp.choice('iterations', [50, 100, 200, 400]),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    depth=hp.uniformint('depth', 3, 15),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 1, 10),
    border_count=hp.uniformint('border_count', 32, 255),
    subsample=hp.uniform('subsample', 0.5, 1.0),
    rsm=hp.uniform('rsm', 0.5, 1.0),
    random_strength=hp.uniform('random_strength', 0.5, 1.0),
    bagging_temperature=hp.uniform('bagging_temperature', 0.0, 1.0),
)

# 100 evaluations for every (fold, feature set) pair.
best_model = run_k_fold_cv(model_factory, space, max_evals=100)


# Results

In [None]:
# The twenty individual evaluations with the lowest MAE.
display(results.sort_values(by='mae').head(20))

# Hyperparameters are stringified so they can serve as a grouping key.
results['hyperparams'] = results['hyperparams'].astype(str)

# Mean MAE per (model, feature set, hyperparameters), best first.
results_group_all_folds = (
    results
    .groupby(['model', 'features', 'hyperparams'])['mae']
    .mean()
    .sort_values()
)

In [43]:
# First 100 configurations, already ordered by ascending mean MAE.
results_group_all_folds.iloc[:100]

model              features  hyperparams                                                                                                                                                                                                                                                                                                                      
CatBoostRegressor  all       {'bagging_temperature': 0.3960590286279437, 'border_count': 188, 'depth': 7, 'iterations': 400, 'l2_leaf_reg': 1.327925877118482, 'learning_rate': 0.0772764990599374, 'random_strength': 0.518174597745601, 'rsm': 0.9381818176925532, 'subsample': 0.848270593585808}                                                                         4,109.71
                             {'bagging_temperature': 0.5370187694440764, 'border_count': 188, 'depth': 7, 'iterations': 400, 'l2_leaf_reg': 1.0118314912996154, 'learning_rate': 0.07470502426645866, 'random_strength': 0.5236926881366905, 'rsm': 0.7214254522691629, 'subsample'

In the next notebook, we will apply the best hyperparameter configurations and the most effective feature set identified in these experiments to train the final production model.