# Prediction Pipeline: Tree-Based

In [None]:
from predictor import BasePredictor
from utils import Utils

# Read the project configuration from the shared YAML file.
config = Utils.read_config_for_env(config_path='../config/config.yml')

# Keep every predictor option in one mapping so the configuration of this
# run can be read (and tweaked) at a glance.
predictor_options = dict(
    refresh_monthly=False,
    refresh_ts_features=False,
    clean_strategy='olrem_for_all',
    split_strategy='random',
    num_lag_mon=3,
    val_ratio=0.2,
    scaler_type='standard',
)
predictor = BasePredictor(config, **predictor_options)

In [None]:
# Split the data into training and validation parts and apply the scaling.
# After this call the predictor object holds X_train, y_train, X_val, y_val
# and feature_names, which the modelling cells below read.
# NOTE(review): the split and scaling are presumably driven by the
# split_strategy / val_ratio / scaler_type passed to BasePredictor — confirm.
predictor.split_scale_X_y()

## Basic LightGBM model

In [None]:
import lightgbm as lgb
# from lightgbm import LGBMRegressor 
import numpy as np
import shap
from sklearn.metrics import mean_squared_error as mse 
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Training set: features X_train with targets y_train, with the columns
# named after the predictor's feature list.
train_data = lgb.Dataset(
    data=predictor.X_train,
    feature_name=predictor.feature_names,
    label=predictor.y_train,
)

# Validation set: features X_val with targets y_val. It is tied to
# train_data through `reference` so both datasets are evaluated consistently.
val_data = lgb.Dataset(
    data=predictor.X_val,
    feature_name=predictor.feature_names,
    label=predictor.y_val,
    reference=train_data,
)
# The parameter dictionary configuring the LightGBM regression model is
# defined in the next cell.

In [None]:
# Booster configuration: gradient-boosted trees fitted on a regression
# objective and evaluated with RMSE.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=30,
    learning_rate=0.1,
    feature_fraction=0.9,
)

# Stop once the validation metric has not improved for 5 consecutive rounds.
callback_early_stopping = lgb.early_stopping(5)

# Upper bound on the number of boosting rounds.
num_round = 100

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[callback_early_stopping, lgb.log_evaluation()],
)

In [None]:
# Plot the feature importances of the LightGBM booster trained above.
lgb.plot_importance(model)

In [None]:
# Score the basic LightGBM model on both splits.
pred_train = model.predict(predictor.X_train)
pred_val = model.predict(predictor.X_val)

# RMSE is the square root of the mean squared error between the targets and
# the predictions; compute it per split, then report.
rmse_train = np.sqrt(mse(predictor.y_train, pred_train))
rmse_val = np.sqrt(mse(predictor.y_val, pred_val))
print("Training RMSE: ", rmse_train)
print("Validation RMSE: ", rmse_val)

## LightGBM tuned with AutoML

In [None]:
from flaml import AutoML
import numpy as np
import matplotlib.pyplot as plt

In [None]:
automl = AutoML()

# Search settings handed to FLAML:
#   time_budget    - total running time in seconds
#   metric         - primary metric; for regression choose from ['mae', 'mse', 'r2']
#   estimator_list - list of ML learners; only lightgbm is tuned here
#   task           - task type
#   log_file_name  - flaml log file (read back later for the learning curve)
# A fixed random seed could be added with seed=42.
settings = dict(
    time_budget=600,
    metric="mse",
    estimator_list=["lgbm"],
    task="regression",
    log_file_name="store_sales_lgbm.log",
)

# Only the training split is passed to the search.
automl.fit(
    X_train=predictor.X_train,
    y_train=predictor.y_train,
    **settings,
)

In [None]:
from flaml.automl.data import get_output_from_log

# Replay the FLAML search log to recover, per trial, the wall-clock time and
# the validation losses. Reuse the budget from `settings` so this cell cannot
# drift from the value the search was actually run with.
(
    time_history,
    best_valid_loss_history,
    valid_loss_history,
    config_history,
    metric_history,
) = get_output_from_log(
    filename=settings['log_file_name'],
    time_budget=settings['time_budget'],
)

plt.title('Learning Curve')
plt.xlabel('Wall Clock Time (s)')
# The search was run with metric "mse", so the logged loss *is* the validation
# MSE (lower is better). The "1 - loss" conversion only applies when the
# metric is r2, so the best-so-far loss is plotted directly.
plt.ylabel('Validation MSE')
plt.step(time_history, best_valid_loss_history, where='post')
plt.show()

In [None]:
# Summarise the best trial found by the AutoML search.
print("Best hyperparmeter config:", automl.best_config)
# The search metric is "mse", so best_loss is already the validation MSE;
# reporting "1 - best_loss" as r2 would only be valid for an r2 search.
print("Best mse on validation data: {0:.4g}".format(automl.best_loss))
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))
# The underlying fitted estimator with its final hyperparameters.
print(automl.model.estimator)

In [None]:
# Score the AutoML-tuned model on both splits.
pred_train = automl.predict(predictor.X_train)
pred_val = automl.predict(predictor.X_val)

# RMSE is the square root of the mean squared error between the targets and
# the predictions; compute it per split, then report.
rmse_train = np.sqrt(mse(predictor.y_train, pred_train))
rmse_val = np.sqrt(mse(predictor.y_val, pred_val))
print("Training RMSE: ", rmse_train)
print("Validation RMSE: ", rmse_val)

In [None]:
# Horizontal bar chart of the importances reported by the AutoML model, one
# bar per entry of the predictor's feature-name list.
# (automl.feature_names_in_ is an alternative source for the bar labels.)
plt.barh(y=predictor.feature_names, width=automl.feature_importances_)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Scratch check of train_test_split on a toy matrix holding the integers
# 0..19 arranged as 5 rows by 4 columns.
X = np.arange(20).reshape(5, 4)
X.shape
# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
X_train, X_test = train_test_split(X, random_state=42, test_size=0.2)