In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

from bayes_opt import BayesianOptimization
# https://github.com/fmfn/BayesianOptimization

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb

In [1]:
import sys
sys.path.append('../src/')
import lib

# XGBoost Regressor

XGBoost, the (alleged) king among all decision-tree models, offers a very potent API for regression tasks like ours.
It is extremely scalable and offers GPU support, a good number of parameters to tune, and overall fantastic results.

#### THE BIGGEST DOWNSIDE
The biggest downside to XGBoost for this particular now-regression task is the fact that we chose to transform
our time variables to a corresponding sin-cos pair. Decision trees will struggle to pick up the intended relation between these two feature columns. Adding XGBoost's feature interaction constraints via nested lists did not bring the desired fix, and overall XGBoost stayed behind our expectations for this regression task.

#### Bayesian Optimization
For this notebook/model we chose to search for optimal parameters using a Python Bayesian Optimization implementation,
which can be a lot more cost effective compared to extensive grid-searches while still delivering sufficient results.

In [3]:
# Raise pandas' display limits to 500 rows / 500 columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [7]:
# Relative directory prefix that is prepended to the data file name when loading.
data_path = '../data/'

In [None]:
# Load the prepared frame from its pickle.
# CSV alternative, kept for reference:
#   pd.read_csv("hourly_resampled_contracts_ohlcsv_weather.csv",
#               index_col=[0], header=[0, 1], parse_dates=True)
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(data_path + 'hourly_resampled_contracts_ohlcsv_weather.pkl')

# Columns handed to the rolling-window builder. The order is kept exactly as-is;
# the index-based `constraints` list further down presumably relies on it.
features = [
    't', 'weekday_sin', 'weekday_cos', 'run_hour',
    'n_prev_hour_contracts', 'hour_sin', 'hour_cos',
    'air_temp', 'rel_humidity', 'wind_speed', 'wind_dir',
    'holidays', 'qty_open', 'qty_high', 'qty_low', 'qty_close',
    'qty_var', 'qty_sum', 'px_open', 'px_high', 'px_low', 'px_var',
]
WINDOW_SIZE = 5

forecast_df = lib.create_rolling_windows(
    df,
    WINDOW_SIZE,
    features,
    save_to_pickle=False,
)
forecast_df.head()

### Train Test Split

In [None]:
# Split the windowed frame into train / validation / test features and targets.
(X_train, X_valid, X_test,
 y_train, y_valid, y_test) = lib.train_test_valid_split(
    forecast_df,
    WINDOW_SIZE,
    len(features),
    test_set=True,
)

### Feature Distribution:

In [None]:
# One 30-bin histogram per training column, drawn on a single 25x25 figure.
ax = X_train.hist(bins=30, figsize=(25, 25))

### Plain XGB model test:

In [None]:
# Column-index groups that were tried as XGBoost interaction constraints.
# Turns out they did not add any value, regardless of the "nested list"
# combination, so the list is defined but not passed to the model.
constraints = [
    [5, 21, 37, 53, 69, 84],
    [1, 2, 3, 4, 17, 18, 19, 20, 33, 34, 35, 36,
     49, 50, 51, 52, 65, 66, 67, 68, 80, 81, 82, 83],
    [0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
     22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
     38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
     70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
]
# interaction_constraints = constraints

# Baseline: regressor with default hyper-parameters, MAE as the evaluation
# metric and early stopping after 30 rounds.
xgbr = xgb.XGBRegressor(verbosity=1)
xgbr.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="mae",
    early_stopping_rounds=30,
)

# Score the baseline on the test split.
preds = xgbr.predict(X_test)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)
print("Test MAE: ", mae)
print("Test R2: ", r2)

In [None]:
# Bare string literal: evaluating the cell simply echoes it as the cell output.
# NOTE(review): presumably a score noted down from an earlier run -- confirm.
"Training score:  2.868287162863178"

In [None]:
# Feature importance of the baseline model (`xgbr`), drawn on a 20x20 canvas
# so that the individual bar labels stay readable.
fig, ax = plt.subplots(figsize=(20, 20))
xgb.plot_importance(xgbr, ax=ax)

Unsurprisingly, price values appear to be the most important driver in feature importance.

### XGB Bayesian Optimization:

Implementing Bayesian optimization to search for optimal parameter combinations:

In [None]:
def gpu_xgbox_function(n_estimators, max_depth, learning_rate, gamma,
                       min_child_weight, subsample, colsample_bytree,
                       max_delta_step, reg_alpha, reg_lambda, scale_pos_weight):
    """Objective for the Bayesian optimizer: negative validation MAE of one fit.

    Every argument is a candidate hyper-parameter value. ``n_estimators``,
    ``max_depth`` and ``max_delta_step`` are rounded to the nearest integer
    before use; all other values are forwarded to the regressor unchanged.

    The regressor is trained on the notebook-level ``X_train`` / ``y_train``
    with ``early_stopping_rounds=30`` and then scored on the notebook-level
    ``X_valid`` / ``y_valid``.

    Returns:
        The validation MAE multiplied by -1, so that maximizing this function
        minimizes the MAE.
    """
    def as_whole_number(value):
        # The optimizer proposes floats; these parameters are used as integers.
        return int(round(value, 0))

    # "deterministic_histogram": "true" was considered but is left disabled.
    candidate = {
        "tree_method": "gpu_hist",
        "eval_metric": "mae",
        "verbosity": 0,
        "random_state": 42,
        "n_estimators": as_whole_number(n_estimators),
        "max_depth": as_whole_number(max_depth),
        "max_delta_step": as_whole_number(max_delta_step),
        "learning_rate": learning_rate,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "scale_pos_weight": scale_pos_weight,
    }

    regressor = xgb.XGBRegressor(objective='reg:squarederror', **candidate)
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
    )

    valid_mae = mean_absolute_error(y_valid, regressor.predict(X_valid))
    return -1 * valid_mae

In [None]:
def call_model(X_train, X_valid, X_test, y_train, y_valid, y_test):
    """Tune XGBoost via Bayesian optimization, refit the best setup and score it.

    The search maximizes ``gpu_xgbox_function`` (negative validation MAE) over
    the bounds defined below, using 20 random initial points followed by 100
    guided iterations. The best parameter set is then used to train a final
    regressor on ``X_train`` / ``y_train`` (early stopping after 30 rounds,
    monitored via ``X_valid`` / ``y_valid``), which is evaluated on the test
    split.

    NOTE(review): ``gpu_xgbox_function`` reads the notebook-level train/valid
    data rather than the arguments passed here; only the final refit and the
    test evaluation use this function's parameters.

    Args:
        X_train, y_train: training features and targets for the final refit.
        X_valid, y_valid: validation features and targets for early stopping.
        X_test, y_test: held-out features and targets for the final score.

    Returns:
        A ``(best_params, preds, mae)`` tuple: the parameter dict used for the
        final model, that model's predictions for ``X_test``, and the MAE of
        those predictions against ``y_test``.
    """
    pbounds_xgb = {"n_estimators": (100, 200),
                   "max_depth": (4, 15),
                   "learning_rate": (0.1, 0.8),
                   "gamma": (0, 0.8),
                   "min_child_weight": (1, 2),
                   "subsample": (.5, 1.),
                   "colsample_bytree": (.5, 1.),
                   "max_delta_step": (0, 10),
                   "reg_alpha": (0, 1),
                   "reg_lambda": (0, 1),
                   "scale_pos_weight": (0, 1)}

    optimizer = BayesianOptimization(
        f=gpu_xgbox_function,
        pbounds=pbounds_xgb,
        verbose=1,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
        random_state=42,
    )

    optimizer.maximize(
        init_points=20,
        n_iter=100,
        acq="ei",
        xi=1e-1,
    )

    # The optimizer works on floats; apply the same integer rounding that the
    # objective function applies before handing the values to XGBoost.
    best_params = optimizer.max["params"].copy()
    for int_param in ("n_estimators", "max_depth", "max_delta_step"):
        best_params[int_param] = int(round(best_params[int_param], 0))
    best_params["tree_method"] = "gpu_hist"
    best_params["eval_metric"] = "mae"
    # best_params["deterministic_histogram"] = "true"
    best_params["verbosity"] = 1
    best_params["random_state"] = 42

    xgbr2 = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
    xgbr2.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
    )
    preds2 = xgbr2.predict(X_test)
    mae = mean_absolute_error(y_test, preds2)
    print("Test MAE: ", mae)

    # Return the tuned model's own test predictions. The name `preds` is not
    # defined inside this function, so returning it would hand back whatever
    # notebook-level `preds` exists (or raise NameError on a fresh kernel).
    return best_params, preds2, mae

In [None]:
# Run the parameter search and the final refit; this rebinds the
# notebook-level `preds` and `mae` to the values returned by call_model.
best_params, preds, mae = call_model(
    X_train=X_train,
    X_valid=X_valid,
    X_test=X_test,
    y_train=y_train,
    y_valid=y_valid,
    y_test=y_test,
)

In [None]:
# Display the parameter dict returned by call_model.
best_params

### Some further error analysis and notes on the xgb results:

While results were not as promising as anticipated, the MAE is now down to ~2.8

In [None]:
# Attach predictions, targets and absolute errors to the test frame for plotting.
# NOTE: this adds columns to X_test in place, so after this cell has run the
# frame no longer contains only the feature columns.
X_test["preds"] = preds
X_test["true"] = y_test
X_test["error"] = (X_test["true"] - X_test["preds"]).abs()

In [None]:
# Predicted vs. true values; marker size and colour both encode the absolute error.
ax = X_test.plot(
    kind="scatter",
    x="preds",
    y="true",
    title="XGB True vs. Pred. Values (size = error size)",
    s=np.array(X_test["error"]),
    figsize=(10, 10),
    c="error",
    colormap="viridis",
    colorbar=False,
    alpha=.5,
)
# Solid diagonal: prediction equals true value.
ax.plot([-100, 100], [-100, 100], color='black', linewidth=1)
# Dashed horizontal: true value of zero.
ax.plot([-100, 100], [0, 0], color='black', linestyle="--", linewidth=1)

Compared to the Lasso benchmark, the negative values now have a more even error spread and seem to have improved. The cluster of true values between 0 and 10 that we predict to be around 15-25 is still as present as before. The record outliers also appear to be the same, pointing towards a more substantial problem within the events causing these prices. To solve these errors, we either need to go back to the data and improve our features, or find an appropriate framework that can pick up the pattern XGB is so far missing.