In [2]:
# loading packages
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split

In [3]:
# Loading our own functions
import Functions as Func

First we load in the data and apply our data processing steps

We define 'model data' as the set collectively used for training & validation.

In [4]:
df = Func.load_and_basic_process()
df_X, df_Y = Func.df_data_process(df)

X_model, X_test, Y_model, Y_test = train_test_split(df_X, df_Y, test_size=0.1, random_state=1)
X_train, X_val, Y_train, Y_val = train_test_split(X_model, Y_model, test_size=0.2, random_state=2)

For purposes of comparison we apply a simple intercept-only poisson model, and then a poisson model using our predictors, to the training data.

In [5]:
# Intercept poisson model

n_samples = X_train.shape[0]  # Number of samples
X_train_intercept = np.ones((n_samples, 1))

glm_int = PoissonRegressor(alpha=1e-4, solver="newton-cholesky")
glm_int.fit(X_train_intercept, Y_train["Frequency"], sample_weight=Y_train["Exposure"])

n_samples_test = Y_test.shape[0]  # Number of samples
X_test_intercept = np.ones((n_samples_test, 1))

# Evaluate the model
scores = Func.score_estimator(
    glm_int,
    X_train_intercept,
    X_test_intercept,
    Y_train,
    Y_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency with only the intercept")
print(scores)

Evaluation of PoissonRegressor on target Frequency with only the intercept
subset               train    test
metric                            
D² explained        0.0000 -0.0000
mean abs. error     0.1401  0.1406
mean squared error  0.2391  0.2450


In [6]:
# Next we review a poisson model using predictive features
glm_freq = PoissonRegressor(alpha=1e-4, solver="newton-cholesky")
glm_freq.fit(X_train, Y_train["Frequency"], sample_weight=Y_train["Exposure"])

# Model Evaluation
scores = Func.score_estimator(
    glm_freq,
    X_train,
    X_test,
    Y_train,
    Y_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)

Evaluation of PoissonRegressor on target Frequency
subset               train    test
metric                            
D² explained        0.0191  0.0193
mean abs. error     0.1378  0.1385
mean squared error  0.2375  0.2434


With these models available, we next can apply XGBoost as comparison

In [7]:
# We change the dataframe's into efficient XGBoost Dmatrixies, which contain just the predictors, the labels (frequency) and weights (exposure)
# Since XGBoost applies internal train-validation, we use the 'model' dataset for training

dtrain_reg = xgb.DMatrix(X_model, label=Y_model["Frequency"], weight=Y_model["Exposure"])
dtest_reg = xgb.DMatrix(X_test, label=Y_test["Frequency"], weight=Y_test["Exposure"])



In [8]:
# Here we use very basic hyperparamters

params = {"objective": "reg:absoluteerror", "tree_method": "hist"}
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

In [9]:
# Our initial results look promising

n = 1000
xgb_freq = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   early_stopping_rounds=10,
)

[0]	train-mae:0.07361	validation-mae:0.07423
[1]	train-mae:0.07360	validation-mae:0.07423
[2]	train-mae:0.07360	validation-mae:0.07422
[3]	train-mae:0.07360	validation-mae:0.07422
[4]	train-mae:0.07360	validation-mae:0.07422
[5]	train-mae:0.07360	validation-mae:0.07422
[6]	train-mae:0.07360	validation-mae:0.07422
[7]	train-mae:0.07360	validation-mae:0.07421
[8]	train-mae:0.07360	validation-mae:0.07422
[9]	train-mae:0.07360	validation-mae:0.07422
[10]	train-mae:0.07360	validation-mae:0.07422
[11]	train-mae:0.07360	validation-mae:0.07422
[12]	train-mae:0.07360	validation-mae:0.07422
[13]	train-mae:0.07360	validation-mae:0.07422
[14]	train-mae:0.07360	validation-mae:0.07422
[15]	train-mae:0.07360	validation-mae:0.07422
[16]	train-mae:0.07360	validation-mae:0.07422


In [10]:
# We apply a 5-fold CV on the testing data to see if there is overfitting: the results seem to hold

cv_results = xgb.cv(
   params,
   dtest_reg,
   num_boost_round=n,
   nfold=5,
   metrics = ["mae"],
   early_stopping_rounds=10
)

print(cv_results)

   train-mae-mean  train-mae-std  test-mae-mean  test-mae-std
0        0.074094       0.000742       0.074457      0.002993


We attempt another run using tuned hyperparameters which we attained from our tuning file to compare the results

In [13]:
n = 597

params = {"objective": "reg:absoluteerror",
            "tree_method": "hist",
            "learning_rate": 0.075,
            "max_depth": 1,
            "subsample": 0.5659,
            "colsample_bytree": 0.8832,
            "max_delta_step": 0.3655,
            "min_child_weight": 8.5903,
            "gamma": 2.798}
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

xgb_freq = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   early_stopping_rounds=10,
)

[0]	train-mae:0.07363	validation-mae:0.07422
[1]	train-mae:0.07363	validation-mae:0.07422
[2]	train-mae:0.07363	validation-mae:0.07422
[3]	train-mae:0.07363	validation-mae:0.07422
[4]	train-mae:0.07363	validation-mae:0.07422
[5]	train-mae:0.07363	validation-mae:0.07422
[6]	train-mae:0.07363	validation-mae:0.07422
[7]	train-mae:0.07363	validation-mae:0.07422
[8]	train-mae:0.07363	validation-mae:0.07422
[9]	train-mae:0.07363	validation-mae:0.07422
[10]	train-mae:0.07363	validation-mae:0.07422


In [14]:
# What we see are extremely minor differences. This confirms to us that the default parameters are effective for our purposes.

cv_results = xgb.cv(
   params,
   dtest_reg,
   num_boost_round=n,
   nfold=5,
   metrics = ["mae"],
   early_stopping_rounds=10
)

print(cv_results)

   train-mae-mean  train-mae-std  test-mae-mean  test-mae-std
0        0.074224       0.000731       0.074225      0.002932
