In [12]:
# Import necessary dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb
import seaborn as sns
import pickle

#### Data Loading

In [2]:
# Load target and weights
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by this project's own pipeline.
# The file is only read here, so it is opened read-only ('rb'). The previous
# 'rb+' mode needlessly required write permission on the dataset file.
with open('../data/main_datasets.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)

X_train, X_val, X_oot, features = (
    temp_datasets['X_train'],
    temp_datasets['X_val'],
    temp_datasets['X_oot'],
    temp_datasets['features'],
)
del temp_datasets

# Keep independent copies of the target ('sales') and the per-row 'weight'
# columns so that the frames they came from can be released right after.
y_train, y_val, y_oot = (
    X_train.loc[:, 'sales'].copy(),
    X_val.loc[:, 'sales'].copy(),
    X_oot.loc[:, 'sales'].copy(),
)
train_weights, val_weights, oot_weights = (
    X_train.loc[:, 'weight'].copy(),
    X_val.loc[:, 'weight'].copy(),
    X_oot.loc[:, 'weight'].copy(),
)
del X_train, X_val, X_oot

In [3]:
# Load features
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by this project's own pipeline.
# The file is only read here, so it is opened read-only ('rb'). The previous
# 'rb+' mode needlessly required write permission on the dataset file.
with open('../data/main_datasets_scaled.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)

X_train, X_val, X_oot = (
    temp_datasets['X_train'],
    temp_datasets['X_val'],
    temp_datasets['X_oot'],
)
del temp_datasets

In [4]:
# Helper function to calculate WMAE
def WMAE(y_true: pd.Series | np.ndarray, y_pred: pd.Series | np.ndarray, weights: pd.Series | np.ndarray) -> float:
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

#### XGBRegressor fine-tuning

In [11]:
# Baseline regressor: XGBRegressor built with no explicit hyper-parameters,
# fitted on the training features and target (no sample weights are passed).
model = XGBRegressor()
model.fit(X_train, y_train)
# Last expression of the cell, so the fitted estimator is displayed.
model

In [13]:
# Transform data into DMatrices
def _to_dmatrix(feature_data, target, row_weights):
    """Wrap one split's features, target and row weights in an xgb.DMatrix."""
    return xgb.DMatrix(data=feature_data, label=target, weight=row_weights)


DM_train = _to_dmatrix(X_train, y_train, train_weights)
DM_val = _to_dmatrix(X_val, y_val, val_weights)
DM_oot = _to_dmatrix(X_oot, y_oot, oot_weights)

In [18]:
# Drop the references to the validation and out-of-time DMatrices.
del DM_val
del DM_oot

In [14]:
# Drop the references to the three feature sets.
del X_train
del X_val
del X_oot

In [17]:
# Single fixed set of booster parameters (despite the name, not a grid).
param_grid = dict(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    seed=42,
)

# 5-fold cross-validation on the training DMatrix, tracking MAE for up to
# 300 boosting rounds with early stopping set to 10 rounds.
# NOTE(review): the recorded output of this cell is
# "XGBoostError: bad allocation", which looks like the run exhausted available
# memory. Confirm, and consider a smaller sample or fewer folds.
cv_results = xgb.cv(
    params=param_grid,
    dtrain=DM_train,
    nfold=5,
    num_boost_round=300,
    early_stopping_rounds=10,
    metrics='mae',
    seed=42,
    as_pandas=True,
)

XGBoostError: bad allocation