In [2]:
# Import necessary dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb
import seaborn as sns
import pickle
import optuna

#### Data Loading

In [3]:
# Load target and weights
temp_datasets = {}

with open('/kaggle/input/rohlik-sales-preprocessed-v1/main_datasets.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)

X_train, X_val, X_oot, features = temp_datasets['X_train'], temp_datasets['X_val'], temp_datasets['X_oot'], temp_datasets['features']
del temp_datasets

y_train, y_val, y_oot = X_train.loc[:, 'sales'].copy(), X_val.loc[:, 'sales'].copy(), X_oot.loc[:, 'sales'].copy()
train_weights, val_weights, oot_weights = X_train.loc[:, 'weight'].copy(), X_val.loc[:, 'weight'].copy(), X_oot.loc[:, 'weight'].copy()
del X_train, X_val, X_oot

In [4]:
# Load features
temp_datasets = {}

with open('/kaggle/input/rohlik-sales-preprocessed-v1/main_datasets_scaled.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)
    
X_train, X_val, X_oot = temp_datasets['X_train'], temp_datasets['X_val'], temp_datasets['X_oot']
del temp_datasets

In [5]:
# Helper function to calculate WMAE for training and fine-tuning
def wmae(predt: np.ndarray, dtrain: xgb.DMatrix) -> tuple[str, float]:
    labels = dtrain.get_label()
    weights = dtrain.get_weight()
    return 'WMAE', sum(weights * abs(labels - predt)) / sum(weights)

#### XGBRegressor fine-tuning

In [6]:
# Combine train and validation data together
full_data_train = np.concatenate([X_train, X_val])
y_full_train = np.concatenate([y_train, y_val])
weights_full_train = np.concatenate([train_weights, val_weights])
print(f"X: {full_data_train.shape}, y: {y_full_train.shape}, weights: {weights_full_train.shape}")

X: (3895392, 311), y: (3895392,), weights: (3895392,)


In [7]:
# Transform full data into DMatrix
dfull = xgb.DMatrix(data=full_data_train, label=y_full_train, weight=weights_full_train)
doot = xgb.DMatrix(data=X_oot, label=y_oot, weight=oot_weights)

In [10]:
# Transform data into DMatrices
DM_train = xgb.DMatrix(data=X_train, label=y_train, weight=train_weights)
DM_val = xgb.DMatrix(data=X_val, label=y_val, weight=val_weights)
DM_oot = xgb.DMatrix(data=X_oot, label=y_oot, weight=oot_weights)

In [18]:
del DM_val, DM_oot

In [8]:
del X_train, X_val, X_oot

In [9]:
del full_data_train, y_full_train, weights_full_train

In [10]:
# Define objective for Bayesian optimization
def objective(trial):
    params = {
        'tree_method': 'hist',
        'device': 'cuda',
        'objective': 'reg:squarederror',
        #'eval_metric': wmae,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
    }

    model = xgb.train(
        params,
        dfull,
        num_boost_round=1000,
        evals=[(doot, 'val')],
        early_stopping_rounds=50,
        custom_metric=wmae,
        verbose_eval=False
    )

    predictions = model.predict(doot)

    return mean_absolute_error(y_oot, predictions, sample_weight=oot_weights)

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-01-05 13:58:35,430] A new study created in memory with name: no-name-c0ed9696-c9f8-412f-a15c-32668569199a
[I 2025-01-05 14:00:38,583] Trial 0 finished with value: 30.63625968934278 and parameters: {'max_depth': 7, 'learning_rate': 0.11330749653490861, 'subsample': 0.7597270145516669, 'colsample_bytree': 0.8882783727160704, 'min_child_weight': 10, 'gamma': 0.03322553033046405, 'reg_alpha': 0.19110280758098985, 'reg_lambda': 0.7608562198713736}. Best is trial 0 with value: 30.63625968934278.
[I 2025-01-05 14:02:27,881] Trial 1 finished with value: 33.13783313074562 and parameters: {'max_depth': 5, 'learning_rate': 0.2437091873842632, 'subsample': 0.9020180485556821, 'colsample_bytree': 0.739729055780925, 'min_child_weight': 4, 'gamma': 0.036000696852847736, 'reg_alpha': 0.42616164785843424, 'reg_lambda': 0.22816598867488436}. Best is trial 0 with value: 30.63625968934278.
[I 2025-01-05 14:04:21,434] Trial 2 finished with value: 35.189914270858296 and parameters: {'max_depth': 7, 

In [14]:
# Take a look at best parameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.2086679987063798, 'subsample': 0.9624417651671341, 'colsample_bytree': 0.7062837350306537, 'min_child_weight': 8, 'gamma': 0.1089398605720271, 'reg_alpha': 0.14489626100058484, 'reg_lambda': 0.12400282572201009}


In [12]:
# Save best parameters
best_parameters = {
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'reg:squarederror',
    'max_depth': 10,
    'learning_rate': 0.2537400839413723,
    'subsample': 0.7076939162504635,
    'colsample_bytree': 0.808729670358781,
    'min_child_weight': 3,
    'gamma': 0.06288940827561051,
    'reg_alpha': 0.222048017933006,
    'reg_lambda': 0.81477222217481
}

best_parameters_v1 = {
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 10,
    'learning_rate': 0.2086679987063798,
    'subsample': 0.9624417651671341,
    'colsample_bytree': 0.7062837350306537,
    'min_child_weight': 8,
    'gamma': 0.1089398605720271,
    'reg_alpha': 0.14489626100058484,
    'reg_lambda': 0.12400282572201009
}

# Train model with best parameters
model = xgb.train(
        best_parameters_v1,
        dfull,
        num_boost_round=2500,
        evals=[(dfull, 'train'), (doot, 'val')],
        early_stopping_rounds=50,
        verbose_eval=25,
        custom_metric=wmae,
    )

[0]	train-rmse:140.75461	train-WMAE:62.35947	val-rmse:158.65380	val-WMAE:65.16205
[25]	train-rmse:73.80667	train-WMAE:38.78182	val-rmse:92.99167	val-WMAE:44.71264
[50]	train-rmse:66.97510	train-WMAE:34.26555	val-rmse:86.98203	val-WMAE:40.64583
[75]	train-rmse:62.34600	train-WMAE:31.37366	val-rmse:82.92475	val-WMAE:37.94487
[100]	train-rmse:59.54530	train-WMAE:29.71366	val-rmse:80.95171	val-WMAE:36.57492
[125]	train-rmse:56.36927	train-WMAE:27.84264	val-rmse:78.03118	val-WMAE:34.84648
[150]	train-rmse:52.96979	train-WMAE:26.13685	val-rmse:75.63590	val-WMAE:33.56582
[175]	train-rmse:50.68208	train-WMAE:24.82224	val-rmse:73.86591	val-WMAE:32.41919
[200]	train-rmse:48.76982	train-WMAE:23.70146	val-rmse:72.71485	val-WMAE:31.57559
[225]	train-rmse:47.17004	train-WMAE:22.74996	val-rmse:71.74082	val-WMAE:30.81777
[250]	train-rmse:45.52436	train-WMAE:21.71197	val-rmse:70.61730	val-WMAE:29.92866
[275]	train-rmse:44.47167	train-WMAE:21.15779	val-rmse:69.93582	val-WMAE:29.50963
[300]	train-rmse:43

In [13]:
with open('/kaggle/input/rohlik-sales-preprocessed-v1/test_dataset_scaled.pkl', 'rb') as f:
    X_test = xgb.DMatrix(data=pickle.load(f)['X_test'])

with open('/kaggle/input/rohlik-sales-preprocessed-v1/test_dataset.pkl', 'rb') as f:
    X_test_id = pickle.load(f)['X_test']



test_preds = model.predict(X_test)
submission = pd.DataFrame(
    data={
        'id': X_test_id.loc[:, 'unique_id'] + '_' + X_test_id.loc[:, 'date'],
        'sales_hat': test_preds
    }
)

submission['sales_hat'] = submission.loc[:, 'sales_hat'].apply(lambda x: x if x >= 0.0 else 0.0)
submission.to_csv('submission.csv', index=False)