In [1]:
# Import necessary dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb
import seaborn as sns
import pickle
import optuna

#### Data Loading

In [2]:
# Load target and weights
temp_datasets = {}

with open('/kaggle/input/rohlik-sales-preprocessed-v1/main_datasets.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)

X_train, X_val, X_oot, features = temp_datasets['X_train'], temp_datasets['X_val'], temp_datasets['X_oot'], temp_datasets['features']
del temp_datasets

y_train, y_val, y_oot = X_train.loc[:, 'sales'].copy(), X_val.loc[:, 'sales'].copy(), X_oot.loc[:, 'sales'].copy()
train_weights, val_weights, oot_weights = X_train.loc[:, 'weight'].copy(), X_val.loc[:, 'weight'].copy(), X_oot.loc[:, 'weight'].copy()
del X_train, X_val, X_oot

In [3]:
# Load features
temp_datasets = {}

with open('/kaggle/input/rohlik-sales-preprocessed-v1/main_datasets_scaled.pkl', 'rb') as f:
    temp_datasets = pickle.load(f)
    
X_train, X_val, X_oot = temp_datasets['X_train'], temp_datasets['X_val'], temp_datasets['X_oot']
del temp_datasets

In [4]:
# Helper function to calculate WMAE
def WMAE(y_true: pd.Series | np.ndarray, y_pred: pd.Series | np.ndarray, weights: pd.Series | np.ndarray) -> float:
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

In [5]:
# Helper function to calculate WMAE for training and fine-tuning
def wmae(predt: np.ndarray, dtrain: xgb.DMatrix) -> tuple[str, float]:
    labels = dtrain.get_label()
    weights = dtrain.get_weight()
    return 'WMAE', sum(weights * abs(labels - predt)) / sum(weights)

#### XGBRegressor fine-tuning

In [6]:
model = XGBRegressor()

In [7]:
model = XGBRegressor().fit(X_train, y_train)
model

In [10]:
# Transform data into DMatrices
DM_train = xgb.DMatrix(data=X_train, label=y_train, weight=train_weights)
DM_val = xgb.DMatrix(data=X_val, label=y_val, weight=val_weights)
DM_oot = xgb.DMatrix(data=X_oot, label=y_oot, weight=oot_weights)

In [18]:
del DM_val, DM_oot

In [9]:
del X_train, X_val, X_oot

In [27]:
# Define objective for Bayesian optimization
def objective(trial):
    params = {
        'tree_method': 'hist',
        'device': 'cuda',
        'objective': 'reg:squarederror',
        #'eval_metric': wmae,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
    }

    model = xgb.train(
        params,
        DM_train,
        num_boost_round=1000,
        evals=[(DM_val, 'val')],
        early_stopping_rounds=50,
        feval=wmae,
        verbose_eval=False
    )

    predictions = model.predict(DM_val)

    return mean_absolute_error(y_val, predictions, sample_weight=val_weights)

In [28]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-01-04 14:35:03,915] A new study created in memory with name: no-name-25181468-6b11-4de1-94c2-f0271d0bf756
[I 2025-01-04 14:40:16,741] Trial 0 finished with value: 21.042291548329416 and parameters: {'max_depth': 9, 'learning_rate': 0.10030640756936818, 'subsample': 0.9533098393777967, 'colsample_bytree': 0.6662756192658198, 'min_child_weight': 6, 'gamma': 0.17016039597364047, 'reg_alpha': 0.6319653168494699, 'reg_lambda': 0.9816708607986481}. Best is trial 0 with value: 21.042291548329416.
[I 2025-01-04 14:45:27,688] Trial 1 finished with value: 28.028851061848275 and parameters: {'max_depth': 9, 'learning_rate': 0.03839191107145719, 'subsample': 0.8514898152345085, 'colsample_bytree': 0.6772352938725226, 'min_child_weight': 9, 'gamma': 0.04110176501736956, 'reg_alpha': 0.06747288206939617, 'reg_lambda': 0.06722734753094672}. Best is trial 0 with value: 21.042291548329416.
[I 2025-01-04 14:50:26,436] Trial 2 finished with value: 21.905485325663648 and parameters: {'max_depth': 

In [29]:
# Take a look at best parameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.2537400839413723, 'subsample': 0.7076939162504635, 'colsample_bytree': 0.808729670358781, 'min_child_weight': 3, 'gamma': 0.06288940827561051, 'reg_alpha': 0.222048017933006, 'reg_lambda': 0.81477222217481}


In [21]:
# Save best parameters
best_parameters = {
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'reg:squarederror',
    'max_depth': 10,
    'learning_rate': 0.2537400839413723,
    'subsample': 0.7076939162504635,
    'colsample_bytree': 0.808729670358781,
    'min_child_weight': 3,
    'gamma': 0.06288940827561051,
    'reg_alpha': 0.222048017933006,
    'reg_lambda': 0.81477222217481
}

In [None]:
# Train model with best parameters
model = xgb.train(
        best_parameters,
        DM_train,
        num_boost_round=1000,
        evals=[(DM_val, 'val')],
        early_stopping_rounds=50,
        custom_metric=wmae,
        verbose_eval=False
    )

In [None]:
with open('/kaggle/input/rohlik-sales-preprocessed-v1/test_dataset_scaled.pkl', 'rb') as f:
    X_test = xgb.DMatrix(data=pickle.load(f)['X_test'])

with open('/kaggle/input/rohlik-sales-preprocessed-v1/test_dataset.pkl', 'rb') as f:
    X_test_id = pickle.load(f)['X_test']



test_preds = model.predict(X_test)
submission = pd.DataFrame(
    data={
        'id': X_test_id.loc[:, 'unique_id'] + '_' + X_test_id.loc[:, 'date'],
        'sales_hat': test_preds
    }
)

submission['sales_hat'] = submission.loc[:, 'sales_hat'].apply(lambda x: x if x >= 0.0 else 0.0)
submission.to_csv('submission.csv', index=False)