In [1]:
import pandas as pd
import numpy as np
from store_sales import encode_features, get_mae, get_metrics_cross_validation
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
import optuna

In [2]:
# Load the prepared dataset from its path relative to this notebook.
# NOTE(review): the recorded output shows an `Unnamed: 0` column, so the CSV
# appears to carry a saved index — confirm whether downstream feature
# encoding should drop it.
prepared_data_path = '../data/prepared_data/prepared_data.csv'
data = pd.read_csv(prepared_data_path)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,date,store_number,item_family,items_on_promotion,item_sales,city,state,store_type,store_cluster,oil_price,...,is_popular_cluster,is_non_popular_cluster,is_special_non_working_day,is_national_holiday,is_state_pichincha,is_state_manabi_or_pastaza,is_city_quito_or_cayambe,is_city_manta_or_puyo,is_store_type_A,number_of_days_since_earthquake
0,2013-01-01,1,AUTOMOTIVE,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
1,2013-01-01,1,BABY CARE,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
2,2013-01-01,1,BEAUTY,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
3,2013-01-01,1,BEVERAGES,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
4,2013-01-01,1,BOOKS,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201


#### We hold out the last 15 days of the dataset as the test period to predict; all earlier dates are used for training

In [3]:
# Hold out the final `number_of_days_to_predict` distinct calendar dates as the
# test set; every earlier date goes to the training set.
number_of_days_to_predict = 15

# Parse the date column once instead of re-parsing it for every comparison.
parsed_dates = pd.to_datetime(data['date'])

# Sort the distinct dates explicitly so the cutoff does not depend on the row
# order of the CSV (the original indexed into `unique()`, which preserves
# first-appearance order rather than chronological order).
sorted_unique_dates = parsed_dates.drop_duplicates().sort_values()
min_test_date = sorted_unique_dates.iloc[-number_of_days_to_predict]

train_data = data[parsed_dates < min_test_date]
test_data = data[parsed_dates >= min_test_date]

In [5]:
# Hyperparameters for the evaluated model. These values match the
# "Best parameters" line in the recorded Optuna output of this notebook.
tuned_params = dict(
    max_depth=2,
    n_estimators=259,
    learning_rate=0.016911596898275656,
    subsample=0.966503084641785,
    colsample_bytree=0.7652975950602404,
    gamma=0.3662258875815241,
    reg_lambda=2.254417098310434e-07,
    alpha=0.5897781687350846,
)
xgboost = xgb.XGBRegressor(**tuned_params, n_jobs=-1, random_state=42)

# Cross-validate on the training period only; the helper returns the MAE,
# the mean sales and the WMAPE (in percent) in that order.
cv_metrics = get_metrics_cross_validation(train_data, xgboost)
mae, mean_sales, wmape_in_percentage = cv_metrics

# Trailing comments hold the values noted by the author from an earlier run.
print('Average MAE for XGBoost:', mae)  # 96.2
print('Average sales:', mean_sales)  # 356.81
print('Average WMAPE for XGBoost:', wmape_in_percentage, '%')  # 74.6 %

In [15]:
# Expanding-window splitter used for every (store, item family) series.
tscv = TimeSeriesSplit(n_splits=5)

# Tuning on every store is too slow, so the search scores a small sample of stores.
shops_number = 5

# Use a seeded generator so the same stores are drawn after every
# Restart & Run All; an unseeded draw makes trial scores incomparable between runs.
store_rng = np.random.default_rng(42)
random_stores = store_rng.choice(train_data['store_number'].unique(), shops_number, replace=False)

def objective(trial):
    """Optuna objective: average MAE of an XGBoost regressor over the sampled stores.

    For every store in ``random_stores`` and every item family present in
    ``train_data``, the matching rows are split with ``tscv`` and each fold is
    scored through ``get_mae``. The value returned to Optuna is the mean of
    all fold scores.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.

    Returns:
        float: mean of the per-fold MAE values collected across all stores,
        item families and folds.
    """
    parameters = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    }

    mae_scores = []
    # One estimator object is reused for all folds.
    # NOTE(review): presumably get_mae refits it on each call — confirm in store_sales.
    model = xgb.XGBRegressor(**parameters, random_state=42, n_jobs=-1)

    # Loop-invariant: compute the list of item families once per trial.
    item_families = train_data['item_family'].unique()

    for store_num in random_stores:
        store_rows = train_data[train_data['store_number'] == store_num]
        for item_family in item_families:
            # Local name chosen so it does not shadow the notebook-level `data` frame.
            series_rows = store_rows[store_rows['item_family'] == item_family]
            X = series_rows.drop(['item_sales'], axis=1)
            y = series_rows['item_sales']
            for train_index, test_index in tscv.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                # Copies are passed so the encoder cannot modify the fold slices.
                X_train_encoded, X_test_encoded = encode_features(X_train.copy(), X_test.copy())
                mae = get_mae(X_train_encoded, X_test_encoded, y_train, y_test, model)
                mae_scores.append(mae)

    # `mae_scores` is a plain list, which has no `.mean()` method; average with NumPy.
    return float(np.mean(mae_scores))

In [16]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts; the default sampler is unseeded.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))

# Each trial cross-validates many store/family series, so this cell is slow
# (the recorded log shows several minutes per trial).
study.optimize(objective, n_trials=100)

print(f"Best parameters: {study.best_params}")
print(f"Best MAE value: {study.best_value}")

[I 2024-08-03 18:43:57,480] A new study created in memory with name: no-name-698fdb33-fe86-4dbe-9d23-ff9e83b53760
[I 2024-08-03 18:47:49,257] Trial 0 finished with value: 118.22443636363636 and parameters: {'max_depth': 5, 'n_estimators': 349, 'learning_rate': 0.1380931973890562, 'subsample': 0.883647838911324, 'colsample_bytree': 0.5499718353309548, 'gamma': 0.18318479433838625, 'lambda': 0.004622773461775388, 'alpha': 4.0583835381135327e-07}. Best is trial 0 with value: 118.22443636363636.
[I 2024-08-03 18:51:12,079] Trial 1 finished with value: 124.04550303030304 and parameters: {'max_depth': 4, 'n_estimators': 335, 'learning_rate': 0.17519487940723574, 'subsample': 0.739049267948851, 'colsample_bytree': 0.8410650837279278, 'gamma': 0.22047785343612641, 'lambda': 1.1746653436559743e-05, 'alpha': 0.00010963972238816084}. Best is trial 0 with value: 118.22443636363636.
[I 2024-08-03 18:58:57,302] Trial 2 finished with value: 108.77469090909092 and parameters: {'max_depth': 8, 'n_estim

Best parameters: {'max_depth': 2, 'n_estimators': 259, 'learning_rate': 0.016911596898275656, 'subsample': 0.966503084641785, 'colsample_bytree': 0.7652975950602404, 'gamma': 0.3662258875815241, 'lambda': 2.254417098310434e-07, 'alpha': 0.5897781687350846}
Best WMAPE value: 93.22602424242424
