In [12]:
import pandas as pd
import numpy as np
from store_sales import encode_features, get_mae_score_cross_validation
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
import optuna

In [13]:
# Load the prepared feature table and preview its first rows.
prepared_data_path = '../data/prepared_data/prepared_data.csv'
data = pd.read_csv(filepath_or_buffer=prepared_data_path)
data.head(5)

Unnamed: 0,date,store_number,item_family,items_on_promotion,item_sales,city,state,store_type,store_cluster,oil_price,...,is_popular_cluster,is_non_popular_cluster,is_special_non_working_day,is_national_holiday,is_state_pichincha,is_state_manabi_or_pastaza,is_city_quito_or_cayambe,is_city_manta_or_puyo,is_store_type_A,number_of_days_since_earthquake
0,2013-01-01,1,AUTOMOTIVE,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
1,2013-01-01,1,BABY CARE,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
2,2013-01-01,1,BEAUTY,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
3,2013-01-01,1,BEVERAGES,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201
4,2013-01-01,1,BOOKS,0,0.0,Quito,Pichincha,D,13,93.14,...,0,0,0,1,1,0,1,0,0,-1201


In [14]:
# Print the row count, column names and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 28 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   date                             object 
 1   store_number                     int64  
 2   item_family                      object 
 3   items_on_promotion               int64  
 4   item_sales                       float64
 5   city                             object 
 6   state                            object 
 7   store_type                       object 
 8   store_cluster                    int64  
 9   oil_price                        float64
 10  day_type                         object 
 11  holiday_status                   object 
 12  holiday_location                 object 
 13  holiday_description              object 
 14  is_holiday_transferred           bool   
 15  mean_sales_prev_month            float64
 16  is_during_oil_prices_falling     int64  
 17  is_popul

#### We hold out the last 15 days of the dataset as the test period to predict

In [15]:
# Hold out the most recent calendar days as the test period; everything
# before the cut-off date is used for training.
number_of_days_to_predict = 15

# Parse the string dates a single time and reuse the result for both masks.
parsed_dates = pd.to_datetime(data['date'])

# Sort the distinct days explicitly so the cut-off does not depend on the
# row order of the CSV.
unique_days = parsed_dates.dropna().drop_duplicates().sort_values()
min_test_date = unique_days.iloc[-number_of_days_to_predict]

train_data = data[parsed_dates < min_test_date]
test_data = data[parsed_dates >= min_test_date]

In [26]:
# Cross-validated evaluation of XGBoost on a sample of stores. Every
# (store, item family) pair is scored as its own series with TimeSeriesSplit folds.
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters match the "Best parameters" printed by the Optuna study in this notebook.
xgboost = xgb.XGBRegressor(max_depth=2, n_estimators=67, learning_rate=0.010189438506896714,
                            subsample=0.6630346896799094, colsample_bytree=0.5569732879432812,
                            gamma=0.41992451365678046, reg_lambda=8.179747622279188e-05,
                            reg_alpha=3.410824751582965e-08, n_jobs=-1, random_state=42)

n = 5
# Seeded generator: the same stores are sampled on every run, so the reported
# scores are reproducible.
random_stores = np.random.default_rng(42).choice(train_data['store_number'].unique(), n, replace=False)
item_families = train_data['item_family'].unique()

# Collect per-fold scores in lists and convert once at the end instead of
# re-allocating an array with np.append on every fold.
fold_mae_scores = []
fold_wmape_scores = []

for store_num in random_stores:
    for item_family in item_families:
        # Use a dedicated name here: assigning to `data` would overwrite the full
        # DataFrame loaded at the top of the notebook and break re-running earlier cells.
        series_data = train_data[(train_data['store_number'] == store_num) & (train_data['item_family'] == item_family)]
        X = series_data.drop(['item_sales'], axis=1)
        y = series_data['item_sales']
        # Identical for every fold of this series, so it is computed once.
        mean_sales_this_series = y.mean()
        for train_index, test_index in tscv.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            X_train_encoded, X_test_encoded = encode_features(X_train.copy(), X_test.copy())
            mae = get_mae_score_cross_validation(X_train_encoded, X_test_encoded, y_train, y_test, xgboost)
            fold_mae_scores.append(mae)
            # Series without any sales have nothing to normalise by; they only
            # contribute to the MAE average.
            if mean_sales_this_series > 0:
                fold_wmape_scores.append(mae / mean_sales_this_series)

wmape_scores = np.array(fold_wmape_scores)
mae_scores = np.array(fold_mae_scores)

# NOTE(review): the value reported as WMAPE is the fold MAE divided by the mean sales of
# the WHOLE series, not of the fold's test window. A textbook WMAPE divides the summed
# absolute error by the summed actuals of the evaluated window - confirm which is intended.
print('Average WMAPE for XGBoost:', np.round(wmape_scores.mean() * 100, 2), '%')
print('Average MAE for XGBoost:', np.round(mae_scores.mean(), 2))

Average WMAPE for XGBoost: 66.18 %
Average MAE for XGBoost: 157.14


In [17]:
# Mean of item_sales over the whole training split
# (presumably shown to put the MAE reported above on a scale).
train_data['item_sales'].mean()

356.8107777940146

In [24]:
# Fold generator and store sample shared by every Optuna trial.
tscv = TimeSeriesSplit(n_splits=5)
# Fixed seed: every trial and every re-run of the notebook is scored on the same
# stores. `n` (number of sampled stores) is set in the evaluation cell above.
random_stores = np.random.default_rng(42).choice(train_data['store_number'].unique(), n, replace=False)

def objective(trial):
    """Optuna objective: average normalised MAE of XGBoost over the sampled stores.

    For every store in the module-level ``random_stores`` and every item family in
    ``train_data``, the series is split with the module-level ``tscv``. Each fold's
    MAE is divided by the mean sales of the whole series, and all fold ratios are
    averaged.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold MAE / mean-sales ratios (NaN if no series had
        positive mean sales).
    """
    parameters = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    }

    model = xgb.XGBRegressor(**parameters, random_state=42, n_jobs=-1)
    item_families = train_data['item_family'].unique()
    # A list avoids re-allocating an array with np.append on every fold.
    wmape_scores = []

    for store_num in random_stores:
        # Filter the large training frame once per store rather than once per
        # (store, family) pair; the resulting rows and their order are unchanged.
        store_data = train_data[train_data['store_number'] == store_num]
        for item_family in item_families:
            series_data = store_data[store_data['item_family'] == item_family]
            y = series_data['item_sales']
            mean_sales_this_series = y.mean()
            # Series with no sales (or no rows) never contribute a score, so skip
            # them before any model is trained.
            # NOTE(review): assumes get_mae_score_cross_validation refits the model
            # from scratch on each call, so skipping fits leaves other scores unchanged.
            if not mean_sales_this_series > 0:
                continue
            X = series_data.drop(['item_sales'], axis=1)
            for train_index, test_index in tscv.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                X_train_encoded, X_test_encoded = encode_features(X_train.copy(), X_test.copy())
                mae = get_mae_score_cross_validation(X_train_encoded, X_test_encoded, y_train, y_test, model)
                wmape_scores.append(mae / mean_sales_this_series)

    return np.mean(wmape_scores)

In [25]:
# TPE is Optuna's default sampler; passing it explicitly with a seed makes the
# sequence of suggested hyperparameters repeatable across runs.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

print(f"Best parameters: {study.best_params}")
print(f"Best WMAPE value: {study.best_value}")

[I 2024-08-01 15:33:47,730] A new study created in memory with name: no-name-ad814d51-aa03-4738-9393-df03e5d6f443
[I 2024-08-01 15:35:58,765] Trial 0 finished with value: 1.1598592870959716 and parameters: {'max_depth': 2, 'n_estimators': 260, 'learning_rate': 0.0712040861465968, 'subsample': 0.6064532513976995, 'colsample_bytree': 0.7022256588767026, 'gamma': 0.14940364062624645, 'lambda': 3.960163929449356e-06, 'alpha': 0.0011192533642486408}. Best is trial 0 with value: 1.1598592870959716.
[I 2024-08-01 15:39:22,830] Trial 1 finished with value: 1.2480245569625223 and parameters: {'max_depth': 4, 'n_estimators': 384, 'learning_rate': 0.269510519835885, 'subsample': 0.658311571048142, 'colsample_bytree': 0.5551483143750424, 'gamma': 0.4470740171708851, 'lambda': 5.264009962166573e-07, 'alpha': 4.071890764318068e-08}. Best is trial 0 with value: 1.1598592870959716.
[I 2024-08-01 15:43:11,653] Trial 2 finished with value: 1.3183349317939994 and parameters: {'max_depth': 4, 'n_estimator

Best parameters: {'max_depth': 2, 'n_estimators': 67, 'learning_rate': 0.010189438506896714, 'subsample': 0.6630346896799094, 'colsample_bytree': 0.5569732879432812, 'gamma': 0.41992451365678046, 'lambda': 8.179747622279188e-05, 'alpha': 3.410824751582965e-08}
Best WMAPE value: 0.7555954927427805
