In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
import mlflow
import mlflow.catboost
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from catboost import CatBoostRegressor
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
parent_dir = Path.cwd().parents[1]
sys.path.append(os.path.abspath(parent_dir))

In [4]:
from utils.DataPreprocessPipeline import DataPreprocessPipeline
preprocessor = DataPreprocessPipeline(num_lags=3, rolling_window_size=3)

In [5]:
train_df = pd.read_csv('../../../data/training_data.csv')

In [6]:
train_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0


In [7]:
train_df_preprocessed = preprocessor.transform(train_df)
train_df_preprocessed

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,rolling_mean_item_qty_3,rolling_std_item_qty_3,...,expanding_min_net_sales,expanding_max_net_sales,diff_net_sales,diff_net_sales_7,day_of_week,isWeekend,Is_Holiday,store_XYZ,item_dept_Grocery,item_dept_Household
42,2021-11-08,Beverages,ABC,974.000,246984.68200,937.000,884.000,991.000,937.333333,53.500779,...,199145.596010,2.779048e+05,-23294.307000,57.000,0,0,0,0.0,0.0,0.0
43,2021-11-08,Beverages,XYZ,677.000,201386.08100,1043.000,1086.000,910.000,1013.000000,91.755109,...,187931.769020,2.899930e+05,27566.920000,-102.000,0,0,0,1.0,0.0,0.0
44,2021-11-08,Grocery,ABC,2326.638,438841.43801,2646.820,2614.006,2654.582,2638.469333,21.538411,...,397946.965024,5.822066e+05,-43148.157030,-33.761,0,0,0,0.0,1.0,0.0
45,2021-11-08,Grocery,XYZ,2879.414,498037.79100,3608.962,3625.864,3287.828,3507.551333,190.473559,...,491450.723000,6.616550e+05,38798.438006,-107.646,0,0,0,1.0,1.0,0.0
46,2021-11-08,Household,ABC,883.000,240294.89201,998.000,1152.000,986.000,1045.333333,92.570694,...,209100.513996,2.920205e+05,-50544.119984,-160.000,0,0,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2022-01-31,Beverages,XYZ,791.000,179122.36501,1048.000,1255.000,1336.000,1213.000000,148.522726,...,149946.090000,5.130570e+05,-97559.236000,22.000,0,0,0,1.0,0.0,0.0
548,2022-01-31,Grocery,ABC,2603.874,542643.93100,2869.215,2587.436,2358.758,2605.136333,255.688411,...,367440.183000,2.035445e+06,33708.435960,286.094,0,0,0,0.0,1.0,0.0
549,2022-01-31,Grocery,XYZ,2996.909,563043.73400,4040.207,3539.002,3162.792,3580.667000,440.188880,...,443513.481000,1.161787e+06,89212.836000,434.602,0,0,0,1.0,1.0,0.0
550,2022-01-31,Household,ABC,1115.000,314800.26999,1170.000,1236.000,1062.000,1156.000000,87.840765,...,181968.999000,4.518945e+05,-26337.569910,180.000,0,0,0,0.0,0.0,1.0


In [8]:
from utils.model_helpers import create_training_testing
train_dict, valid_dict = create_training_testing(train_df_preprocessed, test_date_start='2022-01-15')

In [9]:
train_X = train_dict['train_features']
train_y_sales = train_dict['train_net_sales']
train_y_item_qty = train_dict['train_item_qty']

val_X = valid_dict['train_features']
val_y_sales = valid_dict['train_net_sales']
val_y_item_qty = valid_dict['train_item_qty']

In [10]:
from utils.SalesItemQtyModel import SalesItemQtyModel

In [11]:
def objective(params):
    model_sales = CatBoostRegressor(**params, verbose = 0)
    model_item_qty = CatBoostRegressor(**params, verbose = 0)

    sales_item_qty_model = SalesItemQtyModel(model_sales = model_sales, model_item_qty=model_item_qty)
    sales_item_qty_model.fit(train_X, train_y_sales, train_y_item_qty)

    score_val_sales = sales_item_qty_model.score_sales(val_X, val_y_sales)
    score_val_item_qty = sales_item_qty_model.score_item_qty(val_X, val_y_item_qty)

    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metric("val_sales_MAPE", score_val_sales)
        mlflow.log_metric("val_item_qty_MAPE", score_val_item_qty)
        mlflow.catboost.log_model(model_sales, "model_sales")
        mlflow.catboost.log_model(model_item_qty, "model_item_qty")

    # Objective is to minimize the average of the two MSEs
    return {'loss': (score_val_sales + score_val_item_qty) / 2, 'status': STATUS_OK}

In [12]:
# Define the search space for Hyperopt
search_space = {
    'depth': hp.quniform('depth', 4, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 50)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1e-5, 100),
    'border_count': scope.int(hp.quniform('border_count', 32, 255, 1)),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'random_strength': hp.uniform('random_strength', 1e-9, 10),
    'one_hot_max_size': scope.int(hp.quniform('one_hot_max_size', 2, 255, 1)),
}

In [13]:
# Create a Trials object to store the results of the optimization process
trials = Trials()

# Start an MLflow experiment
with mlflow.start_run(run_name="catboost_hyperopt_tuning"):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=100,
        trials=trials
    )

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]





  1%|          | 1/100 [00:11<18:39, 11.31s/trial, best loss: 0.1509964447667711]





  2%|▏         | 2/100 [00:18<14:03,  8.61s/trial, best loss: 0.1509964447667711]





  3%|▎         | 3/100 [00:24<12:34,  7.77s/trial, best loss: 0.14665684322659833]





  4%|▍         | 4/100 [00:31<12:02,  7.53s/trial, best loss: 0.1368103090361735] 





  5%|▌         | 5/100 [00:38<11:06,  7.02s/trial, best loss: 0.13498364655074835]





  6%|▌         | 6/100 [00:47<12:10,  7.77s/trial, best loss: 0.13498364655074835]





  7%|▋         | 7/100 [00:53<11:16,  7.28s/trial, best loss: 0.13498364655074835]





  8%|▊         | 8/100 [01:26<23:30, 15.33s/trial, best loss: 0.13498364655074835]





  9%|▉         | 9/100 [01:32<18:46, 12.37s/trial, best loss: 0.13498364655074835]





 10%|█         | 10/100 [01:39<16:04, 10.72s/trial, best loss: 0.13498364655074835]





 11%|█         | 11/100 [01:54<17:52, 12.05s/trial, best loss: 0.13498364655074835]





 12%|█▏        | 12/100 [02:16<22:22, 15.26s/trial, best loss: 0.13498364655074835]





 13%|█▎        | 13/100 [02:23<18:35, 12.82s/trial, best loss: 0.12704882105741988]





 14%|█▍        | 14/100 [02:30<15:31, 10.83s/trial, best loss: 0.12704882105741988]





 15%|█▌        | 15/100 [02:35<13:11,  9.31s/trial, best loss: 0.11916991020592088]





 16%|█▌        | 16/100 [02:46<13:45,  9.83s/trial, best loss: 0.11916991020592088]





 17%|█▋        | 17/100 [02:52<11:55,  8.62s/trial, best loss: 0.11916991020592088]





 18%|█▊        | 18/100 [02:58<10:32,  7.72s/trial, best loss: 0.11916991020592088]





 19%|█▉        | 19/100 [03:03<09:32,  7.06s/trial, best loss: 0.11916991020592088]





 20%|██        | 20/100 [03:10<09:07,  6.84s/trial, best loss: 0.11916991020592088]





 21%|██        | 21/100 [03:22<11:20,  8.62s/trial, best loss: 0.11916991020592088]





 22%|██▏       | 22/100 [03:30<10:49,  8.33s/trial, best loss: 0.11916991020592088]





 23%|██▎       | 23/100 [03:36<09:37,  7.50s/trial, best loss: 0.11916991020592088]





 24%|██▍       | 24/100 [03:42<09:00,  7.11s/trial, best loss: 0.11916991020592088]





 25%|██▌       | 25/100 [03:47<08:06,  6.49s/trial, best loss: 0.11916991020592088]





 26%|██▌       | 26/100 [03:53<07:49,  6.34s/trial, best loss: 0.11916991020592088]





 27%|██▋       | 27/100 [04:01<08:21,  6.87s/trial, best loss: 0.11916991020592088]





 28%|██▊       | 28/100 [04:07<07:53,  6.58s/trial, best loss: 0.11916991020592088]





 29%|██▉       | 29/100 [04:13<07:31,  6.36s/trial, best loss: 0.11916991020592088]





 30%|███       | 30/100 [04:19<07:13,  6.19s/trial, best loss: 0.11916991020592088]





 31%|███       | 31/100 [04:25<07:07,  6.20s/trial, best loss: 0.11916991020592088]





 32%|███▏      | 32/100 [04:31<06:55,  6.11s/trial, best loss: 0.11916991020592088]





 33%|███▎      | 33/100 [04:37<06:52,  6.16s/trial, best loss: 0.11916991020592088]





 34%|███▍      | 34/100 [04:43<06:43,  6.11s/trial, best loss: 0.11916991020592088]





 35%|███▌      | 35/100 [04:49<06:36,  6.11s/trial, best loss: 0.11478333007811639]





 36%|███▌      | 36/100 [04:55<06:25,  6.03s/trial, best loss: 0.11478333007811639]





 37%|███▋      | 37/100 [05:01<06:16,  5.98s/trial, best loss: 0.11478333007811639]





 38%|███▊      | 38/100 [05:06<06:03,  5.87s/trial, best loss: 0.11478333007811639]





 39%|███▉      | 39/100 [05:13<06:04,  5.98s/trial, best loss: 0.11478333007811639]





 40%|████      | 40/100 [05:18<05:53,  5.90s/trial, best loss: 0.11478333007811639]





 41%|████      | 41/100 [05:24<05:47,  5.88s/trial, best loss: 0.11478333007811639]





 42%|████▏     | 42/100 [05:30<05:32,  5.74s/trial, best loss: 0.11478333007811639]





 43%|████▎     | 43/100 [05:36<05:43,  6.02s/trial, best loss: 0.11478333007811639]





 44%|████▍     | 44/100 [05:43<05:46,  6.20s/trial, best loss: 0.11478333007811639]





 45%|████▌     | 45/100 [05:48<05:26,  5.93s/trial, best loss: 0.11478333007811639]





 46%|████▌     | 46/100 [05:53<05:08,  5.72s/trial, best loss: 0.11478333007811639]





 47%|████▋     | 47/100 [05:59<05:01,  5.69s/trial, best loss: 0.11478333007811639]





 48%|████▊     | 48/100 [06:06<05:09,  5.95s/trial, best loss: 0.11478333007811639]





 49%|████▉     | 49/100 [06:17<06:24,  7.53s/trial, best loss: 0.11478333007811639]





 50%|█████     | 50/100 [06:24<06:14,  7.49s/trial, best loss: 0.11478333007811639]





 51%|█████     | 51/100 [06:32<06:09,  7.54s/trial, best loss: 0.11478333007811639]





 52%|█████▏    | 52/100 [06:37<05:33,  6.96s/trial, best loss: 0.11478333007811639]





 53%|█████▎    | 53/100 [06:45<05:41,  7.26s/trial, best loss: 0.11478333007811639]





 54%|█████▍    | 54/100 [06:53<05:41,  7.41s/trial, best loss: 0.11478333007811639]





 55%|█████▌    | 55/100 [07:03<06:08,  8.18s/trial, best loss: 0.11478333007811639]





 56%|█████▌    | 56/100 [07:09<05:24,  7.38s/trial, best loss: 0.11478333007811639]





 57%|█████▋    | 57/100 [07:15<05:04,  7.08s/trial, best loss: 0.11478333007811639]





 58%|█████▊    | 58/100 [07:21<04:46,  6.83s/trial, best loss: 0.11478333007811639]





 59%|█████▉    | 59/100 [07:29<04:50,  7.09s/trial, best loss: 0.11478333007811639]





 60%|██████    | 60/100 [07:36<04:41,  7.04s/trial, best loss: 0.11478333007811639]





 61%|██████    | 61/100 [07:54<06:40, 10.28s/trial, best loss: 0.11478333007811639]





 62%|██████▏   | 62/100 [08:00<05:45,  9.10s/trial, best loss: 0.11478333007811639]





 63%|██████▎   | 63/100 [08:05<04:54,  7.96s/trial, best loss: 0.11478333007811639]





 64%|██████▍   | 64/100 [08:11<04:21,  7.26s/trial, best loss: 0.11478333007811639]





 65%|██████▌   | 65/100 [08:30<06:21, 10.91s/trial, best loss: 0.11478333007811639]





 66%|██████▌   | 66/100 [08:36<05:15,  9.27s/trial, best loss: 0.11478333007811639]





 67%|██████▋   | 67/100 [08:41<04:25,  8.06s/trial, best loss: 0.11478333007811639]





 68%|██████▊   | 68/100 [08:46<03:51,  7.23s/trial, best loss: 0.11478333007811639]





 69%|██████▉   | 69/100 [08:52<03:26,  6.66s/trial, best loss: 0.11478333007811639]





 70%|███████   | 70/100 [08:58<03:16,  6.55s/trial, best loss: 0.11478333007811639]





 71%|███████   | 71/100 [09:04<03:01,  6.27s/trial, best loss: 0.11478333007811639]





 72%|███████▏  | 72/100 [09:09<02:46,  5.96s/trial, best loss: 0.11478333007811639]





 73%|███████▎  | 73/100 [09:14<02:34,  5.73s/trial, best loss: 0.11478333007811639]





 74%|███████▍  | 74/100 [09:20<02:32,  5.86s/trial, best loss: 0.11478333007811639]





 75%|███████▌  | 75/100 [09:25<02:21,  5.67s/trial, best loss: 0.11478333007811639]





 76%|███████▌  | 76/100 [09:32<02:20,  5.86s/trial, best loss: 0.11478333007811639]





 77%|███████▋  | 77/100 [09:37<02:10,  5.69s/trial, best loss: 0.11478333007811639]





 78%|███████▊  | 78/100 [09:42<02:03,  5.60s/trial, best loss: 0.11478333007811639]





 79%|███████▉  | 79/100 [09:48<01:58,  5.65s/trial, best loss: 0.11478333007811639]





 80%|████████  | 80/100 [09:54<01:54,  5.71s/trial, best loss: 0.11478333007811639]





 81%|████████  | 81/100 [10:01<01:53,  5.98s/trial, best loss: 0.11478333007811639]





 82%|████████▏ | 82/100 [10:09<01:59,  6.63s/trial, best loss: 0.11478333007811639]





 83%|████████▎ | 83/100 [10:17<02:01,  7.12s/trial, best loss: 0.11478333007811639]





 84%|████████▍ | 84/100 [10:24<01:54,  7.17s/trial, best loss: 0.11478333007811639]





 85%|████████▌ | 85/100 [10:34<01:57,  7.82s/trial, best loss: 0.11478333007811639]





 86%|████████▌ | 86/100 [10:40<01:42,  7.35s/trial, best loss: 0.11478333007811639]





 87%|████████▋ | 87/100 [10:47<01:32,  7.15s/trial, best loss: 0.11478333007811639]





 88%|████████▊ | 88/100 [10:53<01:24,  7.01s/trial, best loss: 0.11478333007811639]





 89%|████████▉ | 89/100 [11:00<01:16,  6.98s/trial, best loss: 0.11478333007811639]





 90%|█████████ | 90/100 [11:06<01:07,  6.72s/trial, best loss: 0.11478333007811639]





 91%|█████████ | 91/100 [11:15<01:04,  7.17s/trial, best loss: 0.11478333007811639]





 92%|█████████▏| 92/100 [11:20<00:53,  6.68s/trial, best loss: 0.11478333007811639]





 93%|█████████▎| 93/100 [11:26<00:45,  6.57s/trial, best loss: 0.11478333007811639]





 94%|█████████▍| 94/100 [11:41<00:54,  9.01s/trial, best loss: 0.11478333007811639]





 95%|█████████▌| 95/100 [11:48<00:41,  8.31s/trial, best loss: 0.11478333007811639]





 96%|█████████▌| 96/100 [11:55<00:32,  8.07s/trial, best loss: 0.11478333007811639]





 97%|█████████▋| 97/100 [12:03<00:24,  8.06s/trial, best loss: 0.11478333007811639]





 98%|█████████▊| 98/100 [12:10<00:15,  7.67s/trial, best loss: 0.11478333007811639]





 99%|█████████▉| 99/100 [12:16<00:07,  7.06s/trial, best loss: 0.11478333007811639]





100%|██████████| 100/100 [12:22<00:00,  7.42s/trial, best loss: 0.11478333007811639]


In [14]:
mlflow.end_run()

In [15]:
print("Best hyperparameters found:", best_params)


Best hyperparameters found: {'bagging_temperature': 0.5447824858171504, 'border_count': 220.0, 'depth': 5.0, 'iterations': 800.0, 'l2_leaf_reg': 13.06135266464145, 'learning_rate': 0.15538685598864, 'one_hot_max_size': 34.0, 'random_strength': 4.115171860221108}


In [16]:
best_params

{'bagging_temperature': 0.5447824858171504,
 'border_count': 220.0,
 'depth': 5.0,
 'iterations': 800.0,
 'l2_leaf_reg': 13.06135266464145,
 'learning_rate': 0.15538685598864,
 'one_hot_max_size': 34.0,
 'random_strength': 4.115171860221108}

In [16]:
trials.best_trial

{'state': 2,
 'tid': 39,
 'spec': None,
 'result': {'loss': 0.1169679988465582, 'status': 'ok'},
 'misc': {'tid': 39,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'bagging_temperature': [39],
   'border_count': [39],
   'depth': [39],
   'iterations': [39],
   'l2_leaf_reg': [39],
   'learning_rate': [39],
   'one_hot_max_size': [39],
   'random_strength': [39]},
  'vals': {'bagging_temperature': [0.4772142988105587],
   'border_count': [200.0],
   'depth': [5.0],
   'iterations': [450.0],
   'l2_leaf_reg': [7.43439511907274],
   'learning_rate': [0.22947940228640154],
   'one_hot_max_size': [124.0],
   'random_strength': [5.316275052028734]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2024, 8, 28, 13, 55, 4, 123000),
 'refresh_time': datetime.datetime(2024, 8, 28, 13, 55, 9, 857000)}

In [17]:
CatBoostRegressor(**best_params, verbose = 0)

<catboost.core.CatBoostRegressor at 0x7fc9e5cb19f0>