In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
import mlflow
import mlflow.catboost
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from catboost import CatBoostRegressor
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
parent_dir = Path.cwd().parents[1]
sys.path.append(os.path.abspath(parent_dir))

In [4]:
from utils.DataPreprocessPipeline import DataPreprocessPipeline
preprocessor = DataPreprocessPipeline(num_lags=3, rolling_window_size=3)

In [5]:
train_df = pd.read_csv('../../../data/training_data.csv')

In [6]:
train_df.head()

Unnamed: 0,date_id,item_dept,item_qty,net_sales,store,item,invoice_num
0,11/1/2021,Grocery,1.0,160.0,XYZ,16620,1475459.0
1,11/1/2021,Grocery,2.0,480.0,XYZ,32365,1475459.0
2,11/1/2021,Grocery,1.0,127.0,XYZ,31349,1475459.0
3,11/1/2021,Household,2.0,110.0,XYZ,1266,1475475.0
4,11/1/2021,Household,1.0,150.0,XYZ,114920,1475475.0


In [7]:
train_df_preprocessed = preprocessor.transform(train_df)
train_df_preprocessed

Unnamed: 0,date_id,item_dept,store,item_qty,net_sales,lag_item_qty_1,lag_item_qty_2,lag_item_qty_3,rolling_mean_item_qty_3,rolling_std_item_qty_3,...,rolling_std_net_sales_3,cumsum_net_sales,cummean_net_sales,expanding_min_net_sales,expanding_max_net_sales,diff_net_sales,diff_net_sales_7,day_of_week,isWeekend,Is_Holiday
42,2021-11-08,Beverages,ABC,974.000,246984.68200,937.000,884.000,991.000,937.333333,53.500779,...,18717.981669,1.703852e+06,243407.457717,199145.596010,2.779048e+05,-23294.307000,57.000,0,0,0
43,2021-11-08,Beverages,XYZ,677.000,201386.08100,1043.000,1086.000,910.000,1013.000000,91.755109,...,36459.734951,1.675697e+06,239385.355431,187931.769020,2.899930e+05,27566.920000,-102.000,0,0,0
44,2021-11-08,Grocery,ABC,2326.638,438841.43801,2646.820,2614.006,2654.582,2638.469333,21.538411,...,34985.300857,3.473914e+06,496273.373853,397946.965024,5.822066e+05,-43148.157030,-33.761,0,0,0
45,2021-11-08,Grocery,XYZ,2879.414,498037.79100,3608.962,3625.864,3287.828,3507.551333,190.473559,...,52868.205452,4.010397e+06,572913.907431,491450.723000,6.616550e+05,38798.438006,-107.646,0,0,0
46,2021-11-08,Household,ABC,883.000,240294.89201,998.000,1152.000,986.000,1045.333333,92.570694,...,25552.215188,1.775416e+06,253630.794154,209100.513996,2.920205e+05,-50544.119984,-160.000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,2022-01-31,Beverages,XYZ,791.000,179122.36501,1048.000,1255.000,1336.000,1213.000000,148.522726,...,138435.956258,2.381819e+07,261738.336506,149946.090000,5.130570e+05,-97559.236000,22.000,0,0,0
548,2022-01-31,Grocery,ABC,2603.874,542643.93100,2869.215,2587.436,2358.758,2605.136333,255.688411,...,62659.538296,4.889516e+07,537309.461967,367440.183000,2.035445e+06,33708.435960,286.094,0,0,0
549,2022-01-31,Grocery,XYZ,2996.909,563043.73400,4040.207,3539.002,3162.792,3580.667000,440.188880,...,86452.689747,5.545075e+07,609348.883327,443513.481000,1.161787e+06,89212.836000,434.602,0,0,0
550,2022-01-31,Household,ABC,1115.000,314800.26999,1170.000,1236.000,1062.000,1156.000000,87.840765,...,29504.172160,2.512344e+07,276081.704621,181968.999000,4.518945e+05,-26337.569910,180.000,0,0,0


In [8]:
from utils.model_helpers import create_training_testing
train_dict, valid_dict = create_training_testing(train_df_preprocessed, test_date_start='2022-01-15')

In [9]:
train_X = train_dict['train_features']
train_y_sales = train_dict['train_net_sales']
train_y_item_qty = train_dict['train_item_qty']

val_X = valid_dict['train_features']
val_y_sales = valid_dict['train_net_sales']
val_y_item_qty = valid_dict['train_item_qty']

In [10]:
from utils.SalesItemQtyModel import SalesItemQtyModel

In [11]:
def objective(params):
    model_sales = CatBoostRegressor(**params, verbose = 0)
    model_item_qty = CatBoostRegressor(**params, verbose = 0)

    sales_item_qty_model = SalesItemQtyModel(model_sales = model_sales, model_item_qty=model_item_qty)
    sales_item_qty_model.fit(train_X, train_y_sales, train_y_item_qty)

    score_val_sales = sales_item_qty_model.score_sales(val_X, val_y_sales)
    score_val_item_qty = sales_item_qty_model.score_item_qty(val_X, val_y_item_qty)

    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metric("val_sales_MAPE", score_val_sales)
        mlflow.log_metric("val_item_qty_MAPE", score_val_item_qty)
        mlflow.catboost.log_model(model_sales, "model_sales")
        mlflow.catboost.log_model(model_item_qty, "model_item_qty")

    # Objective is to minimize the average of the two MSEs
    return {'loss': (score_val_sales + score_val_item_qty) / 2, 'status': STATUS_OK}

In [12]:
# Define the search space for Hyperopt
search_space = {
    'depth': hp.quniform('depth', 4, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 50)),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1e-5, 100),
    'border_count': scope.int(hp.quniform('border_count', 32, 255, 1)),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'random_strength': hp.uniform('random_strength', 1e-9, 10),
    'one_hot_max_size': scope.int(hp.quniform('one_hot_max_size', 2, 255, 1)),
}

In [13]:
# Create a Trials object to store the results of the optimization process
trials = Trials()

# Start an MLflow experiment
with mlflow.start_run(run_name="catboost_hyperopt_tuning"):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=100,
        trials=trials
    )

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]





  1%|          | 1/100 [00:07<12:57,  7.85s/trial, best loss: 0.1542612484199712]





  2%|▏         | 2/100 [00:13<10:45,  6.59s/trial, best loss: 0.13849002957702072]





  3%|▎         | 3/100 [00:21<11:39,  7.21s/trial, best loss: 0.13849002957702072]





  4%|▍         | 4/100 [00:28<11:06,  6.94s/trial, best loss: 0.13849002957702072]





  5%|▌         | 5/100 [00:34<10:49,  6.83s/trial, best loss: 0.13849002957702072]





  6%|▌         | 6/100 [00:40<10:19,  6.59s/trial, best loss: 0.13849002957702072]





  7%|▋         | 7/100 [00:48<10:44,  6.93s/trial, best loss: 0.13849002957702072]





  8%|▊         | 8/100 [00:54<10:12,  6.66s/trial, best loss: 0.13849002957702072]





  9%|▉         | 9/100 [01:00<09:56,  6.55s/trial, best loss: 0.13849002957702072]





 10%|█         | 10/100 [01:07<09:40,  6.46s/trial, best loss: 0.13436599804807328]





 11%|█         | 11/100 [01:14<10:09,  6.84s/trial, best loss: 0.13436599804807328]





 12%|█▏        | 12/100 [01:20<09:43,  6.63s/trial, best loss: 0.13436599804807328]





 13%|█▎        | 13/100 [01:26<09:06,  6.29s/trial, best loss: 0.13436599804807328]





 14%|█▍        | 14/100 [01:32<08:47,  6.13s/trial, best loss: 0.13436599804807328]





 15%|█▌        | 15/100 [01:37<08:21,  5.90s/trial, best loss: 0.13436599804807328]





 16%|█▌        | 16/100 [01:42<07:59,  5.71s/trial, best loss: 0.13436599804807328]





 17%|█▋        | 17/100 [01:48<07:58,  5.76s/trial, best loss: 0.13121318106733126]





 18%|█▊        | 18/100 [01:56<08:41,  6.35s/trial, best loss: 0.13121318106733126]





 19%|█▉        | 19/100 [02:06<10:10,  7.53s/trial, best loss: 0.13121318106733126]





 20%|██        | 20/100 [02:13<09:53,  7.42s/trial, best loss: 0.13121318106733126]





 21%|██        | 21/100 [02:19<09:08,  6.95s/trial, best loss: 0.13121318106733126]





 22%|██▏       | 22/100 [02:25<08:37,  6.63s/trial, best loss: 0.13121318106733126]





 23%|██▎       | 23/100 [02:31<08:19,  6.48s/trial, best loss: 0.13121318106733126]





 24%|██▍       | 24/100 [02:37<07:56,  6.27s/trial, best loss: 0.13121318106733126]





 25%|██▌       | 25/100 [02:44<07:56,  6.36s/trial, best loss: 0.13121318106733126]





 26%|██▌       | 26/100 [02:50<07:46,  6.31s/trial, best loss: 0.13121318106733126]





 27%|██▋       | 27/100 [02:56<07:32,  6.20s/trial, best loss: 0.12747126752267926]





 28%|██▊       | 28/100 [03:04<08:02,  6.70s/trial, best loss: 0.12747126752267926]





 29%|██▉       | 29/100 [03:09<07:38,  6.45s/trial, best loss: 0.12747126752267926]





 30%|███       | 30/100 [03:16<07:32,  6.47s/trial, best loss: 0.12747126752267926]





 31%|███       | 31/100 [03:22<07:20,  6.39s/trial, best loss: 0.12745887126595506]





 32%|███▏      | 32/100 [03:30<07:46,  6.86s/trial, best loss: 0.12745887126595506]





 33%|███▎      | 33/100 [03:37<07:42,  6.91s/trial, best loss: 0.12745887126595506]





 34%|███▍      | 34/100 [03:46<08:09,  7.41s/trial, best loss: 0.12745887126595506]





 35%|███▌      | 35/100 [03:51<07:27,  6.88s/trial, best loss: 0.12745887126595506]





 36%|███▌      | 36/100 [03:58<07:15,  6.80s/trial, best loss: 0.12745887126595506]





 37%|███▋      | 37/100 [04:04<06:43,  6.41s/trial, best loss: 0.12745887126595506]





 38%|███▊      | 38/100 [04:09<06:23,  6.18s/trial, best loss: 0.12745887126595506]





 39%|███▉      | 39/100 [04:19<07:17,  7.18s/trial, best loss: 0.12745887126595506]





 40%|████      | 40/100 [04:24<06:45,  6.75s/trial, best loss: 0.1169679988465582] 





 41%|████      | 41/100 [04:30<06:16,  6.38s/trial, best loss: 0.1169679988465582]





 42%|████▏     | 42/100 [04:35<05:50,  6.05s/trial, best loss: 0.1169679988465582]





 43%|████▎     | 43/100 [04:41<05:42,  6.01s/trial, best loss: 0.1169679988465582]





 44%|████▍     | 44/100 [04:47<05:35,  5.99s/trial, best loss: 0.1169679988465582]





 45%|████▌     | 45/100 [04:53<05:35,  6.10s/trial, best loss: 0.1169679988465582]





 46%|████▌     | 46/100 [04:59<05:21,  5.96s/trial, best loss: 0.1169679988465582]





 47%|████▋     | 47/100 [05:05<05:12,  5.89s/trial, best loss: 0.1169679988465582]





 48%|████▊     | 48/100 [05:11<05:04,  5.85s/trial, best loss: 0.1169679988465582]





 49%|████▉     | 49/100 [05:16<04:55,  5.80s/trial, best loss: 0.1169679988465582]





 50%|█████     | 50/100 [05:23<05:08,  6.16s/trial, best loss: 0.1169679988465582]





 51%|█████     | 51/100 [05:29<04:54,  6.01s/trial, best loss: 0.1169679988465582]





 52%|█████▏    | 52/100 [05:35<04:42,  5.89s/trial, best loss: 0.1169679988465582]





 53%|█████▎    | 53/100 [05:41<04:38,  5.92s/trial, best loss: 0.1169679988465582]





 54%|█████▍    | 54/100 [05:46<04:20,  5.67s/trial, best loss: 0.1169679988465582]





 55%|█████▌    | 55/100 [05:51<04:13,  5.63s/trial, best loss: 0.1169679988465582]





 56%|█████▌    | 56/100 [05:57<04:14,  5.79s/trial, best loss: 0.1169679988465582]





 57%|█████▋    | 57/100 [06:04<04:19,  6.05s/trial, best loss: 0.1169679988465582]





 58%|█████▊    | 58/100 [06:11<04:23,  6.27s/trial, best loss: 0.1169679988465582]





 59%|█████▉    | 59/100 [06:17<04:22,  6.41s/trial, best loss: 0.1169679988465582]





 60%|██████    | 60/100 [06:24<04:19,  6.48s/trial, best loss: 0.1169679988465582]





 61%|██████    | 61/100 [06:30<04:03,  6.23s/trial, best loss: 0.1169679988465582]





 62%|██████▏   | 62/100 [06:36<03:51,  6.10s/trial, best loss: 0.1169679988465582]





 63%|██████▎   | 63/100 [06:42<03:46,  6.13s/trial, best loss: 0.1169679988465582]





 64%|██████▍   | 64/100 [06:48<03:37,  6.05s/trial, best loss: 0.1169679988465582]





 65%|██████▌   | 65/100 [06:53<03:29,  6.00s/trial, best loss: 0.1169679988465582]





 66%|██████▌   | 66/100 [07:00<03:26,  6.08s/trial, best loss: 0.1169679988465582]





 67%|██████▋   | 67/100 [07:06<03:18,  6.01s/trial, best loss: 0.1169679988465582]





 68%|██████▊   | 68/100 [07:12<03:12,  6.03s/trial, best loss: 0.1169679988465582]





 69%|██████▉   | 69/100 [07:19<03:14,  6.28s/trial, best loss: 0.1169679988465582]





 70%|███████   | 70/100 [07:25<03:11,  6.38s/trial, best loss: 0.1169679988465582]





 71%|███████   | 71/100 [07:31<03:01,  6.27s/trial, best loss: 0.1169679988465582]





 72%|███████▏  | 72/100 [07:38<03:00,  6.46s/trial, best loss: 0.1169679988465582]





 73%|███████▎  | 73/100 [07:45<02:55,  6.49s/trial, best loss: 0.1169679988465582]





 74%|███████▍  | 74/100 [07:51<02:44,  6.34s/trial, best loss: 0.1169679988465582]





 75%|███████▌  | 75/100 [07:57<02:36,  6.24s/trial, best loss: 0.1169679988465582]





 76%|███████▌  | 76/100 [08:03<02:29,  6.21s/trial, best loss: 0.1169679988465582]





 77%|███████▋  | 77/100 [08:08<02:18,  6.03s/trial, best loss: 0.1169679988465582]





 78%|███████▊  | 78/100 [08:15<02:14,  6.12s/trial, best loss: 0.1169679988465582]





 79%|███████▉  | 79/100 [08:20<02:04,  5.92s/trial, best loss: 0.1169679988465582]





 80%|████████  | 80/100 [08:26<01:56,  5.85s/trial, best loss: 0.1169679988465582]





 81%|████████  | 81/100 [08:31<01:48,  5.72s/trial, best loss: 0.1169679988465582]





 82%|████████▏ | 82/100 [08:37<01:42,  5.67s/trial, best loss: 0.1169679988465582]





 83%|████████▎ | 83/100 [08:43<01:39,  5.83s/trial, best loss: 0.1169679988465582]





 84%|████████▍ | 84/100 [09:00<02:25,  9.10s/trial, best loss: 0.1169679988465582]





 85%|████████▌ | 85/100 [09:09<02:17,  9.17s/trial, best loss: 0.1169679988465582]





 86%|████████▌ | 86/100 [09:15<01:55,  8.27s/trial, best loss: 0.1169679988465582]





 87%|████████▋ | 87/100 [09:21<01:36,  7.41s/trial, best loss: 0.1169679988465582]





 88%|████████▊ | 88/100 [09:26<01:20,  6.69s/trial, best loss: 0.1169679988465582]





 89%|████████▉ | 89/100 [09:34<01:17,  7.03s/trial, best loss: 0.1169679988465582]





 90%|█████████ | 90/100 [09:39<01:05,  6.54s/trial, best loss: 0.1169679988465582]





 91%|█████████ | 91/100 [09:47<01:03,  7.01s/trial, best loss: 0.1169679988465582]





 92%|█████████▏| 92/100 [09:53<00:53,  6.63s/trial, best loss: 0.1169679988465582]





 93%|█████████▎| 93/100 [09:58<00:43,  6.27s/trial, best loss: 0.1169679988465582]





 94%|█████████▍| 94/100 [10:05<00:38,  6.37s/trial, best loss: 0.1169679988465582]





 95%|█████████▌| 95/100 [10:11<00:31,  6.22s/trial, best loss: 0.1169679988465582]





 96%|█████████▌| 96/100 [10:17<00:24,  6.18s/trial, best loss: 0.1169679988465582]





 97%|█████████▋| 97/100 [10:23<00:18,  6.17s/trial, best loss: 0.1169679988465582]





 98%|█████████▊| 98/100 [10:30<00:12,  6.37s/trial, best loss: 0.1169679988465582]





 99%|█████████▉| 99/100 [10:38<00:06,  6.97s/trial, best loss: 0.1169679988465582]





100%|██████████| 100/100 [10:44<00:00,  6.45s/trial, best loss: 0.1169679988465582]


In [14]:
mlflow.end_run()

In [14]:
print("Best hyperparameters found:", best_params)


Best hyperparameters found: {'bagging_temperature': 0.4772142988105587, 'border_count': 200.0, 'depth': 5.0, 'iterations': 450.0, 'l2_leaf_reg': 7.43439511907274, 'learning_rate': 0.22947940228640154, 'one_hot_max_size': 124.0, 'random_strength': 5.316275052028734}


In [16]:
trials.best_trial

{'state': 2,
 'tid': 39,
 'spec': None,
 'result': {'loss': 0.1169679988465582, 'status': 'ok'},
 'misc': {'tid': 39,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'bagging_temperature': [39],
   'border_count': [39],
   'depth': [39],
   'iterations': [39],
   'l2_leaf_reg': [39],
   'learning_rate': [39],
   'one_hot_max_size': [39],
   'random_strength': [39]},
  'vals': {'bagging_temperature': [0.4772142988105587],
   'border_count': [200.0],
   'depth': [5.0],
   'iterations': [450.0],
   'l2_leaf_reg': [7.43439511907274],
   'learning_rate': [0.22947940228640154],
   'one_hot_max_size': [124.0],
   'random_strength': [5.316275052028734]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2024, 8, 28, 13, 55, 4, 123000),
 'refresh_time': datetime.datetime(2024, 8, 28, 13, 55, 9, 857000)}

In [17]:
CatBoostRegressor(**best_params, verbose = 0)

<catboost.core.CatBoostRegressor at 0x7fc9e5cb19f0>