In [38]:
import pandas as pd
import numpy as np
import logging
import json

from typing import Dict, Tuple
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlforecast import MLForecast
from window_ops.rolling import rolling_mean, rolling_max, rolling_min

In [39]:
catalog.list()


[1m[[0m
    [32m'stores'[0m,
    [32m'calendar'[0m,
    [32m'sales'[0m,
    [32m'preprocessed_stores'[0m,
    [32m'preprocessed_sales'[0m,
    [32m'store_sales_weekly'[0m,
    [32m'train_data'[0m,
    [32m'test_data'[0m,
    [32m'future_data'[0m,
    [32m'forecast_model'[0m,
    [32m'metrics'[0m,
    [32m'predictions'[0m,
    [32m'parameters'[0m,
    [32m'params:model_options'[0m,
    [32m'params:model_options.split_date'[0m,
    [32m'params:model_options.random_state'[0m,
    [32m'params:model_options.n_estimators'[0m,
    [32m'params:model_options.num_threads'[0m,
    [32m'params:model_options.freqency'[0m,
    [32m'params:model_options.horizon'[0m,
    [32m'params:model_options.lags'[0m,
    [32m'params:model_options.date_features'[0m,
    [32m'params:model_options.static_features'[0m,
    [32m'params:model_options.exogenous_features'[0m
[1m][0m

In [40]:
parameters=catalog.load('params:model_options')
parameters


[1m{[0m
    [32m'split_date'[0m: [32m'2009-05-03'[0m,
    [32m'random_state'[0m: [1;36m42[0m,
    [32m'n_estimators'[0m: [1;36m100[0m,
    [32m'num_threads'[0m: [1;36m6[0m,
    [32m'freqency'[0m: [32m'W'[0m,
    [32m'horizon'[0m: [1;36m4[0m,
    [32m'lags'[0m: [1m[[0m[1;36m1[0m, [1;36m2[0m, [1;36m4[0m[1m][0m,
    [32m'date_features'[0m: [1m[[0m[32m'week'[0m, [32m'month'[0m[1m][0m,
    [32m'static_features'[0m: [1m[[0m[32m'Size'[0m, [32m'Type_A'[0m, [32m'Type_B'[0m[1m][0m,
    [32m'exogenous_features'[0m: [1m[[0m[32m'IsHoliday'[0m, [32m'Temperature'[0m, [32m'Fuel_Price'[0m, [32m'Unemployment'[0m, [32m'CPI'[0m[1m][0m
[1m}[0m

In [47]:
# train_data = pd.DataFrame(catalog.load('train_data'))
test_data = pd.read_csv('../tests/data/test_data.csv', parse_dates=['ds'])
test_data

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_1,2007-09-23,0.107754,90,0,1,0,35.941204,2.406140,7.010738,4.392684
1,id_1,2007-09-30,0.473685,90,0,1,0,43.280131,2.202386,9.655562,3.992608
2,id_1,2007-10-07,0.365428,90,0,1,0,38.400777,3.493784,7.717524,4.424695
3,id_1,2007-10-14,0.126971,90,0,1,0,40.772740,2.917491,7.849567,5.304473
4,id_1,2007-10-21,0.106656,90,0,1,0,39.256674,2.230744,4.252065,8.740808
...,...,...,...,...,...,...,...,...,...,...,...
135,id_8,2009-05-17,0.177690,92,0,1,0,22.363804,2.393108,7.961581,6.360911
136,id_1,2009-05-24,0.336024,90,0,1,0,33.991867,2.652832,3.253850,3.112671
137,id_8,2009-05-24,0.021931,92,0,1,0,33.991867,2.652832,3.253850,3.112671
138,id_1,2009-05-31,0.122684,90,0,1,0,34.640794,2.564955,5.316975,3.989611


In [48]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   unique_id     140 non-null    object        
 1   ds            140 non-null    datetime64[ns]
 2   y             140 non-null    float64       
 3   Size          140 non-null    int64         
 4   Type_A        140 non-null    int64         
 5   Type_B        140 non-null    int64         
 6   IsHoliday     140 non-null    int64         
 7   Temperature   140 non-null    float64       
 8   Fuel_Price    140 non-null    float64       
 9   CPI           140 non-null    float64       
 10  Unemployment  140 non-null    float64       
dtypes: datetime64[ns](1), float64(5), int64(4), object(1)
memory usage: 12.2+ KB


In [49]:
Y_df = test_data.query(" ds <  '2009-05-03' ")
Y_df.head()

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_1,2007-09-23,0.107754,90,0,1,0,35.941204,2.40614,7.010738,4.392684
1,id_1,2007-09-30,0.473685,90,0,1,0,43.280131,2.202386,9.655562,3.992608
2,id_1,2007-10-07,0.365428,90,0,1,0,38.400777,3.493784,7.717524,4.424695
3,id_1,2007-10-14,0.126971,90,0,1,0,40.77274,2.917491,7.849567,5.304473
4,id_1,2007-10-21,0.106656,90,0,1,0,39.256674,2.230744,4.252065,8.740808


In [50]:
# num_threads = parameters["num_threads"]
# random_state = parameters["random_state"]
# n_estimators = parameters["n_estimators"]
# freqency = parameters["freqency"]
# lags = parameters["lags"]
# date_features = parameters["date_features"]
# static_features = parameters["static_features"]
# inner_models = [
#                 make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=random_state, n_estimators=n_estimators)), 
#                 XGBRegressor(random_state=random_state, n_estimators=n_estimators),
#                 LGBMRegressor(random_state=random_state, n_estimators=n_estimators),
#                 ]

# model = MLForecast( models=inner_models,
#                     freq=freqency,
#                     lags=lags,
#                     lag_transforms={
#                         1: [(rolling_mean, 4), (rolling_min, 4), (rolling_max, 4)],
#                     },
#                     date_features=date_features,
#                     num_threads=num_threads
#                    )
# model.fit(train_data, static_features=static_features)

In [51]:
num_threads = parameters["num_threads"]
random_state = parameters["random_state"]
n_estimators = parameters["n_estimators"]
freqency = parameters["freqency"]
lags = parameters["lags"]
date_features = parameters["date_features"]
static_features = parameters["static_features"]
inner_models = [
                make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=random_state, n_estimators=n_estimators)), 
                XGBRegressor(random_state=random_state, n_estimators=n_estimators),
                LGBMRegressor(random_state=random_state, n_estimators=n_estimators),
                ]

model = MLForecast( models=inner_models,
                    freq=freqency,
                    lags=lags,
                    lag_transforms={
                        1: [(rolling_mean, 4), (rolling_min, 4), (rolling_max, 4)],
                    },
                    date_features=date_features,
                    num_threads=num_threads
                   )

In [52]:
crossvalidation_df = model.cross_validation(
    df=Y_df,
    h=4,
    n_windows=3,
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 363
[LightGBM] [Info] Number of data points in the train set: 98, number of used features: 13
[LightGBM] [Info] Start training from score 0.252460
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 106, number of used features: 13
[LightGBM] [Info] Start training from score 0.254799


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 414
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 13
[LightGBM] [Info] Start training from score 0.249878


In [62]:
Y_df

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_1,2007-09-23,0.107754,90,0,1,0,35.941204,2.406140,7.010738,4.392684
1,id_1,2007-09-30,0.473685,90,0,1,0,43.280131,2.202386,9.655562,3.992608
2,id_1,2007-10-07,0.365428,90,0,1,0,38.400777,3.493784,7.717524,4.424695
3,id_1,2007-10-14,0.126971,90,0,1,0,40.772740,2.917491,7.849567,5.304473
4,id_1,2007-10-21,0.106656,90,0,1,0,39.256674,2.230744,4.252065,8.740808
...,...,...,...,...,...,...,...,...,...,...,...
125,id_8,2009-04-12,0.487362,92,0,1,0,42.370074,2.719997,7.732305,9.631493
126,id_1,2009-04-19,0.005714,90,0,1,0,30.055729,2.941485,4.665521,7.707117
127,id_8,2009-04-19,0.340185,92,0,1,0,30.055729,2.941485,4.665521,7.707117
128,id_1,2009-04-26,0.385290,90,0,1,0,20.030857,2.518157,3.671697,3.745701


In [63]:
Y_df.groupby('unique_id')['ds'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_1,2007-09-23,2009-04-26
id_8,2008-06-15,2009-04-26


In [61]:
crossvalidation_df

Unnamed: 0,unique_id,ds,cutoff,y,Pipeline,XGBRegressor,LGBMRegressor
0,id_1,2009-02-08,2009-02-01,0.480417,0.27334,0.200106,0.184039
1,id_1,2009-02-15,2009-02-01,0.453278,0.280492,0.271973,0.181468
2,id_1,2009-02-22,2009-02-01,0.387024,0.260926,0.271973,0.270804
3,id_1,2009-03-01,2009-02-01,0.166573,0.225866,0.199963,0.306804
4,id_8,2009-02-08,2009-02-01,0.122995,0.297214,0.271802,0.173825
5,id_8,2009-02-15,2009-02-01,0.291569,0.24863,0.196361,0.270988
6,id_8,2009-02-22,2009-02-01,0.129018,0.237302,0.217619,0.254675
7,id_8,2009-03-01,2009-02-01,0.236693,0.268382,0.269686,0.11483
8,id_1,2009-03-08,2009-03-01,0.040551,0.247576,0.226337,0.203717
9,id_1,2009-03-15,2009-03-01,0.203621,0.21861,0.219392,0.203717


In [54]:
future_data = test_data.query(" ds >=  '2009-05-03' ")

In [55]:
horizon = parameters["horizon"]
exogenous_features = parameters["exogenous_features"]
expected_columns = ["unique_id", "ds"] + exogenous_features
future_data = future_data[expected_columns]


In [56]:
pred = model.predict(h=horizon, X_df=future_data)


In [64]:
import optuna