In [55]:
import pandas as pd
import pytest

from kedro_sales_forecast.pipelines.train.nodes import split_data, train_model, evaluate_model

In [2]:
def dummy_data():
    """Build a small single-series fixture frame for exercising the train nodes.

    Returns:
        A five-row ``pandas.DataFrame`` for series ``"id_1"`` with weekly
        ``ds`` dates (2009-04-12 through 2009-05-10) converted to datetimes,
        the target ``y`` and nine feature columns.
    """
    columns = {
        "unique_id": ["id_1"] * 5,
        "ds": ["2009-04-12", "2009-04-19", "2009-04-26", "2009-05-03", "2009-05-10"],
        "y": [0.613262, 0.442162, 0.307173, 0.059010, 0.113098],
        "Size": [98, 84, 66, 59, 78],
        "Type_A": [1, 0, 0, 1, 1],
        "Type_B": [0, 1, 1, 0, 0],
        "IsHoliday": [0, 1, 0, 0, 1],
        "Temperature": [45.539704, 30.207879, 37.307495, 24.212213, 24.291916],
        "Fuel_Price": [2.410365, 2.837114, 2.810372, 3.104646, 3.108914],
        "CPI": [3.010402, 6.039538, 5.034800, 6.648689, 3.530509],
        "Unemployment": [7.475086, 6.395689, 4.980317, 6.949863, 3.012275],
    }
    # Dates are authored as ISO strings; convert them once the frame exists so
    # the `ds` column keeps its position but carries a datetime dtype.
    return pd.DataFrame(columns).assign(ds=lambda frame: pd.to_datetime(frame["ds"]))

In [3]:
def dummy_parameters():
    """Return the fixture parameter mapping passed to the train nodes.

    Returns:
        A dict with a single ``"model_options"`` entry holding the split date,
        model hyper-parameters, forecasting horizon, lags and feature lists.
    """
    model_options = dict(
        split_date='2009-05-03',
        random_state=0,
        n_estimators=100,
        num_threads=6,
        # NOTE(review): the key is spelled 'freqency' in the original; it is
        # presumably the spelling the train nodes read -- confirm before renaming.
        freqency='W',
        horizon=2,
        lags=[1, 2, 4],
        date_features=['week', 'month'],
        static_features=['Size', 'Type_A', 'Type_B'],
        exogenous_features=['IsHoliday', 'Temperature', 'Fuel_Price', 'Unemployment', 'CPI'],
    )
    return {"model_options": model_options}

In [4]:
# Materialise the fixtures under the names the cells below use.
# Each helper name is rebound to its own return value, so after the first run
# the names hold a DataFrame / dict rather than functions. Without the
# callable() guards, re-running this cell would try to call those objects and
# raise TypeError; with them a second run is a no-op.
if callable(dummy_data):
    dummy_data = dummy_data()
if callable(dummy_parameters):
    dummy_parameters = dummy_parameters()

In [5]:
# Split the five-row fixture with the project's `split_data` node and pin the
# resulting sizes: 3 training rows and 2 test rows.
# NOTE(review): presumably the cut is made at `split_date` ('2009-05-03') --
# confirm against the node implementation.
train_data, test_data = split_data(dummy_data, dummy_parameters["model_options"])
assert len(train_data) == 3
assert len(test_data) == 2

In [6]:
# Display the full five-row fixture frame for reference.
dummy_data

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_1,2009-04-12,0.613262,98,1,0,0,45.539704,2.410365,3.010402,7.475086
1,id_1,2009-04-19,0.442162,84,0,1,1,30.207879,2.837114,6.039538,6.395689
2,id_1,2009-04-26,0.307173,66,0,1,0,37.307495,2.810372,5.0348,4.980317
3,id_1,2009-05-03,0.05901,59,1,0,0,24.212213,3.104646,6.648689,6.949863
4,id_1,2009-05-10,0.113098,78,1,0,1,24.291916,3.108914,3.530509,3.012275


In [7]:
# Repeat the split, keeping only the training half (the second return value is
# discarded). This re-binds `train_data` to a fresh result of the same call.
train_data, _ = split_data(dummy_data, dummy_parameters["model_options"])

In [8]:
# Display the training split.
train_data

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_1,2009-04-12,0.613262,98,1,0,0,45.539704,2.410365,3.010402,7.475086
1,id_1,2009-04-19,0.442162,84,0,1,1,30.207879,2.837114,6.039538,6.395689
2,id_1,2009-04-26,0.307173,66,0,1,0,37.307495,2.810372,5.0348,4.980317


In [9]:
# (rows, columns) of the training split.
train_data.shape

[1m([0m[1;36m3[0m, [1;36m11[0m[1m)[0m

In [10]:
# Row count of the training split.
train_data.shape[0]

[1;36m3[0m

In [11]:
# Run the project's `train_model` node on the training split with the fixture
# model options; the returned object is inspected in the next cell.
model = train_model(train_data, dummy_parameters["model_options"])

In [None]:
# Check that the configured frequency and estimator count reached the model.
# NOTE(review): this cell has no execution count in the saved notebook, so
# these assertions have not been run here; the attribute names (`freq`,
# `models["XGBRegressor"]`) are assumptions about what `train_model` returns
# -- confirm by executing the cell.
assert model.freq == 'W'
assert model.models["XGBRegressor"].n_estimators == 100

In [33]:
# Load the 'store_sales_weekly' dataset and wrap it in a DataFrame.
# NOTE(review): `catalog` is not defined in any cell shown in this notebook;
# presumably it is the data catalog injected by the Kedro Jupyter/IPython
# session -- confirm, since the cell fails on a plain kernel.
store_sales_weekly = pd.DataFrame(catalog.load('store_sales_weekly'))

In [34]:
# Order rows by date, then by series id, and preview the first five rows.
# The name is re-bound to the sorted frame; re-running is harmless because
# sorting already-sorted data yields the same order.
store_sales_weekly = store_sales_weekly.sort_values(by=['ds', 'unique_id'])
store_sales_weekly.head()

Unnamed: 0,unique_id,ds,y,Size,Type_A,Type_B,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment
0,id_7,2001-09-30,0.09876,84,0,1,0,35.67862,2.270796,5.321664,9.99081
1,id_7,2001-10-07,0.264158,84,0,1,0,24.870336,2.632353,4.597994,8.46722
2,id_7,2001-10-14,0.335845,84,0,1,0,42.804482,3.006857,4.401153,8.10367
3,id_7,2001-10-21,0.235161,84,0,1,0,21.19712,2.380959,8.386135,5.155749
4,id_7,2001-10-28,0.479848,84,0,1,0,38.324511,2.277695,5.638826,7.920773


In [35]:
# Number of non-null `ds` entries per series id.
store_sales_weekly.groupby('unique_id')['ds'].count()


unique_id
id_0    [1;36m222[0m
id_1     [1;36m97[0m
id_2    [1;36m167[0m
id_3    [1;36m242[0m
id_4    [1;36m373[0m
id_5    [1;36m301[0m
id_6    [1;36m245[0m
id_7    [1;36m409[0m
id_8     [1;36m59[0m
id_9    [1;36m261[0m
Name: ds, dtype: int64

In [36]:
# Keep only series 'id_1' and 'id_8' -- the two with the fewest rows in the
# per-series counts shown above (97 and 59).
store_sales_weekly_sample = store_sales_weekly.query(" unique_id in('id_1', 'id_8') ")

In [38]:
# Sanity check on the sample size: 97 + 59 = 156 rows are expected.
store_sales_weekly_sample.shape

[1m([0m[1;36m156[0m, [1;36m11[0m[1m)[0m

In [37]:
# Write the two-series sample to CSV (UTF-8, index column omitted).
# The path is relative, so it resolves against the notebook's working directory.
store_sales_weekly_sample.to_csv('../data/05_model_input/future_data.csv', encoding='utf-8', index=False)

In [39]:
# Dump the sample as a nested {column: {index: value}} dict.
# NOTE(review): this renders all 156 rows for every column and floods the cell
# output; if the goal is only a peek at the structure, consider limiting it
# (e.g. with `.head()`) -- confirm intent before changing.
store_sales_weekly_sample.to_dict()


[1m{[0m
    [32m'unique_id'[0m: [1m{[0m
        [1;36m1452[0m: [32m'id_1'[0m,
        [1;36m1461[0m: [32m'id_1'[0m,
        [1;36m1470[0m: [32m'id_1'[0m,
        [1;36m1479[0m: [32m'id_1'[0m,
        [1;36m1488[0m: [32m'id_1'[0m,
        [1;36m1497[0m: [32m'id_1'[0m,
        [1;36m1506[0m: [32m'id_1'[0m,
        [1;36m1515[0m: [32m'id_1'[0m,
        [1;36m1524[0m: [32m'id_1'[0m,
        [1;36m1533[0m: [32m'id_1'[0m,
        [1;36m1542[0m: [32m'id_1'[0m,
        [1;36m1551[0m: [32m'id_1'[0m,
        [1;36m1560[0m: [32m'id_1'[0m,
        [1;36m1569[0m: [32m'id_1'[0m,
        [1;36m1578[0m: [32m'id_1'[0m,
        [1;36m1587[0m: [32m'id_1'[0m,
        [1;36m1596[0m: [32m'id_1'[0m,
        [1;36m1605[0m: [32m'id_1'[0m,
        [1;36m1614[0m: [32m'id_1'[0m,
        [1;36m1623[0m: [32m'id_1'[0m,
        [1;36m1632[0m: [32m'id_1'[0m,
        [1;36m1641[0m: [32m'id_1'[0m,
        [1;36m1650[0m: [32

In [46]:
import json

# Hard-coded scores per model name, used by the printing cells below.
metrics = {
    'RandomForestRegressor': 37.03840070678769,
    'XGBRegressor': 37.25298017954496,
    'LGBMRegressor': 38.51687419786843,
}

In [47]:
# Iterating a dict yields its keys, so this lists the model names in
# insertion order, one per line.
for model_name in metrics.keys():
    print(model_name)

RandomForestRegressor
XGBRegressor
LGBMRegressor


In [53]:
# Report each model's score on its own line, in insertion order.
for key in metrics:
    value = metrics[key]
    print(f"SMAPE of {key}: {value}")

SMAPE of RandomForestRegressor: 37.03840070678769
SMAPE of XGBRegressor: 37.25298017954496
SMAPE of LGBMRegressor: 38.51687419786843
