In [3]:
#@title IMPORT LIBRARY
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import optuna
import gc


from utilsforecast.plotting import plot_series
from mlforecast import MLForecast
from mlforecast.feature_engineering import transform_exog

import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LinearRegression

from utilsforecast.evaluation import evaluate
from utilsforecast.losses import rmse

from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.shift import shift_array
from mlforecast.lag_transforms import ExpandingMean, RollingMean, ExpandingMax, ExpandingMin, ExpandingStd, Combine

from mlforecast.auto import (
    AutoLightGBM,
    AutoMLForecast,
    AutoModel,
    AutoRidge,
    ridge_space,
)

@njit
def ratio_over_previous(x, offset=1):
    """Computes the ratio between the current value and its `offset` lag"""
    return x / shift_array(x, offset=offset)

@njit
def diff_over_previous(x, offset=1):
    """Computes the difference between the current value and its `offset` lag"""
    return x - shift_array(x, offset=offset)

In [5]:
#@title RETRIEVE DATASET
def retrieve_data(sales_dir, price_dir):
    sales_raw = pd.read_csv(sales_dir)
    price_raw = pd.read_csv(price_dir)

    cols = sales_raw.columns
    dates = cols[3:]

    sales = sales_raw.melt(id_vars=cols[:3], var_name='Date', value_vars=dates, value_name='Sales')
    price = price_raw.melt(id_vars=cols[:3], var_name='Date', value_vars=dates, value_name='Sales')

    sales['unique_id'] = sales['Client'].astype(str) + '-' + sales['Warehouse'].astype(str) + '-' + sales['Product'].astype(str)
    price['unique_id'] = price['Client'].astype(str) + '-' + price['Warehouse'].astype(str) + '-' + price['Product'].astype(str)
    # sales.drop(['Client', 'Warehouse', 'Product'], axis=1, inplace=True)

    sales = sales.rename(columns={'Date': 'ds', 'Sales': 'y'})
    price = price.rename(columns={'Date': 'ds', 'Sales': 'price'})
    sales['ds'] = pd.to_datetime(sales['ds'])
    price['ds'] = pd.to_datetime(price['ds'])

    return sales, price

sales_dir = '../phase_0/Phase 0 - Sales.csv'
price_dir = '../phase_0/Phase 0 - Price.csv'
sales, price = retrieve_data(sales_dir, price_dir)

In [6]:
sales_dir = '../phase_1/Phase 1 - Sales.csv'
price_dir = '../phase_1/Phase 1 - Price.csv'
sales_1, price_1 = retrieve_data(sales_dir, price_dir)

In [7]:
price = price.dropna()
first_sale = price.groupby('unique_id')['ds'].min()
sales = sales.merge(first_sale, on='unique_id', how='left')
sales = sales.rename(columns={'ds_x': 'ds', 'ds_y': 'first_sale'})

In [8]:
sales = sales[sales['ds'] >= sales['first_sale']]
sales.tail()

Unnamed: 0,Client,Warehouse,Product,ds,y,unique_id,first_sale
2559005,46,318,13485,2023-10-02,80.0,46-318-13485,2022-09-12
2559006,46,318,13582,2023-10-02,39.0,46-318-13582,2021-11-15
2559007,46,318,13691,2023-10-02,1.0,46-318-13691,2022-09-19
2559008,46,318,13946,2023-10-02,3.0,46-318-13946,2023-06-19
2559009,46,318,14294,2023-10-02,0.0,46-318-14294,2022-12-12


In [9]:
sales

Unnamed: 0,Client,Warehouse,Product,ds,y,unique_id,first_sale
0,0,1,367,2020-07-06,7.0,0-1-367,2020-07-06
2,0,1,655,2020-07-06,21.0,0-1-655,2020-07-06
3,0,1,1149,2020-07-06,7.0,0-1-1149,2020-07-06
5,0,1,1965,2020-07-06,21.0,0-1-1965,2020-07-06
8,0,1,3234,2020-07-06,7.0,0-1-3234,2020-07-06
...,...,...,...,...,...,...,...
2559005,46,318,13485,2023-10-02,80.0,46-318-13485,2022-09-12
2559006,46,318,13582,2023-10-02,39.0,46-318-13582,2021-11-15
2559007,46,318,13691,2023-10-02,1.0,46-318-13691,2022-09-19
2559008,46,318,13946,2023-10-02,3.0,46-318-13946,2023-06-19


In [10]:
sales = pd.concat([sales, sales_1])

In [11]:
sales = sales.drop(columns='first_sale')

In [12]:
sales

Unnamed: 0,Client,Warehouse,Product,ds,y,unique_id
0,0,1,367,2020-07-06,7.0,0-1-367
2,0,1,655,2020-07-06,21.0,0-1-655
3,0,1,1149,2020-07-06,7.0,0-1-1149
5,0,1,1965,2020-07-06,21.0,0-1-1965
8,0,1,3234,2020-07-06,7.0,0-1-3234
...,...,...,...,...,...,...
195684,46,318,13485,2024-01-01,0.0,46-318-13485
195685,46,318,13582,2024-01-01,67.0,46-318-13582
195686,46,318,13691,2024-01-01,2.0,46-318-13691
195687,46,318,13946,2024-01-01,0.0,46-318-13946


In [13]:
from mlforecast.feature_engineering import transform_exog

# Treating price as a historic feature

In [11]:
# price_lag = {}

# for i in range(13, 78):
#     price_lag[i] = [ratio_over_previous]

In [12]:
# transformed_prices = transform_exog(sales[['unique_id', 'ds', 'price']], 
#                                     lag_transforms=price_lag)

In [13]:
# train_df = sales.merge(transformed_prices, on=['unique_id', 'ds'], how='left')

# Training

In [14]:
core_features = ['unique_id', 'ds', 'y']
future_exogs = []
# historic_exogs = transformed_prices.head().drop(columns=['unique_id', 'ds', 'price']).columns.tolist()
historic_exogs = []
static_features = ['Client', 'Warehouse', 'Product']
static_features = []

train_df = sales[core_features + static_features + future_exogs + historic_exogs]

In [15]:
train_df

Unnamed: 0,unique_id,ds,y
0,0-1-367,2020-07-06,7.0
2,0-1-655,2020-07-06,21.0
3,0-1-1149,2020-07-06,7.0
5,0-1-1965,2020-07-06,21.0
8,0-1-3234,2020-07-06,7.0
...,...,...,...
195684,46-318-13485,2024-01-01,0.0
195685,46-318-13582,2024-01-01,67.0
195686,46-318-13691,2024-01-01,2.0
195687,46-318-13946,2024-01-01,0.0


In [16]:
lgb_params = {
    'n_jobs': -1, 
    'random_state': 19, 
    'verbosity': 1, 
    'objective' : 'tweedie',
    'num_iterations': 1000,
    'learning_rate': 0.01,
    'num_leaves': 127,
    'reg_lambda': 0.1
}
recursive = True

In [17]:
fcst = MLForecast(
    models=[
        lgb.LGBMRegressor(**lgb_params),
        # xgb.XGBRegressor(random_state=0, objective='tweedie')
        ],
    freq='W-MON',
    lags=range(1, 79),
    lag_transforms={
        1: [diff_over_previous, ratio_over_previous,
            ExpandingMean(), ExpandingMax(), ExpandingStd()],
        # 2: [ExpandingMean(), ExpandingMax(), ExpandingStd(), RollingMean(window_size=4), diff_over_previous],
        # 3: [ExpandingMean(), ExpandingMax(), ExpandingStd(), RollingMean(window_size=4), diff_over_previous],
        # 4: [ExpandingMean(), ExpandingMax(), ExpandingStd(), RollingMean(window_size=4), diff_over_previous]
    },
    # date_features=['month', 'quarter', 'week'],
)

if recursive==True:
    fcst.fit(train_df, static_features=static_features)
else:
    fcst.fit(train_df, static_features=static_features, max_horizon=13)


These series won't show up if you use `MLForecast.forecast_fitted_values()`.
You can set `dropna=False` or use transformations that require less samples to mitigate this


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.428948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21165
[LightGBM] [Info] Number of data points in the train set: 361167, number of used features: 83
[LightGBM] [Info] Start training from score 3.816120


In [None]:
forecast_df = fcst.predict(h=13)
forecast_df.head()

In [148]:
import sys
sys.path.append('../')
from utils import *

In [19]:
#@title MAKE SUBMISSION
def make_submission(forecast_df):
    submission = forecast_df.copy()
    values = forecast_df.columns[-1]

    submission[values] = submission[values].clip(0)
    submission[['Client', 'Warehouse', 'Product']] = submission['unique_id'].str.split('-', expand=True)
    submission = submission.drop(columns='unique_id')
    submission['ds'] = submission['ds'].astype(str)

    submission = submission.pivot(columns='ds', index=['Client', 'Warehouse', 'Product'], values=values).reset_index()
    submission[['Client', 'Warehouse', 'Product']] = submission[['Client', 'Warehouse', 'Product']].astype(int)
    submission = submission.sort_values(['Client', 'Warehouse', 'Product'])

    # submission = submission.map(lambda x: x if x>=0 else 0)

    current_datetime = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    submission.to_csv('../lightgbm.csv', index=None)

    return submission

submission = make_submission(forecast_df)