In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below /kaggle/input, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install darts --quiet
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
import os
# Walk /kaggle/input and print the full path of every file found.
# NOTE(review): this repeats the listing loop from the first cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Training data; later cells read its 'store_nbr', 'family', 'date', 'sales',
# 'onpromotion' and 'id' columns.
df_train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
display(df_train.head())

In [None]:
# Holiday/event calendar; holiday_list() reads its 'date', 'type', 'locale',
# 'locale_name' and 'description' columns.
df_holidays_events = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')

In [None]:
# Oil prices; the 'date' and 'dcoilwtico' columns become a covariate series below.
df_oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')

In [None]:
# Store metadata; 'city', 'state', 'type' and 'cluster' are used as static
# covariates and 'store_nbr' is the join key with the training data.
df_stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')

In [None]:
# Transactions per store and date ('store_nbr', 'date', 'transactions'); used as past covariates.
df_transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')

In [None]:
# Test rows; concatenated with df_train below to build promotion covariates and the id lookup.
df_test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')

In [None]:
# Distinct product families and store numbers, in order of first appearance
# (Series.unique() does not sort). Later loops iterate in this order.
family_list = df_train['family'].unique()
store_list = df_stores['store_nbr'].unique()

In [None]:
# Attach store metadata to every training row, order the rows per series, and
# cast the grouping / static-covariate columns to strings.
train_merged = (
    df_train
    .merge(df_stores, on='store_nbr')
    .sort_values(["store_nbr", "family", "date"])
    .astype({
        "store_nbr": 'str',
        "family": 'str',
        "city": 'str',
        "state": 'str',
        "type": 'str',
        "cluster": 'str',
    })
)

In [None]:
!pip installl darts
from darts import TimeSeries
from tqdm import tqdm

In [None]:
# Build, for every product family, one sales TimeSeries per store.
family_TS_dict = {}

for family in tqdm(family_list):
    df_family = train_merged.loc[train_merged['family'] == family]

    list_of_TS_family = TimeSeries.from_group_dataframe(
                                df_family,
                                time_col="date",
                                group_cols=["store_nbr","family"], # columns for grouping time series
                                static_cols=["city","state","type","cluster"], # static covariates
                                value_cols="sales", # target
                                fill_missing_dates=True, # filling missing dates, remember Dec 25th
                                freq='D' # days
                                )

    # TimeSeries.astype returns a new series, so the converted objects must be
    # stored; rebinding the loop variable would leave the list unchanged.
    list_of_TS_family = [ts.astype(np.float32) for ts in list_of_TS_family]

    # Order numerically by the first static covariate value.
    # NOTE(review): presumably store_nbr (first group column) - confirm.
    list_of_TS_family = sorted(list_of_TS_family, key=lambda ts: int(ts.static_covariates_values()[0,0]))

    family_TS_dict[family] = list_of_TS_family

In [None]:
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, StaticCovariatesTransformer, MissingValuesFiller, InvertibleMapper
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# One preprocessing pipeline per family: fill gaps, encode static covariates,
# log1p-transform the target, then scale. The fitted pipeline is kept so that
# forecasts can be inverse-transformed later.
family_pipeline_dict = {}
family_TS_transformed_dict = {}

for key, family_series in tqdm(family_TS_dict.items()):
    train_pipeline = Pipeline([
        MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs"),
        StaticCovariatesTransformer(verbose=False, transformer_cat=OrdinalEncoder(), name="Encoder"),
        InvertibleMapper(np.log1p, np.expm1, verbose=False, n_jobs=-1, name="Log-Transform"),
        Scaler(verbose=False, n_jobs=-1, name="Scaling"),
    ])

    family_TS_transformed_dict[key] = train_pipeline.fit_transform(family_series)
    family_pipeline_dict[key] = train_pipeline

In [None]:
from darts.utils.timeseries_generation import datetime_attribute_timeseries

In [None]:
# Calendar covariates over the full period, plus a linearly increasing step counter.
full_time_period = pd.date_range(start='2013-01-01', end='2017-08-31', freq='D')

calendar_attributes = ['year', 'month', 'day', 'dayofyear', 'dayofweek', 'weekofyear']
calendar_series = [
    datetime_attribute_timeseries(time_index=full_time_period, attribute=attribute)
    for attribute in calendar_attributes
]
year, month, day, dayofyear, weekday, weekofyear = calendar_series

timesteps = TimeSeries.from_times_and_values(
    times=full_time_period,
    values=np.arange(len(full_time_period)),
    columns=['linear_increase'],
)

# Stack every component, in the order listed above, into one multivariate series.
time_cov = year
for component in calendar_series[1:] + [timesteps]:
    time_cov = time_cov.stack(component)
time_cov = time_cov.astype(np.float32)

In [None]:
# Fit the scaler only on the calendar covariates before 2017-08-16, then apply
# it to the whole period. time_cov_val is not referenced again in the visible cells.
time_cov_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
time_cov_train, time_cov_val = time_cov.split_before(pd.Timestamp('20170816'))
time_cov_scaler.fit(time_cov_train)
time_cov_transformed = time_cov_scaler.transform(time_cov)

In [None]:
from darts.models.filtering.moving_average_filter import MovingAverageFilter

In [None]:
# Daily oil price series.
oil = TimeSeries.from_dataframe(
    df_oil,
    time_col='date',
    value_cols=['dcoilwtico'],
    freq='D',
).astype(np.float32)

# Fill missing values, then scale.
oil_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
oil_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
oil_pipeline = Pipeline([oil_filler, oil_scaler])
oil_transformed = oil_pipeline.fit_transform(oil)

# 7- and 28-step moving averages of the transformed oil price.
oil_ma_components = []
for window in (7, 28):
    smoothed = MovingAverageFilter(window=window).filter(oil_transformed).astype(np.float32)
    smoothed = smoothed.with_columns_renamed(col_names=smoothed.components,
                                             col_names_new=f'oil_ma_{window}')
    oil_ma_components.append(smoothed)

ma_7, ma_28 = oil_ma_components
oil_moving_averages = ma_7.stack(ma_28)

In [None]:
def holiday_list(df_stores):
    """Build one frame of holiday/event indicator columns per store.

    Reads the module-level ``df_holidays_events`` frame. Every returned frame
    has one row per row of ``df_holidays_events`` and the columns ``date``,
    ``national_holiday``, ``earthquake_relief``, ``christmas``,
    ``football_event``, ``national_event``, ``work_day`` and ``local_holiday``
    (0/1 flags). Only ``local_holiday`` differs between stores: it marks
    rows of type "Holiday" whose ``locale_name`` equals the store's state or city.

    Parameters
    ----------
    df_stores : pandas.DataFrame
        Must provide ``state`` and ``city`` columns; one output frame is
        produced per row, in row order.

    Returns
    -------
    list of pandas.DataFrame
    """
    events = df_holidays_events
    description = events['description']

    is_holiday = events["type"] == "Holiday"
    is_national = events["locale"] == "National"
    is_earthquake = description.str.contains('Terremoto Manabi')
    is_football = description.str.contains('futbol')

    # Store-independent flags are computed once instead of once per store.
    shared_flags = pd.DataFrame({"date": events["date"]})
    shared_flags["national_holiday"] = np.where(is_holiday & is_national, 1, 0)
    shared_flags["earthquake_relief"] = np.where(is_earthquake, 1, 0)
    shared_flags["christmas"] = np.where(description.str.contains('Navidad'), 1, 0)
    shared_flags["football_event"] = np.where(is_football, 1, 0)
    # National events exclude the earthquake and football rows flagged above.
    shared_flags["national_event"] = np.where(
        (events["type"] == "Event") & is_national & ~is_earthquake & ~is_football, 1, 0)
    shared_flags["work_day"] = np.where(events["type"] == "Work Day", 1, 0)

    listofseries = []

    # Iterate positionally so the result does not depend on df_stores' index labels.
    for state, city in zip(df_stores['state'], df_stores['city']):
        df_holiday_dummies = shared_flags.copy()
        in_store_locale = (events["locale_name"] == state) | (events["locale_name"] == city)
        df_holiday_dummies["local_holiday"] = np.where(is_holiday & in_store_locale, 1, 0)
        listofseries.append(df_holiday_dummies)

    return listofseries

In [None]:
def remove_0_and_duplicates(holiday_list):
    """Drop all-zero rows and collapse duplicate dates in each holiday frame.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        Frames with a ``date`` column and the seven indicator columns
        ``national_holiday``, ``earthquake_relief``, ``christmas``,
        ``football_event``, ``national_event``, ``work_day``, ``local_holiday``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame. Rows whose indicators are all 0 are removed,
        and rows sharing a date are merged by taking the column-wise maximum.
        ``date`` is returned as a regular column.
    """
    flag_columns = ['national_holiday', 'earthquake_relief', 'christmas',
                    'football_event', 'national_event', 'work_day', 'local_holiday']

    listofseries = []

    # Use the argument itself rather than a module-level variable, so the
    # function works for any list passed in.
    for df_holidays in holiday_list:
        df_holiday_per_store = df_holidays.set_index('date')

        # Keep only dates on which at least one indicator is non-zero.
        df_holiday_per_store = df_holiday_per_store.loc[~(df_holiday_per_store == 0).all(axis=1)]

        # Several events may share a date; a flag is set if any of them sets it.
        df_holiday_per_store = (
            df_holiday_per_store
            .groupby('date')
            .agg({column: 'max' for column in flag_columns})
            .reset_index()
        )

        listofseries.append(df_holiday_per_store)

    return listofseries

In [None]:
def holiday_TS_list_54(holiday_list):
    """Convert per-store holiday frames into float32 darts TimeSeries.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        Frames with a ``date`` column; all remaining columns become components.

    Returns
    -------
    list of TimeSeries
        One daily series per input frame, with missing dates filled with 0 and
        sliced to 2013-01-01 .. 2017-08-31.
    """
    listofseries = []

    # Iterate over the argument (any length) instead of a module-level
    # variable and a fixed count of 54.
    for df_holidays in holiday_list:
        holidays_TS = TimeSeries.from_dataframe(df_holidays,
                                    time_col = 'date',
                                    fill_missing_dates=True,
                                    fillna_value=0,
                                    freq='D')

        holidays_TS = holidays_TS.slice(pd.Timestamp('20130101'),pd.Timestamp('20170831'))
        holidays_TS = holidays_TS.astype(np.float32)
        listofseries.append(holidays_TS)

    return listofseries

In [None]:
# Build the per-store holiday covariates in three steps: indicator frames,
# de-duplicated frames, then TimeSeries.
# NOTE(review): the name list_of_holidays_per_store must be kept; the helper
# functions as originally written read a module-level variable of that name.
list_of_holidays_per_store = holiday_list(df_stores)
list_of_holidays_per_store = remove_0_and_duplicates(list_of_holidays_per_store)
list_of_holidays_store = holiday_TS_list_54(list_of_holidays_per_store)

holidays_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
holidays_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")

# Fill any remaining gaps and scale every store's holiday series.
holidays_pipeline = Pipeline([holidays_filler, holidays_scaler])
holidays_transformed = holidays_pipeline.fit_transform(list_of_holidays_store)

In [None]:
# Promotion counts are known for both the training and the test period, so
# both frames are combined before building the series.
df_promotion = pd.concat([df_train, df_test], axis=0)
df_promotion = df_promotion.sort_values(["store_nbr","family","date"])
display(df_promotion.tail())

family_promotion_dict = {}

for family in tqdm(family_list):
    df_family = df_promotion.loc[df_promotion['family'] == family]

    list_of_TS_promo = TimeSeries.from_group_dataframe(
                                df_family,
                                time_col="date",
                                group_cols=["store_nbr","family"],
                                value_cols="onpromotion",
                                fill_missing_dates=True,
                                freq='D')

    # TimeSeries.astype returns a new series; store the converted objects
    # (rebinding the loop variable would leave the list unchanged).
    family_promotion_dict[family] = [ts.astype(np.float32) for ts in list_of_TS_promo]

In [None]:
promotion_transformed_dict = {}


def _promo_moving_average(ma_filter, series, component_name):
    """Filter `series`, rebuild it from its pandas Series as float32, and rename the component."""
    smoothed = TimeSeries.from_series(ma_filter.filter(series).pd_series())
    smoothed = smoothed.astype(np.float32)
    return smoothed.with_columns_renamed(col_names=smoothed.components,
                                         col_names_new=component_name)


for key in tqdm(family_promotion_dict):
    # Fill missing values, then scale the promotion series of this family.
    promo_pipeline = Pipeline([
        MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs"),
        Scaler(verbose=False, n_jobs=-1, name="Scaling"),
    ])
    promotion_transformed = promo_pipeline.fit_transform(family_promotion_dict[key])

    # Moving Averages for Promotion Family Dictionaries
    weekly_filter = MovingAverageFilter(window=7)
    monthly_filter = MovingAverageFilter(window=28)

    # Each store's covariates: scaled promotions + 7-step MA + 28-step MA.
    promotion_transformed_dict[key] = [
        ts.stack(_promo_moving_average(weekly_filter, ts, "promotion_ma_7"))
          .stack(_promo_moving_average(monthly_filter, ts, "promotion_ma_28"))
        for ts in promotion_transformed
    ]

In [None]:
# Covariates shared by every store: scaled calendar features, scaled oil price
# and the two oil moving averages, stacked as components of one series.
general_covariates = time_cov_transformed.stack(oil_transformed).stack(oil_moving_averages)

In [None]:
# Per store: that store's holiday series stacked with the shared covariates.
store_covariates_future = [
    holidays_transformed[store_idx].stack(general_covariates)
    for store_idx in range(len(store_list))
]

In [None]:
# Per family: each store's promotion covariates stacked with the covariates of
# the store at the same list position.
future_covariates_dict = {
    key: [
        promo_series.stack(store_covariates_future[position])
        for position, promo_series in enumerate(promotion_transformed_dict[key])
    ]
    for key in tqdm(promotion_transformed_dict)
}

In [None]:
df_transactions.sort_values(['store_nbr','date'], inplace= True)

# One daily transactions series per store, with missing dates inserted.
TS_transactions_list = TimeSeries.from_group_dataframe(
                                df_transactions,
                                time_col="date",
                                group_cols=["store_nbr"],
                                value_cols="transactions",
                                fill_missing_dates=True,
                                freq='D')

# Rebuild each series from its pandas Series and cast to float32.
transactions_list = [
    TimeSeries.from_series(ts.pd_series()).astype(np.float32)
    for ts in TS_transactions_list
]

# The series at list index 24 is trimmed to start on 2013-01-02 like the others.
# NOTE(review): index 24 is the 25th series in group order, which is
# presumably store_nbr 25 rather than 24 - confirm against the data.
transactions_list[24] = transactions_list[24].slice(start_ts=pd.Timestamp('20130102'), end_ts=pd.Timestamp('20170815'))

series_start = pd.Timestamp('20130101')

transactions_list_full = []
for ts in transactions_list:
    if ts.start_time() > series_start:
        # Left-pad with zeros so that every series begins on 2013-01-01.
        end_time = ts.start_time() - timedelta(days=1)
        zero_index = pd.date_range(start=series_start, end=end_time, freq="D")
        zero_series = TimeSeries.from_times_and_values(
                                  times=zero_index,
                                  values=np.zeros(len(zero_index)))
        ts = zero_series.append(ts)
    # Rename and keep EVERY series, including one that already starts on
    # 2013-01-01; skipping it would shift the store positions of all later series.
    ts = ts.with_columns_renamed(col_names=ts.components, col_names_new="transactions")
    transactions_list_full.append(ts)

transactions_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
transactions_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")

transactions_pipeline = Pipeline([transactions_filler, transactions_scaler])
transactions_transformed = transactions_pipeline.fit_transform(transactions_list_full)

In [None]:
# Train and test rows without the promotion column, ordered by series, with
# 'date' parsed to datetime.
df_indexes = (
    pd.concat([df_train, df_test])
    .drop(columns=['onpromotion'])
    .sort_values(by=['store_nbr', 'family'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Move 'date' into the index so each series can be reindexed on a full date range.
# NOTE(review): not idempotent - re-running this cell on its own fails once
# 'date' is already the index.
df_indexes = df_indexes.set_index('date')

In [None]:
# Reindex every (family, store) series onto the full daily range so that dates
# missing from the data appear as rows with NaN 'id' and 'sales'.
date_range = pd.date_range(start=df_indexes.index.min(), end=df_indexes.index.max(), freq='D')

filled_frames = []

for family in tqdm(family_list):
    # Select the family once; the per-store selection then scans far fewer rows.
    family_rows = df_indexes.loc[df_indexes.family == family]
    for store in store_list:
        temp_df = family_rows.loc[family_rows.store_nbr == store]
        # Inserted dates get the series identifiers; 'id' and 'sales' stay NaN.
        temp_df = temp_df.reindex(date_range).fillna({'store_nbr': store, 'family': family})
        filled_frames.append(temp_df)

# Concatenate all series once. Rebuilding the result inside the loop from
# df_indexes plus the current frame kept only the last series' filled dates
# and copied the full frame on every iteration.
if filled_frames:
    df_indexes_filled = pd.concat(filled_frames)
else:
    df_indexes_filled = pd.DataFrame(columns=df_indexes.columns)

df_indexes_filled

In [None]:
# Turn the date index back into a 'date' column, order by series and drop
# exact duplicate rows.
df_indexes_filled = (
    df_indexes_filled
    .rename_axis('date')
    .reset_index()
    .sort_values(['store_nbr', 'family'])
    .drop_duplicates()
)
df_indexes_filled

In [None]:
# Last date present in the training data; lgbm_predictions() selects the rows
# after this date (shifted back by the validation size).
last_train_date = pd.to_datetime(df_train.date.max())

In [None]:
import gc

In [None]:
# Drop the raw frames that are no longer needed and reclaim their memory.
del (
    df_train,
    df_test,
    df_stores,
    df_holidays_events,
    df_oil,
    df_transactions,
    df_indexes,
    train_merged,
)

gc.collect()

In [None]:
from darts.models import LightGBMModel

In [None]:
def lgbm_predictions(model_params, val_df_size = 0):
    """Fit one LightGBM model per product family for each parameter set and forecast.

    Reads the module-level objects ``last_train_date``, ``df_indexes_filled``,
    ``family_list``, ``family_TS_dict``, ``family_TS_transformed_dict``,
    ``family_pipeline_dict``, ``future_covariates_dict`` and
    ``transactions_transformed``.

    Parameters
    ----------
    model_params : iterable of dict
        Each dict provides ``"lags"``, ``"lags_future_covariates"`` and
        ``"lags_past_covariates"`` for ``LightGBMModel``.
    val_df_size : int, default 0
        Number of trailing time steps removed from every training series; the
        forecast horizon is extended by the same amount (``16 + val_df_size``).

    Returns
    -------
    (list of pandas.DataFrame, pandas.DataFrame)
        One frame with columns ``id`` and ``y_pred`` per parameter set, and
        the rows of ``df_indexes_filled`` dated after the (shifted) last
        training date that supplied the ids.
    """
    # Leading time steps kept from each target series when val_df_size == 0.
    # NOTE(review): presumably the number of days in the training period - confirm.
    full_train_length = 1688

    l_train_date = last_train_date - np.timedelta64(val_df_size, 'D')
    local_df_indexes = df_indexes_filled.loc[df_indexes_filled.date > l_train_date]

    def _training_window(family):
        """Return (target series cut to the training window, future covariates) for a family."""
        covariates = future_covariates_dict[family]
        truncated = [ts[:full_train_length - val_df_size] for ts in family_TS_transformed_dict[family]]
        # slice_intersect keeps only the time span shared with the matching covariate series.
        sliced = [target.slice_intersect(covariates[i]) for i, target in enumerate(truncated)]
        return sliced, covariates

    submission_kaggle_list = []

    for cnt, params in enumerate(model_params, start=1):
        LGBM_Models_Submission = {}
        display("Training...")

        # Fit Model
        print(f'Start fit model {cnt}')
        for family in tqdm(family_list):
            train_sliced, covariates = _training_window(family)

            LGBM_Model_Submission = LightGBMModel(lags = params["lags"],
                                                  lags_future_covariates = params["lags_future_covariates"],
                                                  lags_past_covariates = params["lags_past_covariates"],
                                                  output_chunk_length=1,
                                                  random_state=2022,
                                                  gpu_use_dp= "false")

            # The past covariates are not indexed by family: there is one
            # transactions series per store, shared by all families.
            LGBM_Model_Submission.fit(series=train_sliced,
                                      future_covariates=covariates,
                                      past_covariates=transactions_transformed)

            LGBM_Models_Submission[family] = LGBM_Model_Submission

        display("Predictions...")
        LGBM_Forecasts_Families_Submission = {}

        # Predict
        print(f'Start predict model {cnt}')
        for family in tqdm(family_list):
            # Slice against THIS family's covariates (not whatever the fit
            # loop happened to leave behind for the last family).
            train_sliced, covariates = _training_window(family)

            LGBM_Forecasts_Families_Submission[family] = LGBM_Models_Submission[family].predict(
                n = 16 + val_df_size,
                series=train_sliced,
                future_covariates=covariates,
                past_covariates=transactions_transformed
            )

        # Transform Back
        print(f'Start transform Back {cnt}')
        LGBM_Forecasts_Families_back_Submission = {}

        for family in tqdm(family_list):
            LGBM_Forecasts_Families_back_Submission[family] = family_pipeline_dict[family].inverse_transform(
                LGBM_Forecasts_Families_Submission[family], partial=True)

        print(f'Start Prepare Submission {cnt}')
        for family in tqdm(LGBM_Forecasts_Families_back_Submission):
            family_forecasts = LGBM_Forecasts_Families_back_Submission[family]
            for n in range(len(family_forecasts)):
                # A series whose last 21 observed values are all zero is forecast as all zeros.
                if (family_TS_dict[family][n].univariate_values()[-21:] == 0).all():
                    family_forecasts[n] = family_forecasts[n].map(lambda x: x * 0)

        # Flatten store-major, family-minor. This must match the row order of
        # local_df_indexes, because the ids are attached by position below.
        listofseries = []

        for store in tqdm(range(0,54)):
            for family in family_list:
                oneforecast = LGBM_Forecasts_Families_back_Submission[family][store].pd_dataframe()
                oneforecast.columns = ['y_pred']
                listofseries.append(oneforecast)

        df_forecasts = pd.concat(listofseries, ignore_index=True)

        # No Negative Forecasts
        print(f'Start No Negative Forecasts {cnt}')
        df_forecasts = df_forecasts.clip(lower=0)
        forecasts_kaggle = pd.concat([local_df_indexes['id'], df_forecasts.set_index(local_df_indexes.index)], axis=1)
        forecasts_kaggle = forecasts_kaggle.reset_index(drop=True)

        # Submission
        print(f'Start Submission {cnt}')
        submission_kaggle_list.append(forecasts_kaggle)

    return submission_kaggle_list, local_df_indexes

In [None]:
# One entry per ensemble member: (target lags, first element of lags_future_covariates).
# Every member uses the same past-covariate lags, -16 .. -22.
model_params = [
    {
        "lags" : target_lags,
        "lags_future_covariates" : (future_past_lags, 1),
        "lags_past_covariates" : list(range(-16, -23, -1)),
    }
    for target_lags, future_past_lags in [
        (63, 14),
        (7, 16),
        (31, 14),
        (365, 14),
        (730, 14),
        (1095, 14),
    ]
]

In [None]:
# Train and forecast with every parameter set; val_df_size keeps its default of 0,
# so nothing is held out from the training series.
submission_kaggle_list, clipped_indexes = lgbm_predictions(model_params)

In [None]:
# Put every model's forecast side by side as y_pred_0, y_pred_1, ... ;
# the 'id' column is taken from the first model only.
ensemble_parts = [submission_kaggle_list[0].rename(columns={'y_pred': 'y_pred_0'})]
ensemble_parts += [
    frame.rename(columns={'y_pred': f'y_pred_{i}'}).drop(columns=['id'])
    for i, frame in enumerate(submission_kaggle_list[1:], start=1)
]
submissions = pd.concat(ensemble_parts, axis=1)

# The final forecast is the plain average over all models.
submissions['sales'] = submissions.drop(columns='id').mean(axis=1)
submissions.head()

In [None]:
# Keep only the columns written to the submission file, ordered by id, with integer ids.
submission = (
    submissions[['id', 'sales']]
    .sort_values('id')
    .astype({'id': 'int32'})
)
submission.head()

In [None]:
# Write the 'id' and 'sales' columns to the Kaggle working directory, without the row index.
submission.to_csv('/kaggle/working/submission.csv', index=False)