In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Import widgets
from ipywidgets import widgets, interactive, interact
import ipywidgets as widgets
from IPython.display import display

from sklearn.ensemble import RandomForestRegressor

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv


In [2]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

In [3]:
#valid_fold_df

In [4]:
sales_df = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv')
calendar_df = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
submission_df = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
prices_df = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')


In [5]:

train_fold_df = sales_df.iloc[:, :-56]
valid_fold_df = sales_df.iloc[:, -28:]


evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar_df, prices_df)


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [6]:
# test evaluator
valid_preds = valid_fold_df.copy() + np.random.randint(2, size=valid_fold_df.shape)
print(evaluator.score(valid_preds))
valid_preds = np.zeros(valid_fold_df.shape)
print(evaluator.score(valid_preds))

1.9380265920125581
5.447476163459111


# Calendar Features

In [7]:
# Correct data types for "calendar.csv"
calendarDTypes = {"event_name_1": "category", 
                  "event_name_2": "category", 
                  "event_type_1": "category", 
                  "event_type_2": "category", 
                  "weekday": "category", 
                  'wm_yr_wk': 'int16', 
                  "wday": "int16",
                  "month": "int16", 
                  "year": "int16", 
                  "snap_CA": "int16", 
                  'snap_TX': 'int16', 
                  'snap_WI': 'int16' }

# Read csv file
calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", 
                       dtype = calendarDTypes)


calendar.head(10)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
5,2011-02-03,11101,Thursday,6,2,2011,d_6,,,,,1,1,1
6,2011-02-04,11101,Friday,7,2,2011,d_7,,,,,1,0,0
7,2011-02-05,11102,Saturday,1,2,2011,d_8,,,,,1,1,1
8,2011-02-06,11102,Sunday,2,2,2011,d_9,SuperBowl,Sporting,,,1,1,1
9,2011-02-07,11102,Monday,3,2,2011,d_10,,,,,1,1,0


In [8]:
# Create dataframe to hold features
features = pd.DataFrame() 

## Add features

In [9]:
features[["snap_CA", "snap_TX", "snap_WI"]] = calendar[["snap_CA", "snap_TX", "snap_WI"]]
features.head()

Unnamed: 0,snap_CA,snap_TX,snap_WI
0,0,0,0
1,0,0,0
2,0,0,0
3,1,1,0
4,1,0,1


## Transform categorical features to enhot encodings (or embeddings)

### Events

In [10]:
# add boolean column indicating an event takes place:
features["event_occurs"] = ~calendar["event_name_1"].isna()*1
features[5:10]

Unnamed: 0,snap_CA,snap_TX,snap_WI,event_occurs
5,1,1,1,0
6,1,0,0,0
7,1,1,1,0
8,1,1,1,1
9,1,1,0,0


Er zijn 4 verschillende **event types**: sport, cultuur, nationaal, religeus.
Als er twee eventen op 1 dag zijn dan komt eentje in column 1 andere in event 2.
Gebruik hier one hot encoding:

In [11]:
print(calendar["event_type_1"].unique())
print(calendar["event_type_2"].unique())


[NaN, Sporting, Cultural, National, Religious]
Categories (4, object): [Sporting, Cultural, National, Religious]
[NaN, Cultural, Religious]
Categories (2, object): [Cultural, Religious]


In [12]:
event1_one_hot= pd.DataFrame(calendar["event_type_1"])
# convert to onehot encoding
event1_one_hot = pd.get_dummies(event1_one_hot)
event1_one_hot.head()

Unnamed: 0,event_type_1_Cultural,event_type_1_National,event_type_1_Religious,event_type_1_Sporting
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [13]:
event1_one_hot.columns = ['cultural','national','religious',
                     'sporting']
event1_one_hot.head()

Unnamed: 0,cultural,national,religious,sporting
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [14]:
event2_one_hot = pd.DataFrame(calendar["event_type_2"])
# convert to onehot encoding
event2_one_hot = pd.get_dummies(event2_one_hot)
print(event2_one_hot.head())
event2_one_hot.columns = ['cultural','religious']
event2_one_hot.head()

   event_type_2_Cultural  event_type_2_Religious
0                      0                       0
1                      0                       0
2                      0                       0
3                      0                       0
4                      0                       0


Unnamed: 0,cultural,religious
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [15]:
event_one_hot = pd.concat([event1_one_hot,event2_one_hot],axis=1)
event_one_hot[83:88]

Unnamed: 0,cultural,national,religious,sporting,cultural.1,religious.1
83,0,0,0,0,0,0
84,0,0,0,0,0,0
85,0,0,1,0,1,0
86,0,0,0,0,0,0
87,0,0,1,0,0,0


In [16]:
# group columns with same name
event_one_hot=event_one_hot.groupby(level=0,axis=1).sum()
event_one_hot[83:88]

Unnamed: 0,cultural,national,religious,sporting
83,0,0,0,0
84,0,0,0,0
85,1,0,1,0
86,0,0,0,0
87,0,0,1,0


In [17]:
# check if at any time there two of the same types of event occuring:
print(event_one_hot.max())

# add to features
features =  pd.concat([features, event_one_hot], axis=1)
features.head()


cultural     1
national     1
religious    1
sporting     1
dtype: uint8


Unnamed: 0,snap_CA,snap_TX,snap_WI,event_occurs,cultural,national,religious,sporting
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0


***Event names***

there are 31 different events. One hot encoding does not seem like a good idea giving the sparsety of the events occuring. Maybe an embedding would be more usefull?  TODO

In [18]:
events = pd.unique(calendar[["event_name_1","event_name_2"]].values.ravel("K"))
print(events)
print(len(events))

[nan 'SuperBowl' 'ValentinesDay' 'PresidentsDay' 'LentStart' 'LentWeek2'
 'StPatricksDay' 'Purim End' 'OrthodoxEaster' 'Pesach End' 'Cinco De Mayo'
 "Mother's day" 'MemorialDay' 'NBAFinalsStart' 'NBAFinalsEnd'
 "Father's day" 'IndependenceDay' 'Ramadan starts' 'Eid al-Fitr'
 'LaborDay' 'ColumbusDay' 'Halloween' 'EidAlAdha' 'VeteransDay'
 'Thanksgiving' 'Christmas' 'Chanukah End' 'NewYear' 'OrthodoxChristmas'
 'MartinLutherKingDay' 'Easter']
31


### Add date features

In [19]:
weekday = pd.get_dummies(calendar["weekday"])
features =  pd.concat([features, weekday], axis=1)
features.head()

Unnamed: 0,snap_CA,snap_TX,snap_WI,event_occurs,cultural,national,religious,sporting,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [20]:
month = pd.get_dummies(calendar["month"], prefix ="month")
features =  pd.concat([features, month], axis=1)
features.head()

Unnamed: 0,snap_CA,snap_TX,snap_WI,event_occurs,cultural,national,religious,sporting,Friday,Monday,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
print("number of years in data:", len(calendar["year"].unique()))
year = pd.get_dummies(calendar["year"], prefix ="year")
features =  pd.concat([features, year], axis=1)
features.head()

number of years in data: 6


Unnamed: 0,snap_CA,snap_TX,snap_WI,event_occurs,cultural,national,religious,sporting,Friday,Monday,...,month_9,month_10,month_11,month_12,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [22]:
features.shape

(1969, 33)

In [23]:
firstDay = 1
lastDay = 1941

# Use x sales days (columns) for training
numCols = [f"d_{day}" for day in range(firstDay, lastDay+1)]

# Define all categorical columns
catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']

# Define the correct data types for "sales_train_validation.csv"
dtype = {numCol: "float32" for numCol in numCols} 
dtype.update({catCol: "category" for catCol in catCols if catCol != "id"})

# Read csv file
sales = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_evaluation.csv", 
                 usecols = catCols + numCols, dtype = dtype)

# Transform categorical features into integers
for col in catCols:
    if col != "id":
        sales[col] = sales[col].cat.codes.astype("int16")
        sales[col] -= sales[col].min()
        


In [24]:
#days = range(1, 1913 + 1)
days = range(1, 1941 + 1)
time_series_columns = [f'd_{i}' for i in days]

ids = np.random.choice(sales_df['id'].unique().tolist(), 1000)
#print(ids)
aggregate = sales_df.groupby('dept_id')[numCols].agg('sum')

departments = sales_df['dept_id'].unique().tolist()

series_ids = widgets.Dropdown(
    options=ids,
    value=ids[0],
    description='series_ids:'
)

def plot_data(series_ids):
    df = sales_df.loc[sales_df['id'] == series_ids][time_series_columns]
    df = pd.Series(df.values.flatten())

    df.plot(figsize=(20, 10), lw=2, marker='*')
    df.rolling(7).mean().plot(figsize=(20, 10), lw=2, marker='o', color='orange')
   # plt.axhline(df.mean(), lw=3, color='red')
    plt.grid()
    

  silent = bool(old_value == new_value)


In [25]:

#days = range(1, 1913 + 1)
days = range(firstDay, lastDay + 1)
time_series_columns = [f'd_{i}' for i in days]

ids = np.random.choice(sales_df['id'].unique().tolist(), 1000)
#print(ids)
aggregate = sales_df.groupby('dept_id')[numCols].agg('sum')

departments = sales_df['dept_id'].unique().tolist()

series_ids = widgets.Dropdown(
    options=ids,
    value=ids[0],
    description='series_ids:'
)

def plot_data(series_ids):
    df = sales_df.loc[sales_df['id'] == series_ids][time_series_columns]
    df = pd.Series(df.values.flatten())

    df.plot(figsize=(20, 10), lw=2, marker='*')
    df.rolling(7).mean().plot(figsize=(20, 10), lw=2, marker='o', color='orange')
   # plt.axhline(df.mean(), lw=3, color='red')
    plt.grid()
    

In [26]:

# features = calendar[{'year','snap_TX','snap_CA','wday','month', 'snap_WI', "event_name_1","event_type_1", "event_name_2","event_type_2"}].values


In [27]:
features.shape

(1969, 33)

In [28]:
# aggregate date:
agg_dept = sales.groupby('dept_id')[numCols].agg('sum')
agg_dept_avg = sales.groupby('dept_id')[numCols].agg('mean')
#df.groupby('Country')[columns].agg('sum')
print(agg_dept.shape)
print(agg_dept_avg.shape)

agg_features = agg_dept.append(agg_dept_avg)
#pd.stack((agg_dept,agg_dept_avg))
agg_features[agg_features.index.name] =agg_features.index
print(agg_features.shape)
agg_features.head()

(7, 1941)
(7, 1941)
(14, 1942)


Unnamed: 0_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941,dept_id
dept_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3610.0,3172.0,2497.0,2531.0,1714.0,3133.0,2855.0,3831.0,2741.0,2514.0,...,4414.0,4464.0,3109.0,3142.0,3291.0,3027.0,3761.0,4832.0,4720.0,0
1,154.0,185.0,185.0,138.0,100.0,87.0,89.0,155.0,158.0,101.0,...,479.0,552.0,419.0,487.0,449.0,448.0,382.0,501.0,560.0,1
2,4105.0,3858.0,2827.0,2732.0,1802.0,2664.0,3161.0,4334.0,3494.0,2857.0,...,10042.0,10572.0,7609.0,6999.0,6641.0,6759.0,7974.0,9668.0,10165.0,2
3,1584.0,1776.0,1100.0,1133.0,927.0,1234.0,1415.0,1861.0,1481.0,1199.0,...,2797.0,2886.0,2067.0,2001.0,1925.0,1992.0,2299.0,2918.0,2926.0,3
4,2343.0,2216.0,1657.0,1508.0,1209.0,1897.0,1903.0,2235.0,1925.0,1586.0,...,4257.0,4225.0,3264.0,3119.0,3073.0,3251.0,3735.0,4195.0,4031.0,4


ROLLING MEAND AND LAG INSTEAD OF LAST 14 DAYS , ALSO INTRODUCES EVENTS 

## Sell Price features

In [29]:
print(prices_df.shape)
prices_df.head(15)

(6841121, 4)


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
5,CA_1,HOBBIES_1_001,11330,8.26
6,CA_1,HOBBIES_1_001,11331,8.26
7,CA_1,HOBBIES_1_001,11332,8.26
8,CA_1,HOBBIES_1_001,11333,8.26
9,CA_1,HOBBIES_1_001,11334,8.26


In [30]:
# df_sell_prices = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
# df_sell_prices['id'] = df_sell_prices['item_id'] + '_' + df_sell_prices['store_id']

# df_sell_prices = df_sell_prices.pivot(index='id', columns='wm_yr_wk', values='sell_price')
# df_sell_prices = df_sell_prices.fillna(method='bfill', axis=1)

# # df_prices = pd.DataFrame(index=df_train.index.values)
# # df_prices = list()
# # for i in df_sell_prices.columns:
# #     cols = df_calendar['d'][df_calendar['wm_yr_wk']==i]
# #     t = pd.concat([df_sell_prices[i] for j in cols], axis=1)
# #     t.columns = cols
# #     df_prices.append(t)
# # df_prices = pd.concat(df_prices, axis=1)

In [31]:
# print(df_sell_prices.shape)
# df_sell_prices.head()

# LSTM

In [32]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import LSTM, Dropout, Dense, concatenate, Flatten
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K

# simple LSTM: RMSSE scores: 
# 1.289
# 1.354
# na stijns fix:
# 1.1
# met meer epochs en agg feature
# 0.65

def simple_LSTM_model(input_shape_features, input_shape_LSTM, output_shape, model_name="simple_model"):
    K.clear_session()
    # timeseries input stream
    mdl_input1 = Input(shape=input_shape_LSTM, name="timeseries_input")
    layer_1_units= 16
    x = LSTM(layer_1_units,return_sequences=False)(mdl_input1)
    
    
    # feature input stream
    mdl_input2 = Input(shape=input_shape_features, name="feature_input")
    y = Dense(16)(mdl_input2)

    
    # Combine the two streams
    z = concatenate([x,y])
    z = Dropout(0.5)(z)
    
    output = Dense(output_shape)(z)

    model = Model(inputs=[mdl_input1, mdl_input2], outputs=output)
    # Compiling the RNN
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    model.summary()
    plot_model(model, "simple{}.png".format(model_name))
    return model

def only_LSTM_model(input_shape_features, input_shape_LSTM, output_shape, model_name="test_model"):
    K.clear_session()
    # timeseries input stream
    mdl_input1 = Input(shape=input_shape_LSTM, name="timeseries_input")
    layer_1_units= 16
    x = LSTM(layer_1_units,return_sequences=False,dropout=0.2)(mdl_input1)
    
    
    # feature input stream
    mdl_input2 = Input(shape=input_shape_features, name="feature_input")
  #  y = Dense(16)(mdl_input2)

    
    # Combine the two streams
   # z = concatenate([x,y])
    z=x
    z = Dropout(0.2)(z)
    output = Dense(output_shape)(z)


    model = Model(inputs=[mdl_input1, mdl_input2], outputs=output)
    # Compiling the RNN
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    model.summary()
    return model

def only_feature_model(input_shape_features, input_shape_LSTM, output_shape, model_name="test_model"):
    K.clear_session()
    # timeseries input stream
    mdl_input1 = Input(shape=input_shape_LSTM, name="timeseries_input")
#     layer_1_units= 16
#     x = LSTM(layer_1_units,return_sequences=False,dropout=0.2)(mdl_input1)
    
    
    # feature input stream
    mdl_input2 = Input(shape=input_shape_features, name="feature_input")
    y = Dense(16)(mdl_input2)

    
    # Combine the two streams
    z = y#concatenate([x,y])
    z = Dropout(0.2)(z)
    output = Dense(output_shape)(z)


    model = Model(inputs=[mdl_input1, mdl_input2], outputs=output)
    # Compiling the RNN
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    model.summary()
    return model

In [33]:
#for each time series, take 10 random indices. for each indices, take previous 14 days and calander data of day 15 as featuers and day 15 as y. 
# now this is done for each dept_id

#for itemid in unique_values1:
#    itemseries = train_sales.loc[(train_sales['dept_id'] == itemid)]
#    print(len(itemseries))

column_values1 = sales[['store_id']].values.ravel()
stores = pd.unique(column_values1)

column_values1 = sales[['dept_id']].values.ravel()
departments = pd.unique(column_values1)

#column_values1 = sales[['dept_id']].values.ravel()
#unique_values1 =  pd.unique(column_values1)

days = range(firstDay, lastDay + 1)
val_days = range(lastDay+1-28, lastDay+1)


time_series_columns = [f'd_{i}' for i in days]
val_days_columns = [f'd_{i}' for i in val_days]

# shape is number of series, number of predicted days
predict_total3 = np.zeros((30490, 56))

#print(predict_total3.shape)
length = 0 
index = 0 
#for storeid in stores:
for deptid in departments:   
        model_name = "dept{}_model".format(deptid)
        print(model_name)
        itemseries2 = sales.loc[(sales['dept_id'] == deptid)] 
        indexen = itemseries2.index.values.tolist()
        itemseries = itemseries2[time_series_columns].values
        agg_features_dept = agg_features.loc[(agg_features['dept_id'] == deptid)].values

        length = length +  len(itemseries)

        n1 = 100
        xo =  1
        xi = 6
        train_x_data = np.zeros((len(itemseries)* n1,xi+features.shape[1]))
        
        nr_days_past = 14
        
        # number of lstm feauters are sales + agg features
        nr_lstm_features = 1 + len(agg_features_dept) 
        train_x2_data = np.zeros((len(itemseries)* n1,nr_lstm_features, nr_days_past))

        train_y_data =  np.zeros(len(train_x_data))

        for i in range(len(itemseries)):
            x = itemseries[i]
           
            # kies random n1 indexen om training samples van te maken, -84 om niet in validatie deel te komen
            n_random = np.random.choice( np.arange(firstDay + 60 ,len(x)-84), n1, replace = False)

            x_train = np.zeros((n1,xi+features.shape[1]))
            # timeseries data:
            x2_train = np.zeros((n1, nr_lstm_features, nr_days_past))
            y_train = np.zeros(len(x_train))

            #creating training  samples
            for k, j in enumerate(n_random):
                lag_7 = x[j-7]
                lag_28 = x[j-28]
                rmean_7_7 = np.mean(x[j-14:j-7])
                rmean_28_7 = np.mean(x[j-35:j-28])
                rmean_7_28 = np.mean(x[j-35:j-7])
                rmean_28_28 = np.mean(x[j-56:j-28])
                means = np.array([lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28])
                x_train[k] = np.concatenate((means, features.iloc[j]), axis = 0) 
                # timeseries data:
                # last 14 days
                x2_sales = x[j - nr_days_past: j]
                x2_agg_dept = agg_features_dept[:,j - nr_days_past: j]
                x2_train[k] =  np.vstack((x2_sales,x2_agg_dept)) 
                                
                y_train[k] = x[j]
            train_x_data[i*n1:i*n1+n1] = x_train
            train_y_data[i*n1:i*n1+n1] = y_train


       # train_x2_data = train_x2_data.reshape(*train_x2_data.shape,1) 
          
        input_model = [train_x2_data, train_x_data]

        # LSTM Model:
        model = simple_LSTM_model(input_shape_LSTM = train_x2_data.shape[1:], 
                            input_shape_features = train_x_data.shape[1:],
                           output_shape = 1, model_name=model_name)
#         model = only_feature_model(input_shape_LSTM = train_x2_data.shape[1:], 
#                            input_shape_features = train_x_data.shape[1:],
#                            output_shape = 1, model_name=model_name) 

        epochs=100
        BATCH_SIZE= 512
        model.fit(input_model, train_y_data, epochs=epochs, batch_size= BATCH_SIZE)
        
        predict_features = features[-56:] # calander data of the 56 days to predict and 28 days of validation


        predict_total2 = np.zeros((len(itemseries),56))

        
        k = itemseries[:,:-28]
        k_agg = agg_features_dept[:,:-28]
        q = itemseries
        q_agg = agg_features_dept
 
        #making predictions for 56 days (28 validation and 28 for submission)
        for i in range(56):
            if i < 28:
                #k  = len(itemseries[0])
                lag_7 = k[:,-7]
                lag_28 = k[:,-28]
                rmean_7_7 = np.mean(k[:,-14:-7],axis=1)
                rmean_28_7 = np.mean(k[:,-35:-28],axis=1)
                rmean_7_28 = np.mean(k[:,-35:-7],axis=1)
                rmean_28_28 = np.mean(k[:,-56:-28],axis=1)
                means = np.column_stack((lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28)) #np.array([lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28])

                predict_total = np.zeros((len(itemseries), len(means[0])+predict_features.shape[1]))

                # timeseries data:
                lstm_input = np.zeros((len(itemseries), nr_lstm_features, nr_days_past))
                lstm_input_agg = k_agg[:,-nr_days_past:]
                for z in range(len(itemseries)):
                    predict_total[z] = np.concatenate((means[z],predict_features.iloc[i]), axis= 0)

                    lstm_input_sales = k[z][- nr_days_past:]
        
                    lstm_input[z] = np.vstack((lstm_input_sales, lstm_input_agg))
                predict = model.predict([lstm_input, predict_total]).squeeze()
                
                new_agg_dept = np.sum(predict)
                new_agg_dept_avg = np.mean(predict)
                new_agg = np.vstack((new_agg_dept, new_agg_dept_avg))
                
                k = np.column_stack((k, predict))
                k_agg = np.column_stack((k_agg, new_agg))
                predict_total2[:,i] = predict
            else:
                #k  = len(itemseries[0])
                lag_7 = q[:,-7]
                lag_28 = q[:,-28]
                rmean_7_7 = np.mean(q[:,-14:-7],axis=1)
                rmean_28_7 = np.mean(q[:,-35:-28],axis=1)
                rmean_7_28 = np.mean(q[:,-35:-7],axis=1)
                rmean_28_28 = np.mean(q[:,-56:-28],axis=1)
                means = np.column_stack((lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28)) #np.array([lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28])

                predict_total = np.zeros((len(itemseries), len(means[0])+predict_features.shape[1]))

                # timeseries data:
                lstm_input = np.zeros((len(itemseries), nr_lstm_features, nr_days_past))
                lstm_input_agg = q_agg[:,-nr_days_past:]
                for z in range(len(itemseries)):
                    predict_total[z] = np.concatenate((means[z],predict_features.iloc[i]), axis= 0)

                    lstm_input_sales = q[z][- nr_days_past:]
                    lstm_input[z] = np.vstack((lstm_input_sales, lstm_input_agg))

                predict = model.predict([lstm_input, predict_total]).squeeze()

                new_agg_dept = np.sum(predict)
                new_agg_dept_avg = np.mean(predict)
                new_agg = np.vstack((new_agg_dept, new_agg_dept_avg))
                
                q = np.column_stack((q, predict))
                q_agg = np.column_stack((q_agg, new_agg))
                predict_total2[:,i] = predict
                
        predict_total3[indexen] = predict_total2

dept0_model
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
timeseries_input (InputLayer)   [(None, 3, 14)]      0                                            
__________________________________________________________________________________________________
feature_input (InputLayer)      [(None, 39)]         0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 16)           1984        timeseries_input[0][0]           
__________________________________________________________________________________________________
dense (Dense)                   (None, 16)           640         feature_input[0][0]              
__________________________________________________________________________________

# Evaluate the Model

In [34]:
predict_val = predict_total3[:,:-28]
predict_sub = predict_total3[:,-28:]

In [35]:
val_days = range(lastDay+1-28, lastDay+1)
val_days_columns = [f'd_{i}' for i in val_days]
valid_preds2 = pd.DataFrame(predict_val ,columns = val_days_columns)
valid_preds2.head()

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0.930874,0.907512,0.907189,1.175665,1.098084,1.282609,1.273337,1.10218,1.21016,1.08673,...,1.244413,1.398359,1.247497,1.123616,1.093563,1.05021,1.122707,1.151175,1.329816,1.272513
1,0.083618,0.010749,0.081462,0.05592,0.151584,0.274078,0.261155,0.096658,0.068076,0.041104,...,0.164395,0.283131,0.233828,0.094542,0.061176,0.054668,0.039153,0.151866,0.27201,0.209231
2,0.630106,0.643991,0.664488,0.614266,0.770359,0.892962,0.867795,0.619596,0.601952,0.556907,...,0.793709,0.861669,0.837794,0.65679,0.62567,0.597813,0.602216,0.766995,0.884631,0.824329
3,1.944382,1.753974,1.676904,1.727058,2.147525,2.689654,2.523072,2.224973,2.198232,2.113267,...,2.420514,2.531961,2.633796,2.4336,2.273937,2.254748,2.3267,2.498366,2.74403,2.468734
4,1.104362,1.071525,1.175568,1.255107,1.275109,1.473839,1.782381,1.3482,1.386525,1.340753,...,1.562809,1.635952,1.591449,1.416902,1.417618,1.3757,1.436919,1.569004,1.701745,1.706163


In [36]:
# getting error of 28 validation days
val_score = evaluator.score(valid_preds2)
print(val_score)
with open('val_score.txt', 'w') as output:
    output.write(str(val_score))

0.6777108982283074


# Prepare submission

In [37]:
forecast_val= pd.DataFrame(predict_val)
forecast_eval = pd.DataFrame(predict_sub)
forecast_val.columns = [f'F{i}' for i in range(1, forecast_val.shape[1] + 1)]
forecast_eval.columns = [f'F{i}' for i in range(1, forecast_eval.shape[1] + 1)]
evaluation_ids  = sales_df['id'].values
validation_ids = [i.replace('evaluation', 'validation') for i in evaluation_ids]
ids = np.concatenate([validation_ids, evaluation_ids])
predictions = pd.DataFrame(ids, columns=['id'])
forecast = pd.concat([forecast_val, forecast_eval]).reset_index(drop=True)
final_rf =  pd.concat([predictions, forecast], axis=1)
final_rf.head(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.930874,0.907512,0.907189,1.175665,1.098084,1.282609,1.273337,1.10218,1.21016,...,1.244413,1.398359,1.247497,1.123616,1.093563,1.05021,1.122707,1.151175,1.329816,1.272513
1,HOBBIES_1_002_CA_1_validation,0.083618,0.010749,0.081462,0.05592,0.151584,0.274078,0.261155,0.096658,0.068076,...,0.164395,0.283131,0.233828,0.094542,0.061176,0.054668,0.039153,0.151866,0.27201,0.209231
2,HOBBIES_1_003_CA_1_validation,0.630106,0.643991,0.664488,0.614266,0.770359,0.892962,0.867795,0.619596,0.601952,...,0.793709,0.861669,0.837794,0.65679,0.62567,0.597813,0.602216,0.766995,0.884631,0.824329
3,HOBBIES_1_004_CA_1_validation,1.944382,1.753974,1.676904,1.727058,2.147525,2.689654,2.523072,2.224973,2.198232,...,2.420514,2.531961,2.633796,2.4336,2.273937,2.254748,2.3267,2.498366,2.74403,2.468734
4,HOBBIES_1_005_CA_1_validation,1.104362,1.071525,1.175568,1.255107,1.275109,1.473839,1.782381,1.3482,1.386525,...,1.562809,1.635952,1.591449,1.416902,1.417618,1.3757,1.436919,1.569004,1.701745,1.706163
5,HOBBIES_1_006_CA_1_validation,0.838011,0.755644,0.661011,0.566598,0.847219,0.85975,0.791376,0.710823,0.706126,...,0.767071,0.922423,0.838847,0.748744,0.701188,0.65585,0.692935,0.849622,0.905872,0.855187
6,HOBBIES_1_007_CA_1_validation,0.381265,0.300431,0.266079,0.240585,0.363367,0.545025,0.57995,0.394521,0.365862,...,0.451336,0.563667,0.522167,0.411145,0.336929,0.334701,0.325169,0.444893,0.606354,0.54541
7,HOBBIES_1_008_CA_1_validation,9.646862,8.238441,8.08948,8.16595,8.405257,9.092033,8.136514,8.012653,7.350353,...,8.039035,8.184278,8.190062,9.155047,7.875573,7.917787,8.0279,8.151815,8.414825,8.504071
8,HOBBIES_1_009_CA_1_validation,0.761088,1.31606,0.745143,0.77086,0.890957,0.93552,1.002075,0.764332,0.806788,...,0.839236,0.979363,0.980134,0.846037,0.964606,0.690585,0.733209,0.873863,1.02176,0.989285
9,HOBBIES_1_010_CA_1_validation,0.462279,0.42223,0.379704,0.380417,0.668002,0.620443,0.878846,0.596538,0.572381,...,0.678696,0.762222,0.709462,0.547684,0.521783,0.499353,0.53617,0.727466,0.786724,0.798235


In [38]:
final_rf.shape

(60980, 29)

In [39]:
final_rf.to_csv('submission.csv', index=False)