# Neural networks
In this notebook, we explore 2 different neural network architectures that succeeded in predicting sales volumes for a kaggle competition.

## Notebook settings

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from scipy.stats import describe
import seaborn as sns
from pylab import rcParams

pd.options.display.max_columns = 12
pd.options.display.max_rows = 24

# disable warnings in Anaconda
warnings.simplefilter('ignore')

# plots inisde jupyter notebook
%matplotlib inline

sns.set(style='darkgrid', palette='muted')
color_scheme = {
    'red': '#F1637A',
    'green': '#6ABB3E',
    'blue': '#3D8DEA',
    'black': '#000000'
}

# use svg for all plots within inline backend
%config InlineBackend.figure_format = 'svg'

# increase default plot size
rcParams['figure.figsize'] = 8, 6

## Loading data
In order for this to work, download all datasets from kaggle competition:  
https://www.kaggle.com/c/demand-forecasting-kernels-only  
and place them in `../data/` folder.  
We could not append datasets to our repo because of copyrights.

In [2]:
df_train = pd.read_csv('DemandData/train.csv')
df_test = pd.read_csv('DemandData/test.csv')

Before we do anything, we want to have both sets in a format with 500 sales columns and 1 row for each day.

In [3]:
df_train.index = pd.to_datetime(df_train['date'])
df_train.drop('date', axis=1, inplace=True)
df_test.index = pd.to_datetime(df_test['date'])
df_test.drop('date', axis=1, inplace=True)

In [4]:
df_train.head(10)

Unnamed: 0_level_0,store,item,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,1,1,13
2013-01-02,1,1,11
2013-01-03,1,1,14
2013-01-04,1,1,13
2013-01-05,1,1,10
2013-01-06,1,1,12
2013-01-07,1,1,10
2013-01-08,1,1,9
2013-01-09,1,1,12
2013-01-10,1,1,9


In [5]:
from itertools import product, starmap


def storeitems():
    return product(range(1,51), range(1,11))


def storeitems_column_names():
    return list(starmap(lambda i,s: f'item_{i}_store_{s}_sales', storeitems()))


def sales_by_storeitem(df):
    ret = pd.DataFrame(index=df.index.unique())
    for i, s in storeitems():
        ret[f'item_{i}_store_{s}_sales'] = df[(df['item'] == i) & (df['store'] == s)]['sales'].values
    return ret

In [6]:
df_train.head(5)

Unnamed: 0_level_0,store,item,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,1,1,13
2013-01-02,1,1,11
2013-01-03,1,1,14
2013-01-04,1,1,13
2013-01-05,1,1,10


In [7]:
df_train = sales_by_storeitem(df_train)
# for test set, we just fill y values with zeros (they won't be used anyway)
df_test['sales'] = np.zeros(df_test.shape[0])
df_test = sales_by_storeitem(df_test)

In [8]:
df_test.head()

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,...,item_50_store_5_sales,item_50_store_6_sales,item_50_store_7_sales,item_50_store_8_sales,item_50_store_9_sales,item_50_store_10_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-03,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-05,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_train.head(10)

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,...,item_50_store_5_sales,item_50_store_6_sales,item_50_store_7_sales,item_50_store_8_sales,item_50_store_9_sales,item_50_store_10_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,13,12,19,10,11,20,...,19,20,21,45,36,33
2013-01-02,11,16,8,12,9,6,...,25,23,30,54,44,37
2013-01-03,14,16,10,8,12,11,...,28,38,20,54,29,46
2013-01-04,13,20,15,15,8,7,...,27,33,27,52,43,51
2013-01-05,10,16,22,19,13,12,...,31,33,18,48,53,41
2013-01-06,12,18,22,14,15,9,...,24,28,26,51,38,41
2013-01-07,10,12,7,12,6,11,...,12,25,14,37,34,38
2013-01-08,9,11,15,16,11,9,...,27,27,27,44,34,33
2013-01-09,12,16,9,18,13,13,...,25,26,20,46,24,35
2013-01-10,9,10,7,16,9,9,...,36,31,20,44,40,41


In [10]:
df_train.shape

(1826, 500)

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2013-01-01 to 2017-12-31
Columns: 500 entries, item_1_store_1_sales to item_50_store_10_sales
dtypes: int64(500)
memory usage: 7.0 MB


In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 90 entries, 2018-01-01 to 2018-03-31
Columns: 500 entries, item_1_store_1_sales to item_50_store_10_sales
dtypes: float64(500)
memory usage: 352.3 KB


We combine data to prepare it for the model, and later split back into train and test set, not to repeat same transformations twice.

In [13]:
# make sure all column names are the same and in the same order
col_names = list(zip(df_test.columns, df_train.columns))
for cn in col_names:
    assert cn[0] == cn[1]

In [14]:
df_test['is_test'] = np.repeat(True, df_test.shape[0])
df_train['is_test'] = np.repeat(False, df_train.shape[0])
df_total = pd.concat([df_train, df_test])
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1916 entries, 2013-01-01 to 2018-03-31
Columns: 501 entries, item_1_store_1_sales to is_test
dtypes: bool(1), float64(500)
memory usage: 7.3 MB


### Features
One-hot encoding day of week and month of year to make sure the networks recognize the seasonality in the data.

In [15]:
weekday_df = pd.get_dummies(df_total.index.weekday, prefix='weekday')
weekday_df.index = df_total.index

In [16]:
month_df = pd.get_dummies(df_total.index.month, prefix='month')
month_df.index =  df_total.index

In [17]:
weekday_df.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,0,1,0,0,0,0,0
2013-01-02,0,0,1,0,0,0,0
2013-01-03,0,0,0,1,0,0,0
2013-01-04,0,0,0,0,1,0,0
2013-01-05,0,0,0,0,0,1,0


In [18]:
month_df.head()

Unnamed: 0_level_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-02,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-03,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-04,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-05,1,0,0,0,0,0,0,0,0,0,0,0


In [19]:
df_total.head()

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,...,item_50_store_6_sales,item_50_store_7_sales,item_50_store_8_sales,item_50_store_9_sales,item_50_store_10_sales,is_test
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-01,13.0,12.0,19.0,10.0,11.0,20.0,...,20.0,21.0,45.0,36.0,33.0,False
2013-01-02,11.0,16.0,8.0,12.0,9.0,6.0,...,23.0,30.0,54.0,44.0,37.0,False
2013-01-03,14.0,16.0,10.0,8.0,12.0,11.0,...,38.0,20.0,54.0,29.0,46.0,False
2013-01-04,13.0,20.0,15.0,15.0,8.0,7.0,...,33.0,27.0,52.0,43.0,51.0,False
2013-01-05,10.0,16.0,22.0,19.0,13.0,12.0,...,33.0,18.0,48.0,53.0,41.0,False


In [20]:
df_total = pd.concat([weekday_df, month_df, df_total], axis=1)

In [21]:
df_total.columns

Index(['weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'month_1', 'month_2', 'month_3',
       ...
       'item_50_store_2_sales', 'item_50_store_3_sales',
       'item_50_store_4_sales', 'item_50_store_5_sales',
       'item_50_store_6_sales', 'item_50_store_7_sales',
       'item_50_store_8_sales', 'item_50_store_9_sales',
       'item_50_store_10_sales', 'is_test'],
      dtype='object', length=520)

In [22]:
# assert df_total.isna().any().any() == False
assert df_total.isnull().any().any() == False

We also want to append sales from previous day to each row, which we will then use as input data.

In [23]:
def shift_series(series, days):
    return series.transform(lambda x: x.shift(days))


def shift_series_in_df(df, series_names=[], days_delta=90):
    ret = pd.DataFrame(index=df.index.copy())
    str_sgn = 'future' if np.sign(days_delta) < 0 else 'past'
    for sn in series_names:
        ret[f'{sn}_{str_sgn}_{np.abs(days_delta)}'] = shift_series(df[sn], days_delta)
    return ret

    
def stack_shifted_sales(df, days_deltas=[1, 90, 360]):
    names = storeitems_column_names()
    dfs = [df.copy()]
    for delta in days_deltas:
        shifted = shift_series_in_df(df, series_names=names, days_delta=delta)
        dfs.append(shifted)
#     return pd.concat(dfs, axis=1, sort=False, copy=False)
    return pd.concat(dfs, axis=1, copy=False)

In [24]:
df_total = stack_shifted_sales(df_total, days_deltas=[1])
df_total.dropna(inplace=True)

In [25]:
df_total.iloc[:,19:]

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,...,item_50_store_5_sales_past_1,item_50_store_6_sales_past_1,item_50_store_7_sales_past_1,item_50_store_8_sales_past_1,item_50_store_9_sales_past_1,item_50_store_10_sales_past_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-02,11.0,16.0,8.0,12.0,9.0,6.0,...,19.0,20.0,21.0,45.0,36.0,33.0
2013-01-03,14.0,16.0,10.0,8.0,12.0,11.0,...,25.0,23.0,30.0,54.0,44.0,37.0
2013-01-04,13.0,20.0,15.0,15.0,8.0,7.0,...,28.0,38.0,20.0,54.0,29.0,46.0
2013-01-05,10.0,16.0,22.0,19.0,13.0,12.0,...,27.0,33.0,27.0,52.0,43.0,51.0
2013-01-06,12.0,18.0,22.0,14.0,15.0,9.0,...,31.0,33.0,18.0,48.0,53.0,41.0
2013-01-07,10.0,12.0,7.0,12.0,6.0,11.0,...,24.0,28.0,26.0,51.0,38.0,41.0
2013-01-08,9.0,11.0,15.0,16.0,11.0,9.0,...,12.0,25.0,14.0,37.0,34.0,38.0
2013-01-09,12.0,16.0,9.0,18.0,13.0,13.0,...,27.0,27.0,27.0,44.0,34.0,33.0
2013-01-10,9.0,10.0,7.0,16.0,9.0,9.0,...,25.0,26.0,20.0,46.0,24.0,35.0
2013-01-11,9.0,15.0,19.0,14.0,9.0,11.0,...,36.0,31.0,20.0,44.0,40.0,41.0


In [26]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1915 entries, 2013-01-02 to 2018-03-31
Freq: D
Columns: 1020 entries, weekday_0 to item_50_store_10_sales_past_1
dtypes: bool(1), float64(1000), uint8(19)
memory usage: 14.7 MB


We need to make sure that stacked and not-stacked sales columns appar in the same order. We do this by sorting the names (as strings), which works fine because we only need 1 past day for the network (see regression models and their rolling mechanism if you are interested in rolling more than 1 past days of sales as input).

In [27]:
sales_cols = [col for col in df_total.columns if '_sales' in col and '_sales_' not in col]
stacked_sales_cols = [col for col in df_total.columns if '_sales_' in col]
other_cols = [col for col in df_total.columns if col not in set(sales_cols) and col not in set(stacked_sales_cols)]

sales_cols = sorted(sales_cols)
stacked_sales_cols = sorted(stacked_sales_cols)

new_cols = other_cols + stacked_sales_cols + sales_cols

In [28]:
new_cols

['weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'is_test',
 'item_10_store_10_sales_past_1',
 'item_10_store_1_sales_past_1',
 'item_10_store_2_sales_past_1',
 'item_10_store_3_sales_past_1',
 'item_10_store_4_sales_past_1',
 'item_10_store_5_sales_past_1',
 'item_10_store_6_sales_past_1',
 'item_10_store_7_sales_past_1',
 'item_10_store_8_sales_past_1',
 'item_10_store_9_sales_past_1',
 'item_11_store_10_sales_past_1',
 'item_11_store_1_sales_past_1',
 'item_11_store_2_sales_past_1',
 'item_11_store_3_sales_past_1',
 'item_11_store_4_sales_past_1',
 'item_11_store_5_sales_past_1',
 'item_11_store_6_sales_past_1',
 'item_11_store_7_sales_past_1',
 'item_11_store_8_sales_past_1',
 'item_11_store_9_sales_past_1',
 'item_12_store_10_sales_past_1',
 'item_12_store_1_sales_past_1',
 'item_12_stor

In [29]:
df_total = df_total.reindex(columns=new_cols)

In [30]:
df_total.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,item_9_store_4_sales,item_9_store_5_sales,item_9_store_6_sales,item_9_store_7_sales,item_9_store_8_sales,item_9_store_9_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-02,0,0,1,0,0,0,...,21.0,20.0,17.0,20.0,28.0,36.0
2013-01-03,0,0,0,1,0,0,...,25.0,15.0,28.0,18.0,31.0,25.0
2013-01-04,0,0,0,0,1,0,...,37.0,20.0,33.0,24.0,46.0,31.0
2013-01-05,0,0,0,0,0,1,...,37.0,23.0,27.0,14.0,35.0,30.0
2013-01-06,0,0,0,0,0,0,...,37.0,29.0,20.0,24.0,34.0,35.0


In [31]:
# assert df_total.isna().any().any() == False
assert df_total.isnull().any().any() == False

### Scaling
With combined datasets and shifted sales, we can now correctly min-max scale all data.
This is necessary for neural networks to work as intended.

In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [33]:
scaler = MinMaxScaler(feature_range=(0,1))
cols_to_scale = [col for col in df_total.columns if 'weekday' not in col and 'month' not in col]
scaled_cols = scaler.fit_transform(df_total[cols_to_scale])
df_total[cols_to_scale] = scaled_cols

In [34]:
df_total.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,item_9_store_4_sales,item_9_store_5_sales,item_9_store_6_sales,item_9_store_7_sales,item_9_store_8_sales,item_9_store_9_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2013-01-02,0,0,1,0,0,0,...,0.189189,0.238095,0.209877,0.25641,0.208955,0.327273
2013-01-03,0,0,0,1,0,0,...,0.225225,0.178571,0.345679,0.230769,0.231343,0.227273
2013-01-04,0,0,0,0,1,0,...,0.333333,0.238095,0.407407,0.307692,0.343284,0.281818
2013-01-05,0,0,0,0,0,1,...,0.333333,0.27381,0.333333,0.179487,0.261194,0.272727
2013-01-06,0,0,0,0,0,0,...,0.333333,0.345238,0.246914,0.307692,0.253731,0.318182


### Split back to train and test set

In [35]:
df_train = df_total[df_total['is_test'] == False].drop('is_test', axis=1)
df_test = df_total[df_total['is_test'] == True].drop('is_test', axis=1)

## Training models
First, we need to separate training data into input and target vectors, and separate part of training data as validation data for our models.

In [36]:
X_cols_stacked = [col for col in df_train.columns if '_past_' in col]
X_cols_caldata = [col for col in df_train.columns if 'weekday_' in col or 'month_' in col or 'year' in col]
X_cols = X_cols_stacked + X_cols_caldata

X = df_train[X_cols]

In [37]:
X_cols

['item_10_store_10_sales_past_1',
 'item_10_store_1_sales_past_1',
 'item_10_store_2_sales_past_1',
 'item_10_store_3_sales_past_1',
 'item_10_store_4_sales_past_1',
 'item_10_store_5_sales_past_1',
 'item_10_store_6_sales_past_1',
 'item_10_store_7_sales_past_1',
 'item_10_store_8_sales_past_1',
 'item_10_store_9_sales_past_1',
 'item_11_store_10_sales_past_1',
 'item_11_store_1_sales_past_1',
 'item_11_store_2_sales_past_1',
 'item_11_store_3_sales_past_1',
 'item_11_store_4_sales_past_1',
 'item_11_store_5_sales_past_1',
 'item_11_store_6_sales_past_1',
 'item_11_store_7_sales_past_1',
 'item_11_store_8_sales_past_1',
 'item_11_store_9_sales_past_1',
 'item_12_store_10_sales_past_1',
 'item_12_store_1_sales_past_1',
 'item_12_store_2_sales_past_1',
 'item_12_store_3_sales_past_1',
 'item_12_store_4_sales_past_1',
 'item_12_store_5_sales_past_1',
 'item_12_store_6_sales_past_1',
 'item_12_store_7_sales_past_1',
 'item_12_store_8_sales_past_1',
 'item_12_store_9_sales_past_1',
 'item_

In [38]:
X_colset = set(X_cols)
y_cols = [col for col in df_train.columns if col not in X_colset]

y = df_train[y_cols]

In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1825 entries, 2013-01-02 to 2017-12-31
Freq: D
Columns: 519 entries, item_10_store_10_sales_past_1 to month_12
dtypes: float64(500), uint8(19)
memory usage: 7.0 MB


In [40]:
y.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1825 entries, 2013-01-02 to 2017-12-31
Freq: D
Columns: 500 entries, item_10_store_10_sales to item_9_store_9_sales
dtypes: float64(500)
memory usage: 7.0 MB


In [41]:
X.columns

Index(['item_10_store_10_sales_past_1', 'item_10_store_1_sales_past_1',
       'item_10_store_2_sales_past_1', 'item_10_store_3_sales_past_1',
       'item_10_store_4_sales_past_1', 'item_10_store_5_sales_past_1',
       'item_10_store_6_sales_past_1', 'item_10_store_7_sales_past_1',
       'item_10_store_8_sales_past_1', 'item_10_store_9_sales_past_1',
       ...
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object', length=519)

In [42]:
y.columns

Index(['item_10_store_10_sales', 'item_10_store_1_sales',
       'item_10_store_2_sales', 'item_10_store_3_sales',
       'item_10_store_4_sales', 'item_10_store_5_sales',
       'item_10_store_6_sales', 'item_10_store_7_sales',
       'item_10_store_8_sales', 'item_10_store_9_sales',
       ...
       'item_9_store_10_sales', 'item_9_store_1_sales', 'item_9_store_2_sales',
       'item_9_store_3_sales', 'item_9_store_4_sales', 'item_9_store_5_sales',
       'item_9_store_6_sales', 'item_9_store_7_sales', 'item_9_store_8_sales',
       'item_9_store_9_sales'],
      dtype='object', length=500)

In [43]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

To achieve similar error values as on Kaggle, we will use only 1st quarter of 2017 for validation (test set contains 1st quarter of 2018 and the data is highly seasonal).

In [44]:
X_valid, y_valid = X_valid.head(90), y_valid.head(90)

For Keras, we need to make further transformations on input values:

In [45]:
X_train_vals = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_valid_vals = X_valid.values.reshape((X_valid.shape[0], 1, X_valid.shape[1]))

In [46]:
X_train_vals.shape

(1460, 1, 519)

In [47]:
X_train_vals

array([[[0.53846154, 0.39694656, 0.48      , ..., 0.        ,
         0.        , 0.        ]],

       [[0.42307692, 0.33587786, 0.45714286, ..., 0.        ,
         1.        , 0.        ]],

       [[0.75      , 0.79389313, 0.80571429, ..., 0.        ,
         0.        , 0.        ]],

       ...,

       [[0.33333333, 0.38931298, 0.38857143, ..., 0.        ,
         0.        , 0.        ]],

       [[0.78846154, 0.89312977, 0.77714286, ..., 0.        ,
         0.        , 0.        ]],

       [[0.80128205, 0.75572519, 0.79428571, ..., 0.        ,
         0.        , 0.        ]]])

### Defining models

In [48]:
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Conv1D, Input, Dropout, AvgPool1D, Reshape, Concatenate

Using TensorFlow backend.


In [49]:
X_train_vals.shape

(1460, 1, 519)

Simple LSTM model:

In [50]:
X_train_vals.shape[1], X_train_vals.shape[2]

(1, 519)

In [51]:
basic_model = Sequential()
basic_model.add(LSTM(500, input_shape=(X_train_vals.shape[1], X_train_vals.shape[2])))
basic_model.add(Dense(500))
basic_model.compile(loss='mean_absolute_error', optimizer='adam')

More complex model combining LSTM with convolutional layers:

In [52]:
inputs = Input(shape=(X_train_vals.shape[1], X_train_vals.shape[2]))
# top pipeline
top_lstm = LSTM(500)(inputs)
top_dense = Dense(500, activation='relu')(top_lstm)
top_dropout = Dropout(500)(top_dense)
# bottom pipeline
bottom_dense = Dense(500)(inputs)
bottom_conv1 = Conv1D(
    500, 
    kernel_size=1,
    input_shape=(X_train_vals.shape[1], X_train_vals.shape[2])
)(bottom_dense)
bottom_conv2 = Conv1D(
    1000,
    kernel_size=50,
    padding='same',
    activation='relu'
)(bottom_conv1)
bottom_conv3 = Conv1D(
    500,
    kernel_size=10,
    padding='same',
    activation='relu'
)(bottom_conv2)
bottom_pooling = AvgPool1D(
    pool_size=60, 
    padding='same'
)(bottom_conv3)
bottom_reshape = Reshape(
    target_shape=[500]
)(bottom_conv3)
# concat output from both pipelines
final_concat = Concatenate()([top_dropout, bottom_reshape])
final_dense = Dense(500)(final_concat)
# compile and return
complex_model = Model(inputs=inputs, outputs=final_dense)
complex_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

In [53]:
print(complex_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1, 519)       0                                            
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 1, 500)       260000      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1, 500)       250500      dense_3[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 500)          2040000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (

Now we can fit the models:

In [None]:
basic_history = basic_model.fit(
    X_train_vals, 
    y_train.values, 
    epochs=60, 
    batch_size=30,
    validation_data=(X_valid_vals, y_valid.values),
    verbose=2,
    shuffle=False
)

In [None]:
def plot_history(history):
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    plt.show()

In [None]:
plot_history(basic_history)

In [None]:
complex_history = complex_model.fit(
    X_train_vals, 
    y_train.values, 
    epochs=130, 
    batch_size=70,
    validation_data=(X_valid_vals, y_valid.values),
    verbose=2,
    shuffle=False
)

In [None]:
plot_history(complex_history)

## Evaluating model predictions

In [None]:
def model_eval(model, X_test, y_test):
    """
    Evaluate (step-by-step) model predictions from X_test and return predictions and real values in comparable format.
    """
    # prepare data
    sales_x_cols = [col for col in X_test.columns if 'sales' in col]
    sales_x_idxs = [X_test.columns.get_loc(col) for col in sales_x_cols]
    sales_y_cols = [col for col in y_test.columns if 'sales' in col]
    sales_y_idxs = [y_test.columns.get_loc(col) for col in sales_y_cols]
    n_samples = y_test.shape[0]
    y_pred = np.zeros(y_test.shape)
    # iterate
    x_next = X_test.iloc[0].values
    for i in range(0, n_samples):
        x_arr = np.array([x_next])
        x_arr = x_arr.reshape(x_arr.shape[0], 1, x_arr.shape[1])
        y_pred[i] = model.predict(x_arr)[0] # input for prediction must be 2d, output is immediately extracted from 2d to 1d
        try:
            x_next = X_test.iloc[i+1].values
            x_next[sales_x_idxs] = y_pred[i][sales_y_idxs]
        except IndexError:
            pass  # this happens on last iteration, and x_next does not matter anymore
    return y_pred, y_test.values


def unscale(y_arr, scaler, template_df, toint=False):
    """
    Unscale array y_arr of model predictions, based on a scaler fitted 
    to template_df.
    """
    tmp = template_df.copy()
    tmp[y_cols] = pd.DataFrame(y_arr, index=tmp.index)
    tmp[cols_to_scale] = scaler.inverse_transform(tmp[cols_to_scale])
    if toint:
        return tmp[y_cols].astype(int)
    return tmp[y_cols]


def vector_smape(y_pred, y_real):
    nom = np.abs(y_pred-y_real)
    denom = (np.abs(y_pred) + np.abs(y_real)) / 2
    results = nom / denom
    return 100*np.mean(results)  # in percent, same as at kaggle

In [None]:
y_pred_basic, y_real = model_eval(basic_model, X_valid, y_valid)

In [None]:
y_pred_complex = model_eval(complex_model, X_valid, y_valid)[0]

In [None]:
# this is just for unscaling
template_df = pd.concat([X_valid, y_valid], axis=1)
template_df['is_test'] = np.repeat(True, template_df.shape[0])

In [None]:
basic_pred = unscale(y_pred_basic, scaler, template_df, toint=True)
complex_pred = unscale(y_pred_complex, scaler, template_df, toint=True)
real = unscale(y_real, scaler, template_df, toint=True)

In [None]:
basic_smapes = [vector_smape(basic_pred[col], real[col]) for col in basic_pred.columns]
complex_smapes = [vector_smape(complex_pred[col], real[col]) for col in complex_pred.columns]

In [None]:
sns.distplot(basic_smapes, label='Basic model')
sns.distplot(complex_smapes, label='Complex model')
plt.legend(loc='upper right')
plt.savefig('smape_basic_vs_complex.svg')
plt.show()

Depending on training parameters, basic model can sometimes achieve results comparable to the complex one, however after uploading to Kaggle complex model achieved better (and more consistent) results.

In [None]:
describe(basic_smapes)

In [None]:
describe(complex_smapes)

### Visualizing model prediction
We will plot predictions of 2 models for a sample store and item.

In [None]:
store, item = 1,1
plot_lengths = [7, 30, 90]
rolling_mean_windows = [1, 1, 2]  # to make plots more readable

In [None]:
basic_pred

In [None]:
storeitem_col = f'item_{item}_store_{store}_sales'

for pl, mw in zip(plot_lengths, rolling_mean_windows):
    plt.plot(basic_pred[storeitem_col].rolling(mw).mean().values[:pl],
             color_scheme['blue'],
             lw=2,
             label='Basic model prediction')
    plt.plot(complex_pred[storeitem_col].rolling(mw).mean().values[:pl],
             color_scheme['green'],
             lw=2, 
             label='Complex model prediction')
    plt.plot(real[storeitem_col].rolling(mw).mean().values[:pl],
             color_scheme['black'],
             lw=2, 
             label='Real values')
    plt.legend(loc='upper left')
    plt.savefig(f'plot_prediction_{pl}_{mw}.svg')
    plt.show()

In [None]:
df1 = basic_pred['item_1_store_1_sales']
df1.rolling(1).mean().values

In [None]:
storeitem_col

In [None]:
for pl, mw in zip(plot_lengths, rolling_mean_windows):
    print('pl: '+ str(pl) + ' mw: '+ str(mw))

In [None]:
complex_pred['item_1_store_1_sales']

In [None]:
inputs = Input(shape=(X_train_vals.shape[1], X_train_vals.shape[2]))
# top pipeline
top_lstm = LSTM(500)(inputs)
top_dense = Dense(500, activation='relu')(top_lstm)
top_dropout = Dropout(500)(top_dense)
# bottom pipeline
bottom_dense = Dense(500)(inputs)
bottom_conv1 = Conv1D(500,kernel_size=1,input_shape=(X_train_vals.shape[1], X_train_vals.shape[2]))(bottom_dense)
bottom_conv2 = Conv1D(1000,kernel_size=50,padding='same',activation='relu')(bottom_conv1)
bottom_conv3 = Conv1D(500,kernel_size=10,padding='same',activation='relu')(bottom_conv2)
bottom_pooling = AvgPool1D(pool_size=60,padding='same')(bottom_conv3)
bottom_reshape = Reshape(target_shape=[500])(bottom_conv3)
# concat output from both pipelines
final_concat = Concatenate()([top_dropout, bottom_reshape])
final_dense = Dense(500)(final_concat)
# compile and return
complex_model = Model(inputs=inputs, outputs=final_dense)
complex_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mape'])

In [None]:
X_train_vals.shape[1], X_train_vals.shape[2]
# 1, 519