# Part 1: Import Data and EDA

In [None]:
import warnings 
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 

from math import sqrt
from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt  
import seaborn as sns
import statsmodels.api as sm

import plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [None]:
# Load the competition files; the `date` column of train/test is parsed to datetime.
train = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/train.csv',
                    parse_dates=['date'])
test = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/test.csv',
                   parse_dates=['date'])
sample = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/sample_submission.csv')

In [None]:
# Report the (rows, columns) shape of both frames.
shape_msg = "Train and Test shape are {} and {} respectively"
print(shape_msg.format(train.shape, test.shape))

In [None]:
# Date coverage of the training and test sets.
date_bounds = [
    ("Min date from train set: {}", train.date.min()),
    ("Max date from train set: {}", train.date.max()),
    ("Min date from test set: {}", test.date.min()),
    ("Max date from test set: {}", test.date.max()),
]
for template, bound in date_bounds:
    print(template.format(bound))

The forecast period is 3 months. There are 10 stores and 50 items, i.e. 500 store–item combinations.

## Overall daily sales

In [None]:
# Total sales over all stores and items for each calendar day.
daily_sales = train.groupby('date')['sales'].sum().reset_index()
daily_sales_sc = go.Scatter(x=daily_sales['date'], y=daily_sales['sales'])
fig = go.Figure(data=daily_sales_sc)
iplot(fig)

The series shows a yearly pattern, with sales peaking around July and dipping at the beginning of each year, on top of an overall upward trend.

## Overall weekly sales

In [None]:
# Mean sales per day of week (`dt.weekday`: 0 = Monday ... 6 = Sunday).
train['weekday'] = train['date'].dt.weekday
week_sales = train.groupby('weekday')['sales'].mean().reset_index()
week_sales_sc = go.Scatter(x=week_sales['weekday'], y=week_sales['sales'])
fig = go.Figure(data=week_sales_sc)
iplot(fig)

Saturday and Sunday show higher sales than the weekdays.

## Daily Sales by Store

In [None]:
# Daily total sales per store, drawn as one trace per store.
store_sales = train.groupby(['date', 'store'], as_index=False)['sales'].sum()
store_sales_sc = [
    go.Scatter(x=rows['date'], y=rows['sales'], name=('store %s' % store_id))
    for store_id, rows in (
        (s, store_sales[store_sales['store'] == s]) for s in store_sales.store.unique()
    )
]
fig = go.Figure(data=store_sales_sc)
iplot(fig)

Seasonality and trend for each store are similar

## Daily sales by item

In [None]:
# Daily total sales per item, drawn as one trace per item.
item_sales = train.groupby(['date', 'item'], as_index=False)['sales'].sum()
item_sales_sc = [
    go.Scatter(x=rows['date'], y=rows['sales'], name='item %s' % item_id)
    for item_id, rows in (
        (it, item_sales[item_sales['item'] == it]) for it in item_sales['item'].unique()
    )
]
fig = go.Figure(data=item_sales_sc)
iplot(fig)

Seasonality and trend for each item are similar

# Part 2: ARIMA, SARIMA, SARIMAX

## Method 1: find p, d, q with auto_arima

In [None]:
# Stores and items follow similar patterns, so the store 1 / item 1 series is
# used as the representative series for studying the time-series parameters.
is_store_1 = train['store'] == 1
is_item_1 = train['item'] == 1
train_1 = train[is_store_1 & is_item_1]

In [None]:
# Hold out the last `n_valid` observations of the store-1/item-1 series for
# validation. The frame is reduced to the `sales` column indexed by `date`
# before slicing, so no slice of `train_1` is modified in place.
n_valid = 90
series_1 = train_1.drop(columns=['store', 'item', 'weekday']).set_index('date')

train_1_df = series_1.iloc[:-n_valid]
valid_1_df = series_1.iloc[-n_valid:]

In [None]:
#built the model
!pip install pmdarima
from pmdarima import auto_arima

stepwise_model = auto_arima(train_1_df,m=7,
                           seasonal=True,
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True)

In [None]:
# Refit on the training window and forecast one value per validation date.
stepwise_model.fit(train_1_df)
forecast_stp = stepwise_model.predict(n_periods=len(valid_1_df))
# np.asarray strips any index carried by the forecast, so the values are placed
# positionally on the validation dates; wrapping an already-indexed Series in
# pd.DataFrame(..., index=...) would re-index it and could yield NaN.
forecast_stp = pd.DataFrame(np.asarray(forecast_stp),
                            index=valid_1_df.index,
                            columns=['Prediction'])

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(valid_1_df, label='Valid')
ax.plot(forecast_stp, label='Prediction')
ax.legend()  # the labels above are only shown once a legend is drawn
plt.show()

In [None]:
# Hold-out metrics for the auto_arima model, stored as percent-formatted strings.
# NOTE: `mse1` holds the root-mean-squared error divided by 100, not the MSE.
y_true = valid_1_df.sales
y_pred = forecast_stp.Prediction

rmse_1 = sqrt(mean_squared_error(valid_1_df, forecast_stp))
mse1 = "{:.2%}".format(rmse_1 / 100)

abs_pct_err = abs((y_true - y_pred) / y_true)
mape1 = "{:.2%}".format(np.mean(abs_pct_err))

# Symmetric APE; 0/0 terms (both values zero) are counted as zero error.
sym_err = (np.abs(y_pred - y_true) * 2 / (np.abs(y_pred) + np.abs(y_true))).fillna(0)
smape1 = "{:.2%}".format(np.mean(sym_err))

In [None]:
# Residual diagnostic plots for the auto_arima model.
stepwise_model.plot_diagnostics(figsize=(16, 8))
plt.show()

The residuals are approximately normally distributed and show no significant autocorrelation.

In [None]:
# Comparison table, one row per candidate model.
# - The model label and AIC are hard-coded; presumably copied from the
#   auto_arima trace output -- TODO confirm after re-running the search.
# - The 'MSE' column holds `mse1`, which is computed as RMSE / 100.
# - A plain nested list (instead of np.array) keeps the AIC numeric, and the
#   fifth column is named 'SMAPE' (it was a second 'MAPE' before).
model_eval = pd.DataFrame(
    data=[['(3,1,1)(1,0,1)[7]', 10335.10, mse1, mape1, smape1, 'Pass']],
    columns=['model', 'AIC', 'MSE', 'MAPE', 'SMAPE', 'Residual Test'],
)
model_eval = model_eval.set_index('model')
model_eval

## Method 2: ACF and PACF results determine p, d, q

In [None]:
# Index the store-1/item-1 series by date.
# NOTE: not idempotent -- re-running this cell fails once `date` is the index.
train_1 = train_1.set_index('date')
train_1.head()

### a. stationarity check

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, window = 12, cutoff = 0.01):
    """Run an augmented Dickey-Fuller test on `timeseries` and print the outcome.

    Parameters
    ----------
    timeseries : series passed straight to `adfuller`.
    window : accepted for interface compatibility; not used by this function.
    cutoff : p-value threshold below which the series is reported as likely
        stationary.

    Prints a one-line verdict followed by the test statistic, p-value, lags
    used, number of observations and the critical values. Returns None.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC', maxlag = 20 )

    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = dict(zip(labels, adf_result[0:4]))
    # adf_result[4] maps significance level -> critical value.
    summary.update({'Critical Value (%s)'%level: crit
                    for level, crit in adf_result[4].items()})
    dfoutput = pd.Series(summary)

    pvalue = adf_result[1]
    verdict = ('p-value = %.4f. The series is likely stationary.'
               if pvalue < cutoff
               else 'p-value = %.4f. The series is likely non-stationary.')
    print(verdict % pvalue)

    print(dfoutput)

In [None]:
# Stationarity check on the raw (undifferenced) sales series.
test_stationarity(train_1['sales'])

Differencing is needed for this time series

### b. Differencing

In [None]:
# First-order difference; the leading NaN produced by differencing is dropped
# before the stationarity test.
first_diff = train_1.sales.diff().dropna()
test_stationarity(first_diff)

Differencing once is enough to make the time series stationary.

In [None]:
import statsmodels.api as sm

# ACF (top) and PACF (bottom) of the differenced series, 40 lags each.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
fig = sm.graphics.tsa.plot_acf(first_diff, lags=40, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(first_diff, lags=40, ax=ax2)

Both the ACF and the PACF show a recurring pattern every 7 lags, which indicates a weekly seasonal pattern.

1. The AR lags are significant at lag 6.
2. The ACF spikes at lags 7, 14, 21, 28, 35, …,
   while the PACF decays exponentially at the seasonal lags (7, 14, …).
   This suggests a seasonal order of (0,1,1,7).

In [None]:
# Rebuild the store-1/item-1 series indexed by date.
train_1 = train[(train['store']==1)&(train['item']==1)].set_index('date')

# Hold-out window; `end_index1` stops one day before `end_index`.
start_index = '2017-10-01'
end_index = '2017-12-31'
end_index1 = '2017-12-30'

# NOTE(review): label slicing is inclusive, so the fitted sample contains
# `start_index` itself, which is also the first forecast date -- confirm this
# one-day overlap is intended.
sarima_spec = sm.tsa.statespace.SARIMAX(
    endog=train_1.sales[:start_index],
    order=(7,1,0),
    seasonal_order=(0,1,1,7),
    freq='D',
)
sarima_mod7 = sarima_spec.fit()

# Dynamic prediction over the hold-out window, aligned to `train_1` by date.
train_1['forecast'] = sarima_mod7.predict(
    start=pd.to_datetime(start_index),
    end=pd.to_datetime(end_index),
    dynamic=True,
)

In [None]:
# Actual vs. forecast sales from start_index through end_index1.
train_1[start_index:end_index1][['sales', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Hold-out metrics for the SARIMA model, stored as percent-formatted strings.
# NOTE: `mse2` is RMSE / 100 over start_index..end_index, whereas MAPE/SMAPE
# are computed over start_index..end_index1 (one day shorter).
holdout = train_1[start_index:end_index]
rmse_2 = sqrt(mean_squared_error(holdout['sales'], holdout['forecast']))
mse2 = "{:.2%}".format(rmse_2 / 100)

y_true = train_1[start_index:end_index1]['sales']
y_pred = train_1[start_index:end_index1]['forecast']

abs_pct_err = abs((y_true - y_pred) / y_true)
mape2 = "{:.2%}".format(np.mean(abs_pct_err))

sym_err = (np.abs(y_pred - y_true) * 2 / (np.abs(y_pred) + np.abs(y_true))).fillna(0)
smape2 = "{:.2%}".format(np.mean(sym_err))

In [None]:
# Root-mean-squared error of the hold-out forecast, in sales units. The
# variable keeps its original name `mse`, but the value is an RMSE and not a
# percentage, so the printed label says so.
mse = sqrt(mean_squared_error(train_1[start_index:end_index]['sales'],
                              train_1[start_index:end_index]['forecast']))
print('RMSE: %.2f \nAIC: %.2f' % (mse, sarima_mod7.aic))

In [None]:
# Residual diagnostic plots for the fitted (7,1,0)(0,1,1,7) model.
sarima_mod7.plot_diagnostics(figsize=(16, 8))
plt.show()

In [None]:
# ACF (top) and PACF (bottom) of the SARIMA residuals, 40 lags each.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
fig = sm.graphics.tsa.plot_acf(sarima_mod7.resid, lags=40, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(sarima_mod7.resid, lags=40, ax=ax2)

The residuals are still autocorrelated.

In [None]:
# Add the manually specified SARIMA model to the comparison table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same row again.
model2 = pd.DataFrame(data=[[sarima_mod7.aic, mse2, mape2, smape2, 'Fail']],
                      columns=model_eval.columns,
                      index=['(7,1,0)(0,1,1)[7]'])
model_eval = pd.concat([model_eval, model2])
model_eval

## Method 3: grid search find p,d,q

In [None]:
import itertools

# Candidate non-seasonal (p, d, q) and seasonal (P, D, Q) orders.
p, d, q = range(2,5), range(1,2), range(0,2)
sp = sd = sq = range(0,2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(sp, sd, sq))

print('Examples of parameter combinations for Seasonal ARIMA...')
for pdq_idx, seasonal_idx in ((1, 0), (1, 1), (2, 2)):
    print('SARIMAX: {} x {}'.format(pdq[pdq_idx], seasonal_pdq[seasonal_idx]))

In [None]:
# Grid search: fit every (p, d, q) x (P, D, Q) combination and print its AIC.
start_index = '2017-10-01'
seasonal_period = 7  # weekly seasonality, as in the other models of this notebook

for param in pdq:
    for param_seasonal in seasonal_pdq:
        # SARIMAX expects seasonal_order as (P, D, Q, s). The candidates are
        # built as (P, D, Q) triples, so the period is appended here; a
        # candidate that already carries a period has it replaced.
        seasonal_order = tuple(param_seasonal)[:3] + (seasonal_period,)
        try:
            mod = sm.tsa.statespace.SARIMAX(train_1.sales[:start_index], order=param,
                                            seasonal_order=seasonal_order,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False,
                                            freq='D')
            results = mod.fit()
        except Exception as err:
            # Report the failure instead of skipping silently, so a
            # misconfigured search cannot pass unnoticed.
            print('ARIMA{}x{} - skipped ({}: {})'.format(
                param, seasonal_order, type(err).__name__, err))
            continue
        print('ARIMA{}x{} - AIC:{}'.format(param, seasonal_order, results.aic))


In [None]:
# Same fit as in Method 2: (7,1,0)(0,1,1,7) on the store-1/item-1 series.
store_item_mask = (train['store']==1)&(train['item']==1)
train_1 = train[store_item_mask]
train_1 = train_1.set_index('date')
start_index, end_index, end_index1 = '2017-10-01', '2017-12-31', '2017-12-30'

sarima_spec = sm.tsa.statespace.SARIMAX(endog=train_1.sales[:start_index],
                                        order=(7,1,0),
                                        seasonal_order=(0,1,1,7),
                                        freq='D')
sarima_mod7 = sarima_spec.fit()

# Dynamic prediction over the hold-out window, aligned to `train_1` by date.
forecast_start = pd.to_datetime(start_index)
forecast_end = pd.to_datetime(end_index)
train_1['forecast'] = sarima_mod7.predict(start=forecast_start,
                                          end=forecast_end,
                                          dynamic=True)

## SARIMAX: adding external variables

### c. SARIMAX external variables: month, day of week

Based on the EDA results, the weekly pattern is handled by the seasonal order of the model (period 7), while the year, month and day-of-week effects are added as external (dummy) variables.

In [None]:
# Calendar features: year offset (date year minus 2012), month, and day of week.
train['year'] = train['date'].dt.year - 2012
train['month'] = train['date'].dt.month
train['weekday'] = train['date'].dt.weekday

# One-hot encode the calendar features. `dtype=int` keeps the dummies numeric:
# pandas >= 2.0 returns boolean columns by default, and these columns are later
# passed to SARIMAX as `exog`.
# NOTE: not idempotent -- the encoded source columns are consumed, so this cell
# cannot be re-run without reloading `train`.
train = pd.get_dummies(train,
                       columns=['year', 'month', 'weekday'],
                       prefix=['year', 'month', 'weekday'],
                       dtype=int)

In [None]:
# Preview the frame after one-hot encoding.
train.head()

In [None]:
# Store-1/item-1 rows of the one-hot-encoded frame.
train_1 = train.loc[train['store'].eq(1) & train['item'].eq(1)]

In [None]:
# Preview the store-1/item-1 subset.
train_1.head()

In [None]:
# `date` plus the dummy columns: year_1..year_5, month_1..month_12, weekday_0..weekday_6.
ext_var_list = (
    ['date']
    + ['year_%d' % k for k in range(1, 6)]
    + ['month_%d' % k for k in range(1, 13)]
    + ['weekday_%d' % k for k in range(0, 7)]
)

exog_data = train_1[ext_var_list]
exog_data.head()

In [None]:
# Index both the series and its exogenous regressors by date so they can be
# sliced with date strings.
# NOTE: not idempotent -- re-running fails once `date` is the index.
train_1 = train_1.set_index('date')
exog_data = exog_data.set_index('date')

In [None]:
# Preview the date-indexed series.
train_1.head()

In [None]:
# Preview the date-indexed exogenous regressors.
exog_data.head()

In [None]:
# Hold-out window boundaries used by the SARIMAX cells below.
start_index = '2017-10-01'
end_index = '2017-12-31'
end_index1 = '2017-12-30'#predict 90 days

In [None]:
# SARIMAX (7,1,0)(0,1,1,7) with the calendar dummies as exogenous regressors.
# Both slices are label-based and include `start_index` itself.
sarimax_spec7 = sm.tsa.statespace.SARIMAX(
    endog=train_1.sales[:start_index],
    exog=exog_data[:start_index],
    order=(7,1,0),
    seasonal_order=(0,1,1,7),
    freq='D',
)
sarimax_mod7 = sarimax_spec7.fit()
sarimax_mod7.summary()

In [None]:
# SARIMAX (3,1,1)(1,0,1,7) with the calendar dummies as exogenous regressors.
# Both slices are label-based and include `start_index` itself.
sarimax_spec3 = sm.tsa.statespace.SARIMAX(
    endog=train_1.sales[:start_index],
    exog=exog_data[:start_index],
    order=(3,1,1),
    seasonal_order=(1,0,1,7),
    freq='D',
)
sarimax_mod3 = sarimax_spec3.fit()
sarimax_mod3.summary()

In [None]:
# Forecast the hold-out window with both SARIMAX models.
#
# The exogenous rows passed to `predict` must be the ones for the dates AFTER
# each model's fitted sample. `exog_data` is row-aligned with the series the
# models were fitted on, so those rows start at position `nobs`. Slicing
# `start_index:end_index1` instead has the right length only by coincidence
# and feeds every forecast day the regressors of the previous day.
exog_future_7 = exog_data.iloc[int(sarimax_mod7.nobs):][:end_index]
exog_future_3 = exog_data.iloc[int(sarimax_mod3.nobs):][:end_index]

train_1['forecast_7'] = sarimax_mod7.predict(start=pd.to_datetime(start_index),
                                             end=pd.to_datetime(end_index),
                                             exog=exog_future_7,
                                             dynamic=True)
train_1['forecast_3'] = sarimax_mod3.predict(start=pd.to_datetime(start_index),
                                             end=pd.to_datetime(end_index),
                                             exog=exog_future_3,
                                             dynamic=True)

In [None]:
# Actual sales vs. both SARIMAX forecasts from start_index through end_index1.
train_1[start_index:end_index1][['sales', 'forecast_7','forecast_3']].plot(figsize=(12, 8))

In [None]:
# Hold-out metrics for SARIMAX (7,1,0)(0,1,1,7).
# NOTE: `mse7` is RMSE / 100 formatted as a percentage string.
holdout = train_1[start_index:end_index]
rmse_7 = sqrt(mean_squared_error(holdout['sales'], holdout['forecast_7']))
mse7 = "{:.2%}".format(rmse_7 / 100)

y_true = train_1[start_index:end_index1]['sales']
y_pred = train_1[start_index:end_index1]['forecast_7']

abs_pct_err = abs((y_true - y_pred) / y_true)
mape7 = "{:.2%}".format(np.mean(abs_pct_err))

sym_err = (np.abs(y_pred - y_true) * 2 / (np.abs(y_pred) + np.abs(y_true))).fillna(0)
smape7 = "{:.2%}".format(np.mean(sym_err))

print('SARIMAX (7,1,0)(0,1,1,7) \nAIC: %.2f'% (sarimax_mod7.aic), '\nMSE: ',mse7,'\nMAPE: ',mape7, '\nSMAPE: ', smape7)

In [None]:
# Hold-out metrics for SARIMAX (3,1,1)(1,0,1,7).
# NOTE: `mse3` is RMSE / 100 formatted as a percentage string.
mse3 = "{:.2%}".format(sqrt(mean_squared_error(train_1[start_index:end_index]['sales'],
                                               train_1[start_index:end_index]['forecast_3']))/100)
y_true = train_1[start_index:end_index1]['sales']
y_pred = train_1[start_index:end_index1]['forecast_3']

mape3 = "{:.2%}".format(np.mean(abs((y_true-y_pred)/y_true)))
smape3 = "{:.2%}".format(np.mean((np.abs(y_pred - y_true) * 2/ (np.abs(y_pred) + np.abs(y_true))).fillna(0)))

# Report the label and AIC of the model these metrics belong to (sarimax_mod3).
print('SARIMAX (3,1,1)(1,0,1,7) \nAIC: %.2f'% (sarimax_mod3.aic), '\nMSE: ',mse3,'\nMAPE: ',mape3, '\nSMAPE: ', smape3)

In [None]:
from scipy import stats
from scipy.stats import normaltest

# Normality test on the residuals of the (7,1,0)(0,1,1,7) SARIMAX model.
resid = sarimax_mod7.resid
print(normaltest(resid))

fig = plt.figure(figsize=(12,8))
ax0 = fig.add_subplot(111)

# Residual distribution with a fitted normal curve overlaid.
sns.distplot(resid, fit=stats.norm, ax=ax0)
(mu, sigma) = stats.norm.fit(resid)

# Raw string: `\m` and `\s` are invalid escape sequences in a normal string
# literal; the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('Residual distribution')

In [None]:
# Normality test on the residuals of the (3,1,1)(1,0,1,7) SARIMAX model.
resid = sarimax_mod3.resid
print(normaltest(resid))

fig = plt.figure(figsize=(12,8))
ax0 = fig.add_subplot(111)

# Residual distribution with a fitted normal curve overlaid.
sns.distplot(resid, fit=stats.norm, ax=ax0)
(mu, sigma) = stats.norm.fit(resid)

# Raw string: `\m` and `\s` are invalid escape sequences in a normal string
# literal; the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('Residual distribution')

In [None]:
# Residual diagnostic plots for the (7,1,0)(0,1,1,7) SARIMAX model.
sarimax_mod7.plot_diagnostics(figsize=(16, 8))
plt.show()

In [None]:
# Residual diagnostic plots for the (3,1,1)(1,0,1,7) SARIMAX model.
sarimax_mod3.plot_diagnostics(figsize=(16, 8))
plt.show()

## Prediction on test dataset

In [None]:
# Reload the raw files, this time with `date` as the index of train and test.
sample = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/sample_submission.csv')
train = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/train.csv',
                    parse_dates=['date'], index_col='date')
test = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/test.csv',
                   parse_dates=['date'], index_col='date')

In [None]:
# Stack train and test rows into one frame; sort=True orders the combined
# columns. Columns present in only one of the two frames are NaN for the other.
df = pd.concat([train,test],sort=True)

In [None]:
# One-hot encode the calendar features derived from the date index:
# year offset (index year minus 2012), month, and day of week.
df['year'] = df.index.year - 2012
df['month'] = df.index.month
df['dayofweek'] = df.index.weekday

# `dtype=int` keeps the dummies numeric: pandas >= 2.0 returns boolean columns
# by default, and these columns are later passed to SARIMAX as `exog`.
df = pd.get_dummies(df,
                    columns=['year', 'month', 'dayofweek'],
                    prefix=['year', 'month', 'dayofweek'],
                    dtype=int)

In [None]:
# Preview the combined, one-hot-encoded frame.
df.head()

In [None]:
# Fit one SARIMAX (3,1,1)(1,0,1,7) model per (item, store) pair and collect the
# predictions from 2018-01-01 onward in `results`. Predictions are appended in
# loop order: item-major, store-minor.
# NOTE(review): `results` is later assigned positionally to the submission
# frame; this assumes the submission rows are ordered by item, then store,
# then date -- TODO confirm.
results = []
tr_start,tr_end = '2013-01-01','2017-09-30'
te_start,te_end = '2017-10-01','2017-12-31'
for i in range(1,51):
    for s in range(1,11):
        cur_df = df[(df.item==i)&(df.store==s)].copy()
        
        # Split by date: `tra` is the fitted sample; `tes` is not used below.
        # NOTE(review): sales between te_start and te_end are not used for
        # fitting, so the forecast runs from tr_end through 2018-03-31 -- confirm
        # this is intended.
        tra = cur_df['sales'][tr_start:tr_end]
        tes = cur_df['sales'][te_start:te_end]
        exog_train = cur_df.drop(['id','store','item','sales'],axis = 1)[tr_start:tr_end]
        # Exogenous rows for every date from te_start to the end of the frame.
        exog_test = cur_df[te_start:].drop(['id','store','item','sales'],axis = 1)
        
        
        # Fit on the training window, then predict from the last fitted date
        # (tr_end) through 2018-03-31, supplying exog for the dates after tr_end.
        mod = sm.tsa.statespace.SARIMAX(tra,order=(3,1,1),seasonal_order=(1,0,1,7),exog = exog_train,freq='D',
                                       enforce_stationarity=False, enforce_invertibility=False).fit()
        pred = mod.get_prediction(tr_end,'2018-03-31',exog =exog_test)
        # Keep only the predictions from 2018-01-01 onward.
        results.extend(pred.predicted_mean['2018-01-01':])
        print('item:',i,'store:',s,'Finished.')

In [None]:
# Show the number of predictions and a short preview instead of dumping the
# whole list into the cell output.
len(results), results[:10]

In [None]:
# Attach the predictions to the submission template (positional assignment)
# and write the submission file.
sample = sample.assign(sales=results)
sample.to_csv('submission.csv', index=False)