# Importing Data

In [None]:
import pandas as pd
# Dataset location kept in one place so it only has to be changed once when
# the notebook is run outside the Kaggle environment.
DATA_DIR = '/kaggle/input/m5-forecasting-accuracy'

# Three raw tables: the calendar, the wide daily sales table, and item prices.
cal = pd.read_csv(f'{DATA_DIR}/calendar.csv')
stval = pd.read_csv(f'{DATA_DIR}/sales_train_validation.csv')
price = pd.read_csv(f'{DATA_DIR}/sell_prices.csv')

In [None]:
# Summary (dtypes, non-null counts, memory footprint) of the raw sales table.
stval.info()

In [None]:
# Summary (dtypes, non-null counts, memory footprint) of the calendar table.
cal.info()

In [None]:
# Summary (dtypes, non-null counts, memory footprint) of the price table.
price.info()

# Downcasting (Reducing Size)

In [None]:
# Taken from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
import numpy as np
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and return the same frame.

    * Signed-integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive).
    * Float columns are cast to float16 if their range fits, otherwise float32,
      otherwise float64. Note that float16 keeps only about 3 significant
      decimal digits, so values such as prices are rounded.
    * Object (and pandas string) columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, nullable
      extension types, ...) is left untouched, because it is either already
      compact or cannot be range-checked safely.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are range-checked;
        # anything else would either be widened or fail the comparisons below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                bounds = np.iinfo(candidate)
                # Inclusive comparison so that e.g. a column spanning exactly
                # [-128, 127] still becomes int8.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached for all-NaN or infinite columns, where the range
            # comparisons above are False.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero starting size so the summary never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Assign the result so the lineage is explicit and the cell does not render
# the whole frame as its output; the memory report is still printed.
stval = reduce_mem_usage(stval)

In [None]:
# Assign the result so the lineage is explicit and the cell does not render
# the whole frame as its output; the memory report is still printed.
price = reduce_mem_usage(price)

In [None]:
# Assign the result so the lineage is explicit and the cell does not render
# the whole frame as its output; the memory report is still printed.
cal = reduce_mem_usage(cal)

# Re-Structuring and Merging Data

To make the data easier to work with and to merge with the `price` and `cal` dataframes, the `melt()` function is used to unpivot the wide sales table (`stval`) into a long-format `sales` dataframe, with one row per item and day.

In [None]:
# Wide -> long: the identifier columns are kept, every other column becomes a
# ('d', 'demand') pair; rows with a missing value are dropped.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales = stval.melt(id_vars=id_columns, var_name='d', value_name='demand')
sales = sales.dropna()

In [None]:
# Attach the calendar columns to each sales row via the day key 'd', then the
# price columns via (store, item, week). Left joins keep every sales row.
sales = sales.merge(cal, on='d', how='left')
sales = sales.merge(price, on=['store_id','item_id','wm_yr_wk'], how='left')

In [None]:
# Remove the columns that the rest of the notebook does not use.
unused_columns = ['d','event_name_1','event_type_1','event_name_2','event_type_2',
                  'snap_CA','snap_TX','snap_WI','wm_yr_wk','weekday']
sales = sales.drop(columns=unused_columns)

In [None]:
# Drop every row that still has a missing value in any remaining column
# (presumably rows without a matching sell_price after the left merge - verify).
sales = sales.dropna()

To work through the models in detail, we restrict the data to a single food item, FOODS_3_555, sold at store CA_1.

In [None]:
# Take an independent copy of the filtered rows: the frame gets a new column
# value assigned later, which would otherwise raise SettingWithCopyWarning
# because a boolean-mask selection may be a view of `sales`.
sale_food_ca = sales[sales['id'] == 'FOODS_3_555_CA_1_validation'].copy()

In [None]:
# Release the large intermediate frames; only sale_food_ca is used from here on.
# After this cell the merge cells above cannot be re-run without reloading the data.
del sales, price, cal #free some space

In [None]:
# Parse the calendar date strings into datetimes. Building a new frame with
# assign() avoids writing into a filtered slice (SettingWithCopyWarning) and
# keeps the cell safe to re-run.
sale_food_ca = sale_food_ca.assign(date=pd.to_datetime(sale_food_ca['date']))

# Train-Test Split

The last transaction date found in validation data set is 24th April 2016. For last 28 days (spanning from 28th March 2016 till 24th April 2016), all records will go to the test dataset. Except those records, all records from the start till 27th March 2016 will be used for training. 

In [None]:
import matplotlib.pyplot as plt
# Larger tick labels and legend text for the wide figures in this notebook.
params = {'legend.fontsize': 16,'legend.handlelength': 2}
plt.rcParams.update({'xtick.labelsize': 16, 'ytick.labelsize': 16})
plt.rcParams.update(params)

# Chronological split: everything up to 2016-03-27 is training data, the
# following days up to and including 2016-04-24 form the test set.
in_train = sale_food_ca['date'] <= '2016-03-27'
in_test = (sale_food_ca['date'] > '2016-03-27') & (sale_food_ca['date'] <= '2016-04-24')
train = sale_food_ca.loc[in_train]
test = sale_food_ca.loc[in_test]

# Both segments are drawn on one shared axis.
fig, ax = plt.subplots(figsize=(25,5))
train.plot(x='date', y='demand', label='Train', ax=ax)
test.plot(x='date', y='demand', label='Test', ax=ax)

# Check for Stationarity (ADF Test, KPSS Test) and Granger Causality

We will use the Augmented Dickey–Fuller (ADF) test, which tests the null hypothesis that a unit root is present in an autoregressive model. If the data has no unit root, it is stationary. The time series models used below assume stationary data, so if the series turns out to be non-stationary we first need to make it stationary through differencing.

In [None]:
# Taken from https://gist.github.com/Deffro/e54bdb90dd1c1392fc85e1db1cfbab7d

import pandas as pd
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Run the Augmented Dickey-Fuller test on a series and print a report.

    Missing values are dropped before testing and the lag order is chosen by
    AIC. The report lists the test statistic, p-value, number of lags used,
    number of observations and the critical values, followed by a three-line
    verdict based on a 0.05 significance threshold. Nothing is returned.
    """
    print('Augmented Dickey-Fuller Test: {}'.format(title))
    adf_result = adfuller(series.dropna(),autolag='AIC')
    statistic, p_value, lags_used, n_obs, critical_values = adf_result[:5]

    # Collect the report rows in display order before rendering them.
    report = {
        'ADF test statistic': statistic,
        'p-value': p_value,
        '# lags used': lags_used,
        '# observations': n_obs,
    }
    for level, threshold in critical_values.items():
        report['critical value ({})'.format(level)] = threshold

    print(pd.Series(report).to_string())

    if p_value <= 0.05:
        verdict = ("Strong evidence against the null hypothesis",
                   "Reject the null hypothesis",
                   "Data has no unit root and is stationary")
    else:
        verdict = ("Weak evidence against the null hypothesis",
                   "Fail to reject the null hypothesis",
                   "Data has a unit root and is non-stationary")
    for line in verdict:
        print(line)

In [None]:
# Augmented Dickey-Fuller Test
# Test the training-period demand series for a unit root.
demand_train = train['demand']
adf_test(demand_train, title='Demand')

We observe, the data is stationary and no differencing is required. In the ADF test, the null hypothesis is the time series has a unit root (so data is non-stationary). Since we have got p-Value < 0.05 we reject the null hypothesis.

In [None]:
# KPSS Test

from statsmodels.tsa.stattools import kpss
# regression='c' tests the null hypothesis that the series is stationary
# around a constant level. The returned tuple is indexed below as
# [0] statistic, [1] p-value, [3] critical values.
result = kpss(train['demand'].values, regression='c')
print("KPSS Statistic: {}".format(result[0]))
print("P-Value: {}".format(result[1]))
for key, value in result[3].items():
    print("Critical Values: {}, {}".format(key,value))

The KPSS test has the opposite null hypothesis to the ADF test: the null is that the series is stationary (here, with `regression='c'`, stationary around a constant level) and the alternative is that it has a unit root. The p-value is therefore interpreted the opposite way round. Here we see a p-value > 0.05, so we fail to reject the null hypothesis of stationarity.

In [None]:
# Granger Causality Test

from statsmodels.tsa.stattools import grangercausalitytests
# Column order matters: statsmodels tests whether the SECOND column
# (sell_price) Granger-causes the FIRST column (demand), for lags 1..maxlag.
grangercausalitytests(train[['demand','sell_price']], maxlag=3)

Here, we are testing whether past values of sell_price (the second column passed to the test) are useful in forecasting demand (the first column). We should look for very low p-values (<0.05) here. With maxlag = 3, we see that the p-values decrease as the lag increases but still remain >0.05, and hence we cannot conclude that sell_price is useful in forecasting demand.

# Lag Plots

A lag plot is a kind of a scatter plot which is used for visualizing whether a dataset is random (stochastic) or not over a particular lag (time). Here, we will check randomness for demand over time and randomness of sell_price over time. Though, according to task, we are supposed to forecast demand over the time.

In [None]:
# Lag PLots

from pandas.plotting import lag_plot
# One row of panels per variable, one column per lag.
fig, ax = plt.subplots(2,3,figsize=(15,5))
for row_idx, column_name in enumerate(['demand', 'sell_price']):
    for col_idx, lag_days in enumerate([1, 30, 60]):
        lag_plot(train[column_name], lag=lag_days, ax=ax[row_idx][col_idx])
plt.show()

We notice that demand is following quite a random pattern whereas sell_price is quite linear with a positive (upward) slope, which is obvious, because sell_price is supposed to hike over the period of time.

# ACF and PACF Plots

In [None]:
import pandas as pd
# Forecast table: starts with the test dates, each model adds one column.
predictions = pd.DataFrame({'date': test['date']})
# Score table: one row per model is added as the models are evaluated.
stats = pd.DataFrame(columns=['Model Name','Execution Time','RMSE'])

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
# Autocorrelation of demand for lags 0..50.
# NOTE(review): this uses the full series (sale_food_ca, i.e. train and test
# periods together) rather than train only - confirm that is intended.
fig, ax = plt.subplots(figsize=(15,3))
plot_acf(sale_food_ca['demand'].tolist(), lags=50, ax=ax)
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
# Partial autocorrelation of demand for lags 0..50.
# NOTE(review): this uses the full series (sale_food_ca, i.e. train and test
# periods together) rather than train only - confirm that is intended.
fig, ax = plt.subplots(figsize=(15,3))
plot_pacf(sale_food_ca['demand'].tolist(), lags=50, ax=ax)
plt.show()

From both the autocorrelation and partial autocorrelation plots, we observe a peak at every 7th lag. This indicates a high correlation between the demand for the food item on a given day and its demand 7 days earlier. So, if we use a moving average we can set the window size to 7, if we use exponential smoothing we can set the span to 7, and if we use a seasonal model we can set the seasonal period to 7.

# Different Time Series Modelling

Here, we will use moving average, exponential smoothing, non-seasonal ARIMA, seasonal ARIMA and seasonal ARIMAX models. The moving average model is just used for basic observation purpose. Rest of the models' performances are compared based on total time taken for execution and Root Mean Squared Error (RMSE). First we will use the training data set to train it with various time series models. After training, we will use forecast(x) with x=28 days.

# (1) Simple Moving Average and Expanding Moving Average Models

In [None]:
# 7-day simple moving average of the training demand, plotted with the raw series
y = train[['date','demand']].set_index('date')
y['MA7'] = y['demand'].rolling(window=7).mean()
y.plot(figsize=(15,4))

In [None]:
# Expanding (cumulative) mean of demand: each point averages all observations
# up to that date; min_periods=2 means at least two observations are required
# before a value is produced.
y1 = train[['date','demand']]
y1 = y1.set_index('date')
y1['demand'].expanding(min_periods=2).mean().plot(figsize=(15,4))

# (2) Simple/ Double/ Triple Exponential Smoothing Models

In [None]:
import time
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from sklearn.metrics import mean_squared_error

t0 = time.time()
model_name='Simple Exponential Smoothing'
# Smoothing level derived from a 7-day span: alpha = 2 / (span + 1).
span = 7
alpha = 2/(span+1)
#train
simpleExpSmooth_model = SimpleExpSmoothing(train['demand']).fit(smoothing_level=alpha,optimized=False)
t1 = time.time()-t0
#predict
# .values drops the forecast's own index so the 28 numbers are stored by position.
predictions[model_name] = simpleExpSmooth_model.forecast(28).values
fig, ax = plt.subplots(figsize=(25,5))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; the empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

We see a linear trend which is not matching the curvy spikes present in original forecasting trend followed by the test data. So we will try applying double exponential smoothing next.

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

t0 = time.time()
model_name='Double Exponential Smoothing'
#train
# Additive trend only; no seasonal component is requested here.
# NOTE(review): seasonal_periods presumably has no effect without `seasonal` - confirm.
doubleExpSmooth_model = ExponentialSmoothing(train['demand'],trend='add',seasonal_periods=7).fit()
t1 = time.time()-t0
#predict
# .values drops the forecast's own index so the 28 numbers are stored by position.
predictions[model_name] = doubleExpSmooth_model.forecast(28).values
fig, ax = plt.subplots(figsize=(25,5))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; an empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

Even with double exponential smoothing the picture remains the same: the forecast is still linear. So we will try triple exponential smoothing next.

In [None]:
t0 = time.time()
model_name='Triple Exponential Smoothing'
#train
# Additive trend plus an additive seasonal component with a period of 7.
tripleExpSmooth_model = ExponentialSmoothing(train['demand'],trend='add',seasonal='add',seasonal_periods=7).fit()
t1 = time.time()-t0
#predict
# .values drops the forecast's own index so the 28 numbers are stored by position.
predictions[model_name] = tripleExpSmooth_model.forecast(28).values
fig, ax = plt.subplots(figsize=(25,4))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; an empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

Now we can see that the linear shape has turned into a curve whose spikes follow the actual pattern of the test data set. We can also observe a substantial reduction in RMSE with Triple Exponential Smoothing (4.2348). So we stop experimenting with the exponential smoothing models here.

# (3) Non-Seasonal ARIMA Model

In [None]:
!pip install pmdarima

In [None]:
from pmdarima import auto_arima
# The timer starts here, so the 'Execution Time' computed in the next cell
# covers this order search as well as the final fit.
t0 = time.time()
model_name='ARIMA'
# Stepwise search over non-seasonal orders (p up to 20, q up to 5); d=None
# leaves the differencing order to auto_arima. trace=True prints each
# candidate that is tried.
arima_model = auto_arima(train['demand'], start_p=0, start_q=0,
                          max_p=20, max_q=5,
                          seasonal=False,
                          d=None, trace=True,random_state=12345,
                          error_action='ignore',   
                          suppress_warnings=True,  
                          stepwise=True)
# Display the summary of the selected model.
arima_model.summary() 

In [None]:
#train
arima_model.fit(train['demand'])
# t0 was set in the model-search cell, so this duration includes the search.
t1 = time.time()-t0
#predict
# np.asarray drops any index attached to the forecast (newer pmdarima returns
# an indexed Series), so the 28 values are stored by position instead of being
# aligned on labels, which would produce NaN.
predictions[model_name] = np.asarray(arima_model.predict(n_periods=28))
fig, ax = plt.subplots(figsize=(25,5))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; an empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

We see a moderately fitting curve roughly following the forecasting trend shown by the test data, but non-seasonal ARIMA model is having RMSE greater than what we have got with the triple exponential model (5.2752 is > 4.2348). So the performance has not improved using non-seasonal ARIMA. 

# (4) Seasonal ARIMA Model

In [None]:
# The timer starts here, so the 'Execution Time' computed in the next cell
# covers this order search as well as the final fit.
t0 = time.time()
model_name='SARIMA'
# Stepwise search with a seasonal component; m=7 is the seasonal period in
# observations (one week of daily data).
# NOTE(review): out_of_sample_size=28 presumably holds out the last 28 training
# points for scoring candidates - confirm against the pmdarima documentation.
sarima_model = auto_arima(train['demand'], start_p=0, start_q=0,
                          max_p=20, max_q=5,
                          seasonal=True, m=7,
                          d=None, trace=True,random_state=12345,
                          out_of_sample_size=28,
                          error_action='ignore',   
                          suppress_warnings=True,  
                          stepwise=True)
# Display the summary of the selected model.
sarima_model.summary()

In [None]:
#train
sarima_model.fit(train['demand'])
# t0 was set in the model-search cell, so this duration includes the search.
t1 = time.time()-t0
#predict
# np.asarray drops any index attached to the forecast (newer pmdarima returns
# an indexed Series), so the 28 values are stored by position instead of being
# aligned on labels, which would produce NaN.
predictions[model_name] = np.asarray(sarima_model.predict(n_periods=28))
fig, ax = plt.subplots(figsize=(25,4))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; an empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

Seasonal ARIMA is fitting much better than non-seasonal ARIMA. Also, we are getting reduced RMSE with seasonal ARIMA (4.6820) compared to what we had got using non-seasonal ARIMA (5.2752).  

# (5) Seasonal ARIMAX Model

It is SARIMA model with eXogenous regressors. These regressors introduce the external features which can influence a time series.

In [None]:
# The timer starts here, so the 'Execution Time' computed in the next cell
# covers this order search as well as the final fit.
t0 = time.time()
model_name='SARIMAX'
# Same seasonal search as for SARIMA (m=7), with sell_price supplied as an
# exogenous regressor and a different random_state (2020).
# NOTE(review): any later fit()/predict() call has to receive matching
# exogenous data as well, otherwise the regressor is not used.
# NOTE(review): newer pmdarima releases name this argument `X` - confirm
# against the installed version.
sarimax_model = auto_arima(train['demand'], start_p=0, start_q=0,
                          max_p=20, max_q=5,
                          seasonal=True, m=7,
                          exogenous = train[['sell_price']].values,
                          d=None, trace=True,random_state=2020,
                          out_of_sample_size=28,
                          error_action='ignore',   
                          suppress_warnings=True,  
                          stepwise=True)
# Display the summary of the selected model.
sarimax_model.summary()

In [None]:
#train
# The exogenous regressor has to be supplied again at fit and predict time;
# without it the model is refit as a plain SARIMA and sell_price is ignored.
# NOTE(review): newer pmdarima releases name this keyword `X` instead of
# `exogenous`; it is kept as `exogenous` to match the auto_arima call.
exog_train = train[['sell_price']].values
exog_test = test[['sell_price']].values
sarimax_model.fit(train['demand'], exogenous=exog_train)
# t0 was set in the model-search cell, so this duration includes the search.
t1 = time.time()-t0
#predict
# np.asarray drops any index attached to the forecast (newer pmdarima returns
# an indexed Series), so the 28 values are stored by position instead of being
# aligned on labels, which would produce NaN.
predictions[model_name] = np.asarray(sarimax_model.predict(n_periods=28, exogenous=exog_test))
fig, ax = plt.subplots(figsize=(25,4))
train[-28:].plot(x='date',y='demand',label='Train',ax=ax, marker='o', color='blue')
test.plot(x='date',y='demand',label='Test',ax=ax, marker='o', color='red');
predictions.plot(x='date',y=model_name,label=model_name,ax=ax, marker='x', color='green');
#evaluate
score = np.sqrt(mean_squared_error(test['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name,score))

# DataFrame.append was removed in pandas 2.0. Build the row as a frame and
# concatenate; an empty score table is replaced outright so the numeric
# columns keep a numeric dtype for the comparison plots.
new_row = pd.DataFrame([{'Model Name':model_name, 'Execution Time':t1, 'RMSE':score}])
stats = new_row if stats.empty else pd.concat([stats, new_row], ignore_index=True)

We can see that the SARIMAX curve fits best compared to the previously plotted non-seasonal and seasonal ARIMA curves. Also, here we have got the lowest RMSE (4.4632) compared to what we had got earlier using non-seasonal ARIMA (5.2752) and seasonal ARIMA (4.6820). However, the RMSE with triple exponential smoothing was lower than what we've got using SARIMAX.

# Model Performance Comparison

In [None]:
# RMSE of every evaluated model, in the order the models were run
stats.plot.line(x='Model Name', y='RMSE', figsize=(12,4),
                title="RMSE Comparison for Different TS Models")
plt.xticks(rotation='vertical')

In [None]:
# Measured execution time of every evaluated model
stats.plot.bar(x='Model Name', y='Execution Time', figsize=(12,4),
               title="Execution Time Comparison for Different TS Models")
plt.xticks(rotation='vertical')

# Conclusion:
We observe that Triple Exponential Smoothing model has the lowest RMSE and the lowest execution time. Hence, it can be concluded as the best performing model in this scenario.

**Acknowledgement:** I am greatly indebted to [Dimitrios Effrosynidis](https://www.kaggle.com/deffro) for his various time series related works on Medium and GitHub.