In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
from sklearn.metrics import mean_squared_error
from numpy import sqrt

import matplotlib.pyplot as plt # basic plotting
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 10, 6
import seaborn as sns # for prettier plots

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
#from statsmodels.tsa.statespace.varmax import VARMAX

from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scsor 

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load data
#
# Every competition CSV is read from the same Kaggle input folder, so the
# folder is named once and each file name is appended to it.
DATA_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

sales = pd.read_csv(DATA_DIR + "/sales_train.csv")
item_cat = pd.read_csv(DATA_DIR + "/item_categories.csv")
item = pd.read_csv(DATA_DIR + "/items.csv")
shops = pd.read_csv(DATA_DIR + "/shops.csv")
submit = pd.read_csv(DATA_DIR + "/sample_submission.csv")

# Report the (rows, columns) shape of each loaded table.
for label, frame in [('sales ', sales), ('item_cat ', item_cat), ('item ', item), ('shops ', shops)]:
    print(label, frame.shape)


In [None]:
# formatting the date column from object to date time
#
# pd.to_datetime parses the whole column in one vectorised call instead of
# calling strptime once per row, and it accepts an already-converted datetime
# column, so this cell can be re-run without raising.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would also print "None").
sales.info()
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales.info()

In [None]:
# Group by total monthly sales ...34 months
#
# `ts` holds one value per date_block_num: the sum of item_cnt_day over all
# rows of that block. astype() returns a new Series, so its result has to be
# assigned for the float cast to take effect.
ts = sales.groupby(["date_block_num"])["item_cnt_day"].sum().astype('float')

# One-column DataFrame view of the same series, used by the cells below.
indexedDataset = pd.DataFrame(ts)

plt.figure(figsize=(10,6))
plt.title('Total sales of company')
plt.xlabel('Months')
plt.ylabel('Sales')
plt.plot(indexedDataset)
plt.show()

In [None]:
# Display the monthly totals (one summed item_cnt_day value per date_block_num).
ts

In [None]:
# Rolling window length, in months (the series has one point per month).
# The same value is passed as the period of the seasonal decomposition below.
MyWindow = 3

# Determine rolling statistics over MyWindow consecutive months.
# The first MyWindow - 1 entries are NaN because the window is not yet full.
rolmean = indexedDataset.rolling(window=MyWindow).mean()
rolstd = indexedDataset.rolling(window=MyWindow).std()

# Plot rolling statistics next to the original series.
# The Line2D handles returned by plt.plot are not needed, so they are not
# bound to names (the original bound them to the generic names mean/std).
plt.plot(indexedDataset, color='blue', label='Original')
plt.plot(rolmean, color='red', label='Rolling Mean')
plt.plot(rolstd, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)

In [None]:
# Split the monthly series into trend, seasonal and residual components
# using an additive model with a period of MyWindow observations.
# NOTE(review): newer statsmodels releases rename `freq` to `period` —
# confirm against the installed version before upgrading.
decomposition_input = indexedDataset.values
res = seasonal_decompose(decomposition_input, model="additive", freq=MyWindow)
fig = res.plot()

In [None]:
# Augmented Dickey-Fuller test for stationarity of the monthly series.
# The lag length is selected automatically by AIC.

print('Results of Dickey Fuller Test:')
dftest = adfuller(indexedDataset['item_cnt_day'], autolag='AIC')

# The first four entries of the result are labelled scalars; the fifth is a
# mapping of significance level -> critical value, appended one row each.
summary_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
dfoutput = pd.Series(dftest[0:4], index=summary_labels)
critical_values = dftest[4]
for level in critical_values:
    dfoutput['Critical Value (%s)' % level] = critical_values[level]

print(dfoutput)

 # AR and ARIMA

In [None]:
# Monthly totals as a plain Python list; this is the input of the models below.
monthly_totals = indexedDataset["item_cnt_day"].values
dataAR = list(monthly_totals)
len(dataAR)

In [None]:
# AR model
#
# Fit an autoregressive model on the monthly totals and predict from
# observation index AR_PRED_START up to 18 steps past the end of the data.

# Zero-based index of the first predicted observation.
AR_PRED_START = 12

model = AR(dataAR)
model_fit = model.fit()
# make prediction
yhat = model_fit.predict(AR_PRED_START, len(dataAR) + 18) # predict N ahead of the last one

dataList = list(dataAR)
yhatList = list(yhat)

# yhatList[0] is the prediction for index AR_PRED_START, not for index 0.
# Plot the predictions at their own x positions so they line up with the
# original series instead of being shifted to the left.
yhatIndex = range(AR_PRED_START, AR_PRED_START + len(yhatList))

plt.style.use('seaborn-poster')
plt.figure()
plt.plot(dataList, label='Original')
plt.plot(yhatIndex, yhatList, ls='--', label='Predicted')
plt.legend(loc='best')
plt.title('AR model')
plt.show()

In [None]:
# In-sample RMSE of the AR model.
# The AR predictions were requested starting at observation index 12, so
# yhatList[0] belongs to dataList[12]. Compare each actual value only with the
# prediction for the same month, and leave out the out-of-sample forecasts,
# which have no actual value to compare against.
pred_start = 12
actual = dataList[pred_start:]
predicted = yhatList[:len(actual)]
rmse = sqrt(mean_squared_error(actual, predicted))
print('AR RMSE: %.1f' % rmse)

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the monthly
# totals for lags 0..12, used to get a first guess of p and q for ARIMA.

acf_fig, (ax_acf, ax_pacf) = plt.subplots(2, 1, figsize=(15, 10))
sm.graphics.tsa.plot_acf(dataAR, lags=12, ax=ax_acf)
sm.graphics.tsa.plot_pacf(dataAR, lags=12, ax=ax_pacf)
plt.show()

In theory, the charts above suggest p = 4 and q = 7. In practice, however, these parameters do NOT work with SARIMA, so different orders are used below.

In [None]:
# ARIMA model
#
# Fit ARIMA(2, 1, 1) on the monthly totals and predict, on the original scale
# (typ='levels'), from observation index 1 up to 6 steps past the end.

model = ARIMA(dataAR, order=(2, 1, 1))
model_fit = model.fit(disp=False)
# make prediction
yhat = model_fit.predict(1, len(dataAR)+6, typ='levels')

dataList = list(dataAR)
yhatList = list(yhat)

# The predictions start at index 1, so yhatList[0] belongs to dataList[1].
# Plot them at their own x positions to keep both curves aligned.
yhatIndex = range(1, 1 + len(yhatList))

plt.style.use('seaborn-poster')
plt.figure()
plt.plot(dataList, label='Original')
plt.plot(yhatIndex, yhatList, ls='--', label='Predicted')
plt.legend(loc='best')
plt.title('ARIMA model')
plt.show()

In [None]:
# In-sample RMSE of the ARIMA model.
# The ARIMA predictions were requested starting at observation index 1, so
# yhatList[0] belongs to dataList[1]. Pair each actual value with the
# prediction for the same month and leave out the out-of-sample forecasts.
pred_start = 1
actual = dataList[pred_start:]
predicted = yhatList[:len(actual)]
rmse = sqrt(mean_squared_error(actual, predicted))
print('ARIMA RMSE: %.1f' % rmse)

 # SARIMA

In [None]:
# SARIMA
#
# Fit SARIMAX with order (2, 1, 1) and seasonal order (2, 1, 1) at a seasonal
# period of 3, then predict from observation index 1 up to 6 steps past the
# end of the data.

model = SARIMAX(dataAR, order=(2, 1, 1), seasonal_order=(2,1,1,3))
model_fit = model.fit(disp=False)
# make prediction
yhat = model_fit.predict(1, len(dataAR)+6)

dataList = list(dataAR)
yhatList = list(yhat)

# The predictions start at index 1, so yhatList[0] belongs to dataList[1].
# Plot them at their own x positions to keep both curves aligned.
yhatIndex = range(1, 1 + len(yhatList))

plt.style.use('seaborn-poster')
plt.figure()
plt.plot(dataList, label='Original')
plt.plot(yhatIndex, yhatList, ls='--', label='Predicted')
plt.legend(loc='best')
plt.title('SARIMAX model')
plt.show()

In [None]:
# In-sample RMSE of the SARIMA model.
# The SARIMA predictions were requested starting at observation index 1, so
# yhatList[0] belongs to dataList[1]. Pair each actual value with the
# prediction for the same month and leave out the out-of-sample forecasts.
pred_start = 1
actual = dataList[pred_start:]
predicted = yhatList[:len(actual)]
rmse = sqrt(mean_squared_error(actual, predicted))
print('SARIMA RMSE: %.1f' % rmse)

In [None]:
# Brute-force search for the SARIMA order with the smallest AIC: upper bounds
# of the search and a pre-sized table to hold one row per candidate model.
max_p = 2
max_q = 2
max_d = 1
max_sp = 1
max_sq = 1
max_sd = 1

# Number of candidate values per parameter. p is searched from 1 (max_p
# values); every other parameter is searched from 0 (max + 1 values).
candidate_counts = (max_p, max_q + 1, max_d + 1, max_sp + 1, max_sq + 1, max_sd + 1)

# Total number of parameter combinations.
pattern = 1
for count in candidate_counts:
    pattern *= count

modelSelection = pd.DataFrame(columns=["model", "aic"], index=range(pattern))
pattern

In [None]:
# Automatic SARIMA selection
#
# Fit one SARIMAX model per (p, d, q) x (sp, sd, sq) combination within the
# max_* bounds and record its AIC. Results are collected in a list and turned
# into a DataFrame once: the chained `modelSelection.iloc[num]["aic"] = ...`
# assignment writes to an intermediate row object and is not guaranteed to
# update the DataFrame itself.
from itertools import product

# Seasonal period (in months) shared by every candidate model.
SEASONAL_PERIOD = 4

# Same iteration order as six nested loops over p, d, q, sp, sd, sq.
search_space = product(
    range(1, max_p + 1),    # p
    range(0, max_d + 1),    # d
    range(0, max_q + 1),    # q
    range(0, max_sp + 1),   # seasonal P
    range(0, max_sd + 1),   # seasonal D
    range(0, max_sq + 1),   # seasonal Q
)

records = []
for p, d, q, sp, sd, sq in search_space:
    sarima = sm.tsa.SARIMAX(
        ts, order=(p, d, q),
        seasonal_order=(sp, sd, sq, SEASONAL_PERIOD),
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)  # disp=False: keep the optimizer quiet, as in the other model cells
    records.append({
        "model": "order=(%d,%d,%d), season=(%d,%d,%d)" % (p, d, q, sp, sd, sq),
        "aic": sarima.aic,
    })

modelSelection = pd.DataFrame(records, columns=["model", "aic"])

# Check the results for each model
print(modelSelection)

# AIC smallest model (Series.min() skips NaN, so a failed fit cannot hide the best model)
print(modelSelection[modelSelection["aic"] == modelSelection["aic"].min()])

In [None]:
# SARIMA v2
#
# Fit SARIMAX with order (2, 1, 2) and seasonal order (0, 1, 1) at a seasonal
# period of 4, then predict from observation index 1 up to 6 steps past the
# end of the data.

model = SARIMAX(dataAR, order=(2,1,2), seasonal_order=(0,1,1,4))
model_fit = model.fit(disp=False)
# make prediction
yhat = model_fit.predict(1, len(dataAR)+6)

dataList = list(dataAR)
yhatList = list(yhat)

# The predictions start at index 1, so yhatList[0] belongs to dataList[1].
# Plot them at their own x positions to keep both curves aligned.
yhatIndex = range(1, 1 + len(yhatList))

plt.style.use('seaborn-poster')
plt.figure()
plt.plot(dataList, label='Original')
plt.plot(yhatIndex, yhatList, ls='--', label='Predicted')
plt.legend(loc='best')
plt.title('SARIMAX model v2')
plt.show()

In [None]:
# In-sample RMSE of the SARIMA v2 model.
# The SARIMA v2 predictions were requested starting at observation index 1, so
# yhatList[0] belongs to dataList[1]. Pair each actual value with the
# prediction for the same month and leave out the out-of-sample forecasts.
pred_start = 1
actual = dataList[pred_start:]
predicted = yhatList[:len(actual)]
rmse = sqrt(mean_squared_error(actual, predicted))
print('SARIMA RMSE: %.1f' % rmse)

In [None]:
# Display the raw SARIMA v2 predictions (requested for indices 1 .. len(dataAR)+6).
yhat

In [None]:
# Write the submission file.
# NOTE(review): this re-reads the sample submission and saves it unchanged;
# none of the forecasts computed above are written into it. An earlier draft
# that built the frame from `yhat` was left commented out and never finished.
SAMPLE_SUBMISSION_PATH = "/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv"
submit = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submit.to_csv("submit.csv", index=False)