### Super Store Dataset

In [None]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Load the Super Store export into a DataFrame.
# The codec is passed explicitly instead of relying on read_csv's default.
encoding = 'unicode_escape'
dataset_path = '/Users/priyankac/Downloads/Time Series/Super Store.csv'
storedataset = pd.read_csv(dataset_path, encoding=encoding)

In [None]:
storedataset.head()

In [None]:
# Checking the number of rows and columns in the dataset
storedataset.shape

In [None]:
# Checking the data types
storedataset.info()

In [None]:
# 'Order Date' is loaded as an object (string) column, so it is converted to datetime here.
# No explicit format is passed; pandas infers it from the values.
# NOTE(review): ambiguous day/month strings are read month-first by default — confirm against the file.

from datetime import datetime  # NOTE(review): not referenced in the cells visible here
storedataset['Order Date'] = pd.to_datetime(storedataset['Order Date'])

In [None]:
storedataset.dtypes

In [None]:
storedataset['Category'].value_counts()

In [None]:
# Keep only the 'Furniture' category. Here we have just one category, i.e. furniture;
# this filter will help to extract the data if there are multiple categories.
# .copy() makes Furniture an independent frame, so modifying it later does not act on
# (or warn about) a slice of storedataset.
Furniture = storedataset.loc[storedataset['Category'] == 'Furniture'].copy()

# To extract a single sub-category instead (e.g. only the chair data):
# Furniture1 = storedataset.loc[storedataset['Sub-Category'] == 'Chairs']


In [None]:
Furniture.head()

In [None]:
Furniture.shape

In [None]:
print(len(Furniture))

In [None]:
# Checking the minimum and maximum order date for furniture
print(Furniture['Order Date'].min())
print(Furniture['Order Date'].max())

In [None]:
# Check the columns in our Furniture dataframe
Furniture.columns

In [None]:
# Columns that are not needed for the forecast, grouped by theme for readability.
cols = [
    # order / shipping details
    'Row ID', 'Order ID', 'Ship Date', 'Ship Mode',
    # customer details
    'Customer ID', 'Customer Name', 'Segment',
    # geography
    'Country', 'City', 'State', 'Postal Code', 'Region',
    # product details
    'Product ID', 'Category', 'Sub-Category', 'Product Name',
    # remaining measures
    'Quantity', 'Discount', 'Profit',
]

# Check with the stakeholder which measure they want to forecast: sales, quantity, discount or profit.
# This notebook forecasts 'Sales'.

In [None]:
# Drop all the columns listed in `cols` so that only 'Order Date' and 'Sales' remain.
# The result is assigned back rather than dropped in place: Furniture was produced by
# filtering storedataset, and in-place mutation of such a frame can raise
# SettingWithCopyWarning.
Furniture = Furniture.drop(columns=cols)


In [None]:
Furniture.shape

In [None]:
# Sort the dataset chronologically. sort_values returns a new frame, so the result
# has to be assigned back for the ordering to persist beyond this cell.
Furniture = Furniture.sort_values('Order Date')
Furniture

In [None]:
# Group the sales by order date and sum them to get the total sales per day,
# then turn the date back into a regular column.
daily_sales = Furniture.groupby('Order Date')['Sales'].sum()
Furniture = daily_sales.reset_index()

In [None]:
Furniture

In [None]:
# Set the 'order date' as index
Furniture = Furniture.set_index('Order Date')
Furniture.index

In [None]:
# Daily timestamps can be tricky to work with, so the series is resampled to monthly
# frequency: each month is represented by the average of its daily sales values and is
# stamped with the start of the month ('MS').

monthly_mean_sales = Furniture['Sales'].resample('MS').mean()
y = monthly_mean_sales.to_frame()

In [None]:
# Show the monthly values from 2017 onwards (label-based row slice on the date index).
y.loc['2017':]

In [None]:
# Check the trend
y.plot()

In [None]:
# Check the dataset components by using the Decomposition method
from statsmodels.tsa.seasonal import seasonal_decompose

decomposition = seasonal_decompose(y['Sales'] , period = 12)
decomposition.plot()
plt.show()

In [None]:
#The plot above clearly shows that the sales of furniture is unstable, along with its obvious seasonality.


In [None]:
# Check whether our data is stationary or not using the Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller

In [None]:
adfuller(y['Sales'])

In [None]:
# Create a function to explain the above values (remember this function; you can always reuse it)
def adf_check(timeseries):
    """Run the Augmented Dickey-Fuller test on ``timeseries`` and print a summary.

    The first four values returned by ``adfuller`` are printed with labels,
    followed by a verdict based on the p-value (5% threshold).

    Parameters
    ----------
    timeseries : array-like
        Observations, passed unchanged to ``adfuller``.

    Returns
    -------
    None
        The report is printed rather than returned.
    """
    result = adfuller(timeseries)
    print('Augmented Dickey-Fuller Test : ')
    labels = ['ADF Test Statistics', 'p-value', '#Lags', 'No of obs']

    # zip stops at the shorter input, so only as many leading entries of
    # ``result`` as there are labels get printed.
    for value, label in zip(result, labels):
        print(label + ' : ' + str(value))

    # result[1] is the p-value (second label above).
    if result[1] <= 0.05:
        print('\nStrong evidence against null hypothesis and my time series is stationary')
    else:
        print('\nWeak evidence against null hypothesis and my time series is non stationary')

In [None]:
adf_check(y['Sales'])

In [None]:
# Our data is stationary
# Trend d = 0

In [None]:
# Calculating the D value for seasonality
y['seasonality'] = y['Sales'] - y['Sales'].shift(12)

In [None]:
y.head()

In [None]:
adf_check(y['seasonality'].dropna())

In [None]:
# D = 1

In [None]:
# To find the values of p&P and q&Q we will use graphs
from statsmodels.graphics.tsaplots import plot_pacf , plot_acf

In [None]:
plot_pacf(y['Sales'] , lags = 14)
plt.show()

In [None]:
# p = 0

In [None]:
plot_acf(y['Sales'].dropna(), lags = 14)
plt.show()

In [None]:
# q = 0

In [None]:
plot_pacf(y['seasonality'].dropna(), lags = 14)
plt.show()

In [None]:
plot_acf(y['seasonality'].dropna(), lags = 14)
plt.show()

In [None]:
# P = 0
# Q = 0

In [None]:
# Trend:
# d = 0 
# p = 0       
# q = 0

# Seasonality:
# P = 0 
# D = 1 
# Q = 0 

### Model Time Series Analysis and Forecasting

In [None]:
# Automate the search for the best (p, d, q) values: the combination with the lowest AIC is preferred
import itertools

In [None]:
# NOTE(review): statsmodels.tsa.arima_model was deprecated in statsmodels 0.12 and its ARIMA
# class no longer works from 0.13 on (replacement: statsmodels.tsa.arima.model.ARIMA).
# ARIMA is not referenced in the cells visible here — confirm before relying on it.
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

In [None]:
# Candidate values (0 and 1) shared by every ARIMA term.
p = d = q = range(2)

# Every non-seasonal (p, d, q) triple, and the same triples extended with a
# seasonal period of 12 for the seasonal part of the model.
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(*triple, 12) for triple in itertools.product(p, d, q)]

print('Examples of parameter combinations for Seasonal ARIMA...')
for order_idx, seasonal_idx in ((1, 1), (1, 2), (2, 3), (2, 4)):
    print('SARIMAX: {} x {}'.format(pdq[order_idx], seasonal_pdq[seasonal_idx]))


In [None]:
# Grid search: fit a SARIMAX model for every (p, d, q) x (P, D, Q, 12) combination,
# print the AIC of each fit and remember the combination with the lowest AIC.
best_aic = float('inf')
best_order = None
best_seasonal_order = None

for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            model = sm.tsa.statespace.SARIMAX(y['Sales'],
                                              order=param,
                                              seasonal_order=param_seasonal,
                                              enforce_stationarity=False,
                                              enforce_invertibility=False)
            results = model.fit()
        except Exception as err:
            # A combination may fail to fit; report it and move on instead of hiding
            # the failure. Unlike a bare ``except`` this does not swallow
            # KeyboardInterrupt, so a long search can still be stopped.
            print('ARIMA{}x{} - skipped ({}: {})'.format(
                param, param_seasonal, type(err).__name__, err))
            continue

        print('ARIMA{}x{} - AIC :{}'.format(param, param_seasonal, results.aic))
        # A NaN AIC compares False here and therefore never becomes the best.
        if results.aic < best_aic:
            best_aic = results.aic
            best_order = param
            best_seasonal_order = param_seasonal

print('Best: ARIMA{}x{} - AIC :{}'.format(best_order, best_seasonal_order, best_aic))

In [None]:
# ARIMA(1, 1, 1)x(1, 1, 1, 12) - AIC :283.3661018696623

In [None]:
# Fit the seasonal ARIMA model: non-seasonal order (1, 1, 1) and
# seasonal order (1, 1, 1) with a period of 12.
sarimax_kwargs = {
    'order': (1, 1, 1),
    'seasonal_order': (1, 1, 1, 12),
    'enforce_stationarity': False,
    'enforce_invertibility': False,
}
model = sm.tsa.statespace.SARIMAX(y['Sales'], **sarimax_kwargs)
results = model.fit()


In [None]:
print(results.summary())

In [None]:
print(len(y))

In [None]:
# Predict with the training dataset
# start=37 is a zero-based position in the fitted series; with dynamic=True the
# predictions from that point on build on earlier predictions rather than on the
# observed sales. Rows of y before the start receive no prediction and stay NaN
# when the result is aligned to y's index.

y['forecast'] = results.predict(start = 37 , dynamic = True) 


In [None]:
y.head()

In [None]:
y.tail()

In [None]:
y[['Sales','forecast']].plot()
plt.show()

In [None]:
# The line plot shows the observed values compared to the rolling forecast predictions.
# Overall, our forecasts align with the true values very well: they show an upward trend that
# starts at the beginning of the year and capture the seasonality toward the end of the year.


In [None]:
# Will generate some future dates to make prediction

In [None]:
from pandas.tseries.offsets import DateOffset

In [None]:
y.head()

In [None]:
y.tail()

In [None]:
# 13 timestamps one month apart: the last date in y followed by the next 12 months.
last_observed = y.index[-1]
future_dates = [last_observed + DateOffset(months=step) for step in range(13)]

In [None]:
future_dates

In [None]:
future_dates_df = pd.DataFrame(index = future_dates[1:], columns =y.columns)

In [None]:
future_dates_df.head()

In [None]:
future_dates_df.tail()

In [None]:
# Extend y with the (empty) future rows. Reindexing keeps y's column dtypes and fills
# the new rows with NaN, whereas concatenating with an all-NaN frame relies on pandas
# behaviour that is deprecated (all-NA entries will stop being ignored when the result
# dtype is chosen, which would turn the numeric columns into object columns).
future_df = y.reindex(y.index.append(future_dates_df.index))

In [None]:
future_df.head(20)

In [None]:
future_df.tail(20)

In [None]:
future_df.shape

In [None]:
# Forecast from the last row of y through the last appended future row.
# The positions are derived from the frame lengths instead of being hard-coded.
# NOTE(review): they equal the previous literals 47 and 59 when y has 48 rows and
# future_df has 60 — confirm with len(y) and future_df.shape.
forecast_start = len(y) - 1
forecast_end = len(future_df) - 1
future_df['forecast'] = results.predict(start=forecast_start, end=forecast_end, dynamic=True)

In [None]:
future_df[['Sales','forecast']].plot()

In [None]:
future_df.tail(20)

In [None]:
future_df.to_csv('sales_forecasting.csv')