# First Name:
# Last Name:

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

# Read in CSV File

In [None]:
# Load sales.csv (path is relative to the working directory) into a DataFrame
# and preview its first rows to check that the file parsed as expected.
lottery = pd.read_csv("sales.csv")
lottery.head()

# Data Management

In [None]:
# Convert the "Week Ending Date" strings to pandas datetimes so they can be
# used as a time index. The explicit format reads each value as
# day/month/year instead of letting pandas guess the order.
# NOTE(review): `datetime` is imported here but not referenced anywhere in the
# visible notebook - confirm whether the import is still needed.
from datetime import datetime
lottery['Week Ending Date'] = pd.to_datetime(lottery["Week Ending Date"], format="%d/%m/%Y")
lottery.head()

In [None]:
# Index the DataFrame by week-ending date so that every column selected from
# it is a date-indexed time series.
lottery = lottery.set_index("Week Ending Date")
lottery.head()

In [None]:
# Keep only the series under analysis. TS names the column; selecting it with
# a one-element list yields a single-column DataFrame, and the copy keeps
# later edits to sub1 from touching `lottery`.
TS = "Pulltab"
sub1 = lottery.loc[:, [TS]].copy()
sub1.head()

In [None]:
# Convert time series values to numeric, then print summary statistics for
# sub1 as a quick sanity check.
# pd.to_numeric is called with its default error handling, so a value that
# cannot be parsed as a number stops the cell rather than passing through.
sub1[TS] = pd.to_numeric(sub1[TS])
print(sub1.describe())

# Visualise Time Series

In [None]:
# Plot the raw lottery data with respect to time.
# sub1 is indexed by week-ending date, so that index supplies the x-axis.
# The trailing semicolon stops the notebook from echoing the return value.
plt.plot(sub1);

In [None]:
# Create a new variable storing just the month from the week ending date.
# The month is read from the date index and is used as the grouping column
# for the per-month box plot.
sub1['Month'] = sub1.index.month
sub1.head()

# Box plot

In [None]:
import seaborn as sns
# Draw one box per value of "Month" (x-axis) showing the distribution of the
# TS column (y-axis); the returned Axes is kept in `ax`.
ax = sns.boxplot(x="Month", y=TS, data=sub1)

In [None]:
# This box plot shows the distribution of values for each month.
# In a stationary time series the distributions should look alike from
# month to month: the medians and spreads should stay roughly constant
# rather than shifting with the time of year.

# 2. Stationarity  - Check

In [None]:
 def test_stationarity(timeseries, window=12):
     """Plot a series together with its rolling mean and rolling standard deviation.

     This is a visual check only: nothing is returned and no statistical test
     is run. A rolling mean or standard deviation that drifts over time
     suggests the series is not stationary.

     Args:
         timeseries: Object exposing ``.rolling(window=...)`` that matplotlib
             can plot; the notebook passes a date-indexed pandas Series.
         window: Number of consecutive observations in each rolling window.
             Defaults to 12, the value that was previously hard-coded.
     """
     # Rolling statistics over a trailing window of `window` observations.
     rolling = timeseries.rolling(window=window)
     rolmean = rolling.mean()
     rolstd = rolling.std()

     # Plot the original series and both rolling statistics on one set of axes.
     plt.plot(timeseries, color='blue', label='Original')
     plt.plot(rolmean, color='red', label='Rolling Mean')
     plt.plot(rolstd, color='black', label='Rolling Std')
     plt.legend(loc='best')
     plt.title('Rolling Mean & Standard Deviation')
     plt.show(block=False)

In [None]:
# Plot the selected series with its rolling mean and rolling standard
# deviation to judge stationarity by eye.
test_stationarity(sub1[TS])

In [None]:
# The rolling-statistics plot shows the rolling mean varying up and down,
# indicating that this may not be a stationary time series.

In [None]:
from statsmodels.tsa.stattools import adfuller

#Perform Dickey-Fuller test:
def test_Dickey_Fuller(timeseries):
    """Run the augmented Dickey-Fuller test on `timeseries` and print a summary.

    The summary is a pandas Series holding the first four values returned by
    ``adfuller`` (labelled test statistic, p-value, lags used and number of
    observations used) followed by one 'Critical Value (<level>)' entry per
    item in the fifth returned value. Nothing is returned.

    Args:
        timeseries: Series of observations, passed to ``adfuller`` with
            ``autolag='AIC'``.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')

    # Headline figures first, in the order adfuller returns them.
    headline_labels = [
        'Test Statistic',
        'p-value',
        '#Lags Used',
        'Number of Observations Used',
    ]
    summary = dict(zip(headline_labels, adf_result[:4]))

    # Then one labelled row per critical value.
    summary.update(
        ('Critical Value (%s)' % level, threshold)
        for level, threshold in adf_result[4].items()
    )
    print(pd.Series(summary))

In [None]:
# Perform the Dickey-Fuller test on the time-series.
# NOTE(review): the figures below appear to have been copied from an earlier
# run - re-check them against the output printed by this cell.
# Test Statistic: -2.387211
# p-value: 0.145379
# Critical value (1%): -3.467632 > -3.469648
# Critical value (5%): -2.877918 > -2.878799
# The p-value provided by the Dickey-Fuller test is > 0.05, so we fail to
# reject the null hypothesis of a unit root; the time-series is treated as
# non-stationary.
test_Dickey_Fuller(sub1[TS])

# Make Time Series Stationary

# Decomposing

In [None]:
# Display the natural log of the selected lottery time series (column TS).
# np.log() calculates the natural log of all values entered, these can then be plotted on a graph.
# The log transform is used to stabilize the variance of the time series for future operations.
# Read the values from sub1 rather than lottery: sub1[TS] is the copy that was
# explicitly converted to numeric, so the log is always taken on numbers.
ts_log = np.log(sub1[TS])
plt.plot(ts_log);

In [None]:
# Split the logged series into trend, seasonal and residual components.
# Looking at the parts separately helps us to analyze the series and to
# make it stationary.
from statsmodels.tsa.seasonal import seasonal_decompose

decomposition = seasonal_decompose(ts_log)
trend, seasonal, residual = (
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

# Draw the four series as stacked panels, top to bottom, each with a legend.
panels = [
    (ts_log, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for row, (series, label) in enumerate(panels, start=1):
    plt.subplot(4, 1, row)
    plt.plot(series, label=label)
    plt.legend(loc='best')
plt.tight_layout();

In [None]:
# Test stationarity of residual of data.
# The residuals of a time series are the difference between the observed and fitted values.
# They are examined to check whether the model has obtained all the information in the data.
# dropna() returns a new Series, so missing residuals are removed for the
# checks below without modifying `residual` (and the decomposition result it
# comes from) in place.
ts_log_decompose = residual.dropna()
test_stationarity(ts_log_decompose)

In [None]:
# Perform Dickey-Fuller test on residual.
# NOTE(review): the figures below appear to have been copied from an earlier
# run - re-check them against the output printed by this cell.
# Test Statistic: -5.602576
# p-value: 0.000001
# Critical value (1%): -3.482088 > -3.483779
# Critical value (5%): -2.884219 > -2.884954
# The p-value provided by the Dickey-Fuller test is < 0.05, so we reject the null hypothesis; the time-series is stationary.
test_Dickey_Fuller(ts_log_decompose)

# Plot ACF & PACF chart & find optimal parameter

In [None]:
from statsmodels.tsa.stattools import acf, pacf

In [None]:
# Measure how strongly the series is correlated with its own past.
# The ACF and PACF are computed on the first difference of the logged series
# (each value minus the previous one; the leading NaN is dropped), for lags
# 0 through 20. Their shapes are used to choose the ARIMA parameters.
ts_log_diff = ts_log.diff().dropna()
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')

In [None]:
%matplotlib inline
# Plot the ACF (left) and PACF (right) side by side.
# Each panel gets a dashed reference line at zero and at +/-1.96/sqrt(N),
# where N is the length of the differenced series.
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
for position, values, title in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.subplot(position)
    plt.plot(values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(title)
plt.tight_layout()

# Build ARIMA model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
# Plot ARIMA model of time series.
# The ARIMA model is created by comparing the time series to itself in the past
# using the parameters retrieved from the ACF and PACF.
# This model helps us to predict future values based on past values.
# The fitted values (red) are drawn over the differenced log series.
# NOTE(review): this cell uses the legacy statsmodels.tsa.arima_model.ARIMA
# interface (fit(disp=...)). That class is no longer usable in recent
# statsmodels releases (0.13+, to be confirmed against the installed version).
# Porting to statsmodels.tsa.arima.model.ARIMA would also require revisiting
# the prediction cells, since they treat fittedvalues as differences.
model = ARIMA(ts_log, order=(1, 1, 1))  #(p,d,q)
results_ARIMA = model.fit(disp=-1)  
plt.plot(ts_log_diff)
plt.plot(results_ARIMA.fittedvalues, color='red');

# Make predictions

In [None]:
# Copy the model's fitted values into their own Series so the following
# steps work on a copy rather than on the model results.
# Presumably these are on the differenced log scale (they are plotted against
# ts_log_diff) - confirm for the statsmodels version in use.
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)

In [None]:
# Running total of the fitted values: the accumulated change at each date.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()

In [None]:
# Rebuild predictions on the log scale: start every date at the first logged
# observation, then add the accumulated fitted changes. Dates with no
# accumulated value are treated as a change of 0 (fill_value=0).
# .iloc[0] selects the first observation by position; a bare integer key on a
# date-indexed Series is deprecated in recent pandas.
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)

In [None]:
# Plot of predictions vs actual time series.
# These are the predictions made by the ARIMA model. These are the final product of
# the time series analysis and can be used to assist in making real-world decisions
# about the future.
# np.exp undoes the earlier log transform so the predictions are on the
# original scale. The actual values are read from sub1, the copy that was
# explicitly converted to numeric.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(sub1[TS])
plt.plot(predictions_ARIMA);