In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw CSV; kept in one named constant so the source is easy to spot.
SHAMPOO_CSV_URL = "https://raw.githubusercontent.com/blue-yonder/pydse/master/pydse/data/sales-of-shampoo-over-a-three-ye.csv"

# Read with the default comma separator and preview the first rows.
data = pd.read_csv(SHAMPOO_CSV_URL)
data.head()

In [None]:
def convertToDate(string):
    """Turn a raw ``"<year-index>-<month>;<sales>"`` cell into ``"YYYY-MM"``.

    The text before the first ``;`` is taken as the date part. Its first
    dash-separated field is an integer year index that is offset by 2018
    (so ``1`` becomes ``2019``); its second field is used verbatim as the
    month.

    Raises ``ValueError`` if the year index is not an integer and
    ``IndexError`` if the date part contains no ``-``.
    """
    date_fields = string.split(';')[0].split('-')
    # The year is evaluated before the month, so a bad year fails first.
    return f"{int(date_fields[0]) + 2018}-{date_fields[1]}"

# The CSV is read as one combined column; split it into its two parts.
raw_column = 'Month;"Sales"'

# Derived columns are added to `data` itself; `dta` is a new frame without the raw column.
data["Month"] = [convertToDate(cell) for cell in data[raw_column]]
data["Sales"] = [cell.split(";")[1] for cell in data[raw_column]]
dta = data.drop(columns=[raw_column])
dta

In [None]:
# Show the raw frame, which now also carries the derived "Month" and "Sales" columns.
data

In [None]:
from datetime import datetime
# Parse the "YYYY-MM" strings into timestamps in one vectorised call.
# Unlike a per-row strptime, this is safe to re-run on an already-converted column.
dta["Month"] = pd.to_datetime(dta["Month"], format="%Y-%m")

In [None]:
# Inspect dta after the Month column has been converted to datetimes.
dta

In [None]:
# Build the modelling frame: Month becomes the index and the remaining
# column(s) are cast to float. set_index returns a new frame, so `dta`
# is left untouched and no explicit copy or column drop is needed.
df = dta.set_index("Month").astype(float)
df

In [None]:
import matplotlib.pyplot as plt
# Draw every column of df against its index on one wide, white-background figure.
fig = plt.figure(figsize=(15, 5))
fig.set_facecolor("white")
fig.add_subplot(1, 1, 1).plot(df)
plt.show()


In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the raw Sales series.
# NOTE(review): this module-level `result` is not read again in the visible
# cells; adfuller_test() below recomputes the test itself.
result = adfuller(df["Sales"])
# H0 : Data is not stationary
# H1 : Data is stationary


def adfuller_test(sales, alpha=0.05):
    """Run the Augmented Dickey-Fuller test on ``sales`` and print a summary.

    Parameters
    ----------
    sales : array-like
        Series passed straight to ``adfuller``.
    alpha : float, default 0.05
        Significance level the p-value (``result[1]``) is compared against.

    Prints the first four values returned by ``adfuller`` with a label each,
    followed by the test verdict. Returns ``None``.
    """
    result = adfuller(sales)
    labels = ["adfuller test statistics", "p value", "lags used", 'Number of observations used']
    # zip stops at the shorter sequence, so only the first four result values are printed.
    for label, value in zip(labels, result):
        print(f"{label} : {value}")
    if result[1] <= alpha:
        print("Reject null hypothesis. data is stationary")
    else:
        # A large p-value is not evidence for H0; we only fail to reject it.
        print("Fail to reject null hypothesis. data is non stationary")


adfuller_test(df["Sales"])

In [None]:
# Differencing: each value minus the previous one (the first row has no predecessor, so it is NaN).
df["first difference"] = df["Sales"].diff()
df

In [None]:
# Repeat the stationarity check on the differenced series; dropna() removes
# the leading NaN produced by differencing.
adfuller_test(df["first difference"].dropna())

In [None]:
# Plot the differenced series (leading NaN dropped).
df["first difference"].dropna().plot()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# ACF (top panel) and PACF (bottom panel) of the differenced series, 16 lags each.
first_diff = df["first difference"].dropna()

fig = plt.figure(figsize=(12, 8))
fig.patch.set_facecolor("white")
for position, draw_correlogram in ((211, plot_acf), (212, plot_pacf)):
    ax1 = fig.add_subplot(position)
    # Each plotting call's return value is rebound to `fig`, as before.
    fig = draw_correlogram(first_diff, lags=16, ax=ax1)

In [None]:
# d = 1: one order of ordinary (non-seasonal) differencing; this ARIMA has no seasonal term.
# The model is fitted on the undifferenced Sales series; the middle value of `order` is the d above.
dt = df["Sales"].dropna()
from statsmodels.tsa.arima.model import ARIMA
# order = (p, d, q) = (10, 1, 1)
model = ARIMA(dt, order=(10,1,1))
fittedmodel = model.fit()
# fittedmodel.summary()  # uncomment to display the full fit report
fittedmodel.params

In [None]:
import statsmodels.api as sm
# Information criteria of the fitted model, printed in the order AIC, BIC, HQIC.
print(fittedmodel.aic, fittedmodel.bic, fittedmodel.hqic)

In [None]:
# Durbin-Watson statistic computed on the fitted model's residuals.
dw_test_value = sm.stats.durbin_watson(fittedmodel.resid.values)
dw_test_value


In [None]:
# plot residuals
fig = plt.figure(figsize=(12,6))
fig.patch.set_facecolor('white')
ax = fig.add_subplot(111)
fig = fittedmodel.resid.plot(ax= ax)

In [None]:
from scipy import stats
# Keep the residuals under a short name and run scipy's normaltest on them;
# the test result is displayed as the cell output.
resids = fittedmodel.resid
stats.normaltest(resids)


In [None]:
from statsmodels.graphics.api import qqplot

%matplotlib inline
# Q-Q plot of the residuals, drawn on the single axes of a fresh 12x8 figure.
ax = plt.figure(figsize=(12, 8)).add_subplot(1, 1, 1)
fig = qqplot(resids, ax=ax, fit=True, line='q')

In [None]:
# Predict the twelve months of 2021 with dynamic prediction switched on.
prediction = fittedmodel.predict(
    start="2021-01-01",
    end="2021-12-01",
    dynamic=True,
)
prediction

In [None]:
# Overlay the prediction on the observed series.
fig = plt.figure(figsize=(15,8))
fig.patch.set_facecolor("w")
ax = fig.add_subplot(111)
# Plot through the Axes and label each line; without labels the legend is
# empty and matplotlib warns that there is nothing to show.
ax.plot(dt, label="Actual sales")
ax.plot(prediction, label="Dynamic prediction")
ax.legend()
plt.show()


In [None]:
# find the best model
# For each candidate AR order p, fit ARIMA(p, 1, 1) and record AIC + BIC + HQIC.
from statsmodels.tsa.arima.model import ARIMA

p_values = [10,11,12,14,15,16]
sums = []
# The input series does not change between candidates, so build it once.
dt = df["Sales"].dropna()
for p in p_values:
    # NOTE: `model` and `fittedmodel` are rebound on every pass and end up
    # holding the last candidate (p = 16) once the loop finishes.
    model = ARIMA(dt, order=(p,1,1))
    fittedmodel = model.fit()
    # Append directly instead of going through a variable named `sum`,
    # which would shadow the builtin for the rest of the session.
    sums.append(fittedmodel.aic + fittedmodel.bic + fittedmodel.hqic)
    

In [None]:
# Candidate orders on the first line, their summed criteria on the second.
print(p_values, sums, sep="\n")

In [None]:
# set p = 14
# Refit with the chosen order, predict 2021 dynamically and plot it against the data.
dt = df["Sales"].dropna()
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(dt, order=(14,1,1))
fittedmodel = model.fit()

prediction = fittedmodel.predict("2021-01-01", "2021-12-01", dynamic=True)

fig = plt.figure(figsize=(15,8))
fig.patch.set_facecolor("w")
ax = fig.add_subplot(111)
# Label both lines so the legend has entries; an unlabelled plot leaves it
# empty and triggers a matplotlib warning.
ax.plot(dt, label="Actual sales")
ax.plot(prediction, label="ARIMA(14,1,1) dynamic prediction")
ax.legend()
plt.show()



In [None]:
import pickle

# Persist the fitted model. The context manager flushes and closes the file
# when the block exits, so the pickle is complete on disk before it is read back.
with open("model/arima.pickle", "wb") as file_dir:
    pickle.dump(fittedmodel, file_dir)

In [None]:
import pickle

# Reload the persisted model; the context manager closes the handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files this
# notebook (or another trusted source) produced.
with open("model/arima.pickle", "rb") as model_file:
    model = pickle.load(model_file)

# Dynamic prediction from 2021-01 through 2022-12.
predvals = model.predict("2021-1", "2022-12", dynamic=True)
predvals

In [None]:
# Flatten the prediction Series into a two-column table: its index and its values.
forecast_columns = {
    "Date": predvals.index,
    "Sales Prediction": predvals.values,
}
predvals_ = pd.DataFrame(forecast_columns)

In [None]:

# Write the prediction table to CSV; index=False omits the frame's row index.
predvals_.to_csv("Result/sample.csv", index=False)