In [None]:
# pmdarima supplies `auto_arima`, which the import cell below pulls in.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
%pip install pmdarima

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_absolute_error

## Problem Definition:
We will predict the total sales per week for the single store that has the highest number of sales.

In [None]:
# Sales records; the columns `date`, `store_nbr`, `sales` and `onpromotion` are used below.
data_train = pd.read_csv(
    '../input/store-sales-time-series-forecasting/train.csv'
)

In [None]:
# Oil price series; the columns `date` and `dcoilwtico` are used below.
oil = pd.read_csv(
    '../input/store-sales-time-series-forecasting/oil.csv'
)

In [None]:
# Store metadata, keyed by `store_nbr`.
store = pd.read_csv(
    '../input/store-sales-time-series-forecasting/stores.csv'
)

In [None]:
# Holiday/event calendar; `date`, `transferred`, `locale` and `locale_name` are used below.
holidays_events = pd.read_csv(
    '../input/store-sales-time-series-forecasting/holidays_events.csv'
)

## Data Analysis:
We will use store number 44, because it is the store with the highest total sales. If we look at store size, we see that store 20 is in 9th place.

In [None]:
# Total sales per store, largest first; show the top five.
(
    data_train
    .groupby(['store_nbr'])['sales']
    .sum()
    .sort_values(ascending=False)
    .head()
)

We filter the datasets down to store number 44 only, since we model a single store.

In [None]:
# Keep only store 44 in both the sales table and the store metadata.
# .copy() detaches the subsets so later column assignments do not touch the source frames.
data_train_choice = data_train.loc[data_train['store_nbr'].eq(44)].copy()
store_choice = store.loc[store['store_nbr'].eq(44)].copy()

In [None]:
# Column dtypes and non-null counts for the store-44 subset.
data_train_choice.info()

In [None]:
# Parse `date` into datetime64 so it can be used for date ranges and merges later on.
data_train_choice = data_train_choice.assign(
    date=pd.to_datetime(data_train_choice['date'])
)

As we can see, there are no duplicate rows.

In [None]:
# Compare the shape before and after dropping fully duplicated rows.
deduplicated = data_train_choice.drop_duplicates()
print("Rows: ", data_train_choice.shape)
print("Remove duplicates rows:", deduplicated.shape)

In [None]:
# Collapse the store's rows to one row per day: total sales and total items on promotion.
data_train_choice = (
    data_train_choice
    .groupby('date', as_index=False)[['sales', 'onpromotion']]
    .sum()
)

There are 4 days missing from the dataset, and every first day of the year has 0 sales, which may be related to end-of-year events. The 4 missing days are all December 25 (Christmas), and we fill them with 0.

In [None]:
# Build the full daily calendar between the first and last observed date,
# then compare its length with the number of days that actually have rows.
first_day = data_train_choice['date'].min()
last_day = data_train_choice['date'].max()
dates = pd.date_range(start=first_day, end=last_day)
print('Total Days sales:', len(data_train_choice))
print('Total Days Corret:', len(dates))

In [None]:
# Days on which the store recorded no (or non-positive) sales.
data_train_choice.loc[data_train_choice['sales'].le(0)]

In [None]:
# Left-join the sales onto the complete calendar so that missing days
# appear as rows with NaN instead of being absent.
dates = pd.DataFrame(dates, columns=['date'])
data_train_choice = pd.merge(dates, data_train_choice, how='left', on='date')

In [None]:
# Days introduced by the calendar merge have no data; treat them as zero sales/promotions.
data_train_choice = data_train_choice.fillna(0)

**Holidays Events Analysis**:

As we can see, events that were transferred have the `transferred` flag set to True. We can remove these events by index.

In [None]:
# Inspect the calendar entries for 2012-10-09 (`date` is still a plain string here).
holidays_events.loc[holidays_events['date'].eq('2012-10-09')]

In [None]:
# Inspect the calendar entries for 2012-10-12 (`date` is still a plain string here).
holidays_events.loc[holidays_events['date'].eq('2012-10-12')]

In [None]:
# Index labels of every event whose `transferred` flag is True.
index_events_trasnferred = holidays_events.loc[holidays_events['transferred']].index

In [None]:
# Remove the transferred events collected in `index_events_trasnferred`.
holidays_events = holidays_events.drop(index=index_events_trasnferred)

We will keep the events of the city of Quito together with the national events.

In [None]:
# Show the metadata row(s) selected for store 44.
store_choice

In [None]:
# Keep events whose locale name is Quito, plus every event marked as National.
is_quito_event = holidays_events['locale_name'].eq('Quito')
is_national_event = holidays_events['locale'].eq('National')
holidays_events = holidays_events[is_quito_event | is_national_event]

**Oil data analysis:**

We have missing oil values for a few days. We impute them with the value from the previous day. We cannot apply the average or another imputation method that ignores order, because each value depends on the past. There are only 2 rows for which we cannot impute a value: the dates 2013-01-01 and 2017-07-04.

In [None]:
# Parse `date` into datetime64 so it matches the dtype of the sales calendar for the merge.
oil = oil.assign(date=pd.to_datetime(oil['date']))

In [None]:
# Fraction of missing values in each column of `oil`.
oil.isna().mean()

In [None]:
# Fill each missing value from the row directly above it, one step only:
# a NaN whose predecessor is also NaN (or the very first row) stays missing.
oil = oil.ffill(limit=1)

In [None]:
# Rows whose oil price is still missing after the imputation step.
oil.loc[oil['dcoilwtico'].isna()]

**Merge the dataframes**:

In [None]:
# `data_train_choice['date']` is datetime64 while `holidays_events['date']` still holds the
# raw CSV strings. Convert explicitly so membership is tested datetime-vs-datetime instead of
# depending on how `isin` coerces strings.
holiday_dates = pd.to_datetime(holidays_events['date'])

# 1 when the day appears in the filtered holiday calendar, 0 otherwise.
data_train_choice['holiday'] = data_train_choice['date'].isin(holiday_dates).astype(int)

# Attach the oil price; calendar days absent from `oil` get NaN here.
data_train_choice = data_train_choice.merge(oil, on='date', how='left')

As we can see, there are dates with missing oil values, and we fill them with the value from the previous day. We do this operation twice, because there are cases with two consecutive NaNs.

In [None]:
# Carry the last known oil price forward over at most two consecutive missing days;
# longer gaps and any leading NaN are left untouched.
data_train_choice['dcoilwtico'] = data_train_choice['dcoilwtico'].ffill(limit=2)

We will divide the data into 2 parts: the train data and the test data. We can't use standard (shuffled) cross-validation, because it would break the order of the series and we would end up using future dates in training to predict past values.

In [None]:
# Chronological split: the first `train_size` share of days is used for training.
train_size = 0.70

# Integer row position of the boundary day. Positional slicing avoids feeding a float
# label into `.loc` and does not depend on the index labels being exactly 0..n-1.
split_pos = int(np.round(data_train_choice.shape[0] * train_size))

# The boundary row is intentionally the LAST row of X_train and the FIRST row of X_test.
# In X_test it only serves as the reference value for the later `.diff()`; its own
# difference is NaN and is removed by `dropna()` before evaluation, so the evaluated
# test period starts on the day right after the training period ends.
X_train = data_train_choice.iloc[:split_pos + 1]
X_test = data_train_choice.iloc[split_pos:]

# Index by date with an explicit daily frequency, as required by the time-series models.
X_train = X_train.set_index('date').asfreq('d')
X_test = X_test.set_index('date').asfreq('d')

In [None]:
def plot_ts_decompose(decompose,figsize=(12,8)):
    """Draw the four components of a seasonal decomposition as stacked line plots.

    Parameters
    ----------
    decompose : object exposing ``observed``, ``trend``, ``seasonal`` and ``resid``
        Each attribute is plotted against its own index, top to bottom in that order.
    figsize : tuple, default (12, 8)
        Size passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    components = (
        decompose.observed,
        decompose.trend,
        decompose.seasonal,
        decompose.resid,
    )
    for axis, component in zip(axes, components):
        sns.lineplot(data=component, x=component.index, y=component, ax=axis)
    plt.tight_layout()

In [None]:
# Additive decomposition of the training sales into trend, seasonal and residual parts.
decompose = seasonal_decompose(X_train['sales'],model='additive')

As we can see, sales are low in 2013; in 2014 some months have high sales and others low; from September onwards sales stay high until January 2015, and then they drop.

In [None]:
# Plot the observed, trend, seasonal and residual series of the decomposition.
plot_ts_decompose(decompose)

As we can see, there is some seasonal pattern in the series and a high correlation between present and past values, so the series is not white noise. But in the autocorrelation, if one lag is correlated with the present, the following lags inherit that correlation. For this reason we need to look at the partial autocorrelation, which removes the effect of the intermediate lags from each lag.

In [None]:
# Autocorrelation of the training sales (alpha=0.05 draws the 95% confidence band).
# The trailing `;` suppresses the returned figure's repr in the cell output.
plot_acf(X_train['sales'], alpha = 0.05);

We have a correlation with 1 and 7 day sales in the past;

In [None]:
# Partial autocorrelation of the training sales (alpha=0.05 draws the 95% confidence band).
# The trailing `;` suppresses the returned figure's repr in the cell output.
plot_pacf(X_train['sales'], alpha = 0.05);

As we can see, there is a seasonal pattern on days 5, 6 and 15, 16; that is, roughly every 5 days we have an increase in sales.

In [None]:
# Zoom in: decompose only the training days before 2013-03-30 so the short-period
# seasonal pattern is visible in the plot.
early_sales = X_train.loc[X_train.index < '2013-03-30', 'sales']
decompose = seasonal_decompose(early_sales, model='additive')
plot_ts_decompose(decompose)

We need a stationary series because it is easier to predict: the statistical properties of the past will be the same in the future. Time series with trend and seasonality are not stationary, because these characteristics affect the value of the series at different times. To check this, we use the Dickey-Fuller test: if p-value <= 0.05 we reject the null hypothesis (H0), i.e. the data does not have a unit root and is stationary.

In [None]:
# Augmented Dickey-Fuller test on the raw training sales; element 1 of the result is the p-value.
adf_result = adfuller(X_train['sales'])
p_value_adfuller = adf_result[1]
print(p_value_adfuller)

We can make the series stationary either by taking the differences between consecutive observations or by applying a logarithmic transform.

In [None]:
# First difference of sales (day-over-day change); the first row is NaN by construction.
X_train = X_train.assign(diff_sales=X_train['sales'].diff())

In [None]:
# Repeat the stationarity test and the (partial) autocorrelation plots on the differenced
# series, after removing the NaN produced by differencing.
diff_sales_clean = X_train['diff_sales'].dropna()
p_value_adfuller = adfuller(diff_sales_clean)[1]
print(p_value_adfuller)
plot_acf(diff_sales_clean, alpha=0.05);
plot_pacf(diff_sales_clean, alpha=0.05);

As we can see, only diff_onpromotion has a (low) level of correlation with sales. We differenced the variables because a correlation between raw series does not necessarily mean that they are truly related.

In [None]:
# Day-over-day changes of the promotion count and of the oil price.
X_train = X_train.assign(
    diff_onpromotion=X_train['onpromotion'].diff(),
    diff_dcoilwtico=X_train['dcoilwtico'].diff(),
)

In [None]:
# Pairwise correlation matrix of all X_train columns (raw and differenced).
X_train.corr()

As we can see, when we have holidays we have more sales than when we don't.

In [None]:
# Summary statistics of sales and differenced sales, split by the holiday flag (0/1).
X_train.groupby('holiday')[['diff_sales','sales']].describe()

## Conclusion:
* Store code 44 is the store with the most sales;
* Two dates are atypical: first day of the year and Christmas, the store is closed;
* There is a seasonal pattern every 5 days: days 5 and 6, or 15 and 16, have more sales;
* There is some correlation with promotion and sales.

## Model:

In [None]:
# Apply the same differencing to the test frame; its first row becomes NaN by construction.
X_test = X_test.assign(
    diff_onpromotion=X_test['onpromotion'].diff(),
    diff_sales=X_test['sales'].diff(),
)

We use 6 as the autoregressive order, as seen in the partial autocorrelation plot, and 2 as the moving-average order, as seen in the autocorrelation plot, because in the autocorrelation each lag carries over the information of the previous lags.

In [None]:
# Only rows with NaN in a column the model actually uses are dropped. A bare `dropna()`
# would also discard rows because of unrelated columns (e.g. the oil price), which can
# leave holes in the daily index that `freq='D'` requires.
arima_columns = ['diff_sales', 'diff_onpromotion', 'holiday']
arima_train = X_train.dropna(subset=arima_columns).copy()

# ARMA(6, 2) on the already-differenced sales (hence d=0), with the promotion change and
# the holiday flag as exogenous regressors.
model_arima = ARIMA(
    arima_train['diff_sales'],
    freq='D',
    exog=arima_train[['diff_onpromotion', 'holiday']],
    order=(6, 0, 2),
)
result = model_arima.fit(method_kwargs={"warn_convergence": False})

print(result.summary())

As we can see, there is no pattern in the residuals, and they have a near-zero mean and uniform variance.

In [None]:
# Residual diagnostics: residuals over time (left), their density (right),
# and the ADF p-value of the residual series.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
residuals = result.resid
sns.lineplot(x=residuals.index, y=residuals.values, ax=ax[0])
sns.kdeplot(x=residuals.values, ax=ax[1])
print(adfuller(residuals)[1])

In [None]:
# Drop only rows that are NaN in a column the model uses (in practice the first row, whose
# differences are undefined). A bare `dropna()` would also remove days because of unrelated
# columns such as the oil price, which would break the contiguous forecast horizon.
arima_test = X_test.dropna(subset=['diff_sales', 'diff_onpromotion', 'holiday'])

In [None]:
# Forecast the whole test period, supplying the future values of the exogenous regressors.
forecast_start = arima_test.index.min()
forecast_end = arima_test.index.max()
predict_arima = result.predict(
    start=forecast_start,
    end=forecast_end,
    exog=arima_test[['diff_onpromotion', 'holiday']],
)

As we can see, we have a high error.

In [None]:
# Actual differenced sales (blue) against the ARIMA forecast (black), plus the MAE.
fig, ax = plt.subplots(figsize=(12, 3))
actual_diff_sales = arima_test['diff_sales']
ax.plot(arima_test.index, actual_diff_sales, color='blue')
ax.plot(arima_test.index, predict_arima, color='black')
plt.tight_layout()
print(mean_absolute_error(actual_diff_sales, predict_arima))

As we can see, the best model chosen by auto_arima was an ARIMA with autoregressive order 5 and moving-average order 2; it has a better AIC than our ARIMA, but a higher MAE.

In [None]:
# Exogenous regressors shared by the fit and the forecast.
exog_columns = ['diff_onpromotion', 'holiday']

# pmdarima receives exogenous regressors through `X`. `exog=` and `freq=` are not named
# parameters of `auto_arima`; they only land in its catch-all fit arguments, so the
# regressors were not part of the model search.
model_auto_arima = auto_arima(arima_train['diff_sales'], X=arima_train[exog_columns])

print(model_auto_arima.summary())

# A model fitted with X needs the matching future regressors: one row per forecast step.
predict_autoarima = model_auto_arima.predict(
    n_periods=len(arima_test),
    X=arima_test[exog_columns],
    index=arima_test.index,
)

In [None]:
# Actual differenced sales (blue) against the auto_arima forecast (black), plus the MAE.
fig, ax = plt.subplots(figsize=(12, 3))
actual_diff_sales = arima_test['diff_sales']
ax.plot(arima_test.index, actual_diff_sales, color='blue')
ax.plot(arima_test.index, predict_autoarima, color='black')
plt.tight_layout()
print(mean_absolute_error(actual_diff_sales, predict_autoarima))

## Model Conclusion:
* The auto_arima output is the best for short-term prediction, with low MAE and AIC.
* We can add other exogenous variables such as: temperature, unemployment rate and the CPI (consumer price index)
* Can test with regressors from machine learning models.


And that’s it! It has been a pleasure to make this kernel, I have learned a lot! Thank you for reading and if you like it, please upvote it!
