In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Code partially taken from the book "Machine Learning for Time Series Forecasting with Python" by Francesca Lazzeri. Source code:
https://github.com/FrancescaLazzeri/Machine-Learning-for-Time-Series-Forecasting**

# **First step - exploratory data analysis**

**Import required libraries**

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import warnings
from collections import UserDict
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image
%matplotlib inline
import matplotlib.dates as mpl_dates
import seaborn as sns
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.api import acf, pacf, graphics
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
import math
from keras.models import Model, Sequential
from keras.layers import GRU, Dense
from keras.callbacks import EarlyStopping

**Read the datasets**

In [None]:
# Load holidays_events.csv and preview its first 10 rows.
holiday = pd.read_csv('../input/store-sales-time-series-forecasting/holidays_events.csv')
holiday.head(n=10)

In [None]:
# Load oil.csv and preview its first 10 rows.
oil = pd.read_csv('../input/store-sales-time-series-forecasting/oil.csv')
oil.head(n=10)

In [None]:
# Load stores.csv and preview its first 10 rows.
stores = pd.read_csv('../input/store-sales-time-series-forecasting/stores.csv')
stores.head(n=10)

In [None]:
# Load transactions.csv and preview its first 10 rows.
transactions = pd.read_csv('../input/store-sales-time-series-forecasting/transactions.csv')
transactions.head(n=10)

**Check the rows with empty values**

In [None]:
# Missing values per column in the holiday frame.
holiday.isna().sum()

In [None]:
# Missing values per column in the oil frame.
oil.isna().sum()

In [None]:
# Discard every oil row that has at least one missing value, then show the
# remaining non-null count per column.
oil = oil.dropna()
oil.count()

In [None]:
# Missing values per column in the stores frame.
stores.isna().sum()

In [None]:
# Missing values per column in the transactions frame.
transactions.isna().sum()

**We convert the `date` columns to datetime format, since the time series tools used below need proper datetime values rather than strings.**

In [None]:
# Parse the 'date' column of each dated frame into pandas datetimes.
for frame in (holiday, oil, transactions):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Summary statistics for the object-typed columns of the holiday frame.
holiday.describe(include=[object])

In [None]:
# Summary statistics for the object-typed columns of the stores frame.
stores.describe(include=[object])

**Create a lag plot for datasets. Lag plots are used to check if a time series is random: random data should not exhibit any structure in the lag plot**

In [None]:
# Display options: thousands separator and two decimals for pandas floats,
# two decimals for NumPy arrays, and silence library warnings.
# The format spec must be "{:,.2f}": a space between the comma and the dot
# ("{:, .2f}") is an invalid specifier and raises ValueError as soon as pandas
# renders a float.
pd.options.display.float_format = "{:,.2f}".format
np.set_printoptions(precision=2)
warnings.filterwarnings("ignore")

# import lag_plot function
from pandas.plotting import lag_plot
sns.set()

# lag_plot defaults to lag = 1, i.e. it scatters data[:-1] against data[1:]
# plot the holiday dates
lag_plot(holiday['date'])

In [None]:
# lag-1 scatter of the oil dates
lag_plot(oil['date'])

In [None]:
# lag-1 scatter of the transactions dates
lag_plot(transactions['date'])

*As we can see, the data has a clear linear structure.*

**Create an autocorrelation plot for each dataset. Autocorrelation plots are used to check randomness in a time series by computing the autocorrelation of the data values at varying time lags. They are heavily used in time series analysis and forecasting. The plot shows the correlation between the current observations and the observations at previous time steps, called lags.**

In [None]:
# import autocorrelation plot function
from pandas.plotting import autocorrelation_plot
sns.set()

# Move the current index into a column (in place), replace the holiday dates
# with their matplotlib date numbers as floats, then plot the autocorrelation.
holiday.reset_index(inplace=True)
holiday['date'] = holiday['date'].apply(mpl_dates.date2num).astype(float)
autocorrelation_plot(holiday['date'])

In [None]:
# Move the current index into a column (in place), replace the oil dates with
# their matplotlib date numbers as floats, then plot the autocorrelation.
oil.reset_index(inplace=True)
oil['date'] = oil['date'].apply(mpl_dates.date2num).astype(float)
autocorrelation_plot(oil['date'])

In [None]:
# Move the current index into a column (in place), replace the transactions
# dates with their matplotlib date numbers as floats, then plot the
# autocorrelation.
transactions.reset_index(inplace=True)
transactions['date'] = transactions['date'].apply(mpl_dates.date2num).astype(float)
autocorrelation_plot(transactions['date'])

In [None]:
# import plot_acf() function
from statsmodels.graphics.tsaplots import plot_acf

# autocorrelation function of the holiday date values
plot_acf(holiday['date'])
plt.show()

In [None]:
# autocorrelation function of the oil date values
plot_acf(oil['date'])
plt.show()

In [None]:
# autocorrelation function of the transactions date values
plot_acf(transactions['date'])
plt.show()

In [None]:
# import plot_pacf() function
from statsmodels.graphics.tsaplots import plot_pacf

# partial autocorrelation of the holiday date values, up to 20 lags
plot_pacf(holiday['date'], lags=20)
plt.show()

In [None]:
# partial autocorrelation of the oil date values, up to 30 lags
plot_pacf(oil['date'], lags=30)
plt.show()

In [None]:
# partial autocorrelation of the transactions date values, up to 40 lags
plot_pacf(transactions['date'], lags=40)
plt.show()

**Autoregression modeling. 
Autoregression is a time series model that uses observations from previous time steps as input to a regression equation to predict the value at the next time step. It is a very simple idea that can result in accurate forecasts on a range of time series problems.**

In [None]:
# Fit an AR(1) model on the holiday series.
# The model must be built from holiday['date']; it was previously built from
# oil['date'], which made resultsH a duplicate of the oil model.
modelH = AutoReg(holiday['date'], 1)
resultsH = modelH.fit()
resultsH.summary()

In [None]:
# Fit an AR(1) model on the oil series and show the fit summary.
modelO = AutoReg(oil['date'], lags=1)
resultsO = modelO.fit()
resultsO.summary()

In [None]:
# Fit an AR(1) model on the transactions series and show the fit summary.
modelT = AutoReg(transactions['date'], lags=1)
resultsT = modelT.fit()
resultsT.summary()

**Visualize the forecasts**

In [None]:
# Plot styling: seaborn dark grid, pandas date converters registered with
# matplotlib, and an 18x8 default figure size.
sns.set_style('darkgrid')
pd.plotting.register_matplotlib_converters()
sns.mpl.rc('figure', figsize=(18, 8))

# predictions of the holiday model from observation 120 through 490
figure = resultsH.plot_predict(start=120, end=490)

In [None]:
# predictions of the oil model from observation 120 through 490
figure = resultsO.plot_predict(start=120, end=490)

In [None]:
# predictions of the transactions model from observation 120 through 490
figure = resultsT.plot_predict(start=120, end=490)

**Plot_diagnostics indicates that the model captures the key features in the data. They help us determine visually how our model is fitting the data and if any of the basic assumptions of an OLS (ordinary least squares) model are being violated.**

In [None]:
# Diagnostic plots for the holiday model, drawn on an 18x10 figure with 30 lags.
fig = plt.figure(figsize=(18, 10))
fig = resultsH.plot_diagnostics(lags=30, fig=fig)

In [None]:
# Diagnostic plots for the oil model, drawn on an 18x10 figure with 30 lags.
fig = plt.figure(figsize=(18, 10))
fig = resultsO.plot_diagnostics(lags=30, fig=fig)

In [None]:
# Diagnostic plots for the transactions model, drawn on an 18x10 figure with
# 30 lags.
fig = plt.figure(figsize=(18, 10))
fig = resultsT.plot_diagnostics(lags=30, fig=fig)

**Training the model for oil dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Split the oil frame into the first 67% of rows (train) and the last 33%
# (test). shuffle=False keeps the rows in their original order: a random
# shuffle would mix later observations into the training data and break the
# sliding-window forecast that follows.
X = oil

X_train, X_test = train_test_split(X, test_size=0.33, shuffle=False)

print('Training data shape X: ', X_train.shape)
print('Test data shape X: ', X_test.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min-max ranges from the training split only, then apply those same
# ranges to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# number of steps to forecast ahead
HORIZON = 3
print('Forecasting horizon: ', HORIZON, 'days')

In [None]:
# Walk-forward validation on the test data: before each test observation,
# refit an AR(1) model on the most recent `training_window` values and
# forecast HORIZON steps past the end of that window.
training_window = 720

# The window tracks column 0 of the scaled arrays -- the same column the loop
# appends from every test row -- so it only ever holds scalars.
history = [row[0] for row in X_train]
history = history[-training_window:]
predictions = list()

for t in range(X_test.shape[0]):
    # fit on the current sliding window rather than on the full oil frame
    modelO = AutoReg(np.asarray(history), 1)
    modelO_fit = modelO.fit()
    # predict() has to be called; a bare `.predict` only stores the bound
    # method. start/end select the HORIZON out-of-sample steps.
    yhat = modelO_fit.predict(start=len(history), end=len(history) + HORIZON - 1)
    predictions.append(yhat)
    obs = list(X_test[t])
    # move the training window forward by one observation
    history.append(obs[0])
    history.pop(0)
    print(X_test[t])
    print(t+1, ': predicted =', yhat, 'expected =', obs)

**Deep learning for time series forecasting**

In [None]:
# read the data
X = pd.read_csv('../input/store-sales-time-series-forecasting/test.csv')
y = pd.read_csv('../input/store-sales-time-series-forecasting/train.csv')

# Draw 5000 dates from each file. The fixed random_state makes the draw --
# and everything computed from it below -- reproducible across runs.
X = X['date'].sample(5000, random_state=42)
y = y['date'].sample(5000, random_state=42)

In [None]:
# forecasting horizon: we are interested in predicting the next day only
HORIZON = 1

# T is the number of lag variables fed to the model
T = 1

In [None]:
# parse both sampled date series into pandas datetimes
X, y = pd.to_datetime(X), pd.to_datetime(y)

In [None]:
# Split features and targets in half. train_test_split returns its outputs in
# the order X_train, X_test, y_train, y_test; unpacking them in any other
# order silently swaps the splits (e.g. test features end up as `y_train`).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

print('Training data shape X: ', X_train.shape)
print('Test data shape X: ', X_test.shape)
print('Training data shape y: ', y_train.shape)
print('Test data shape y: ', y_test.shape)

In [None]:
# Carve a validation set out of the training data (half each). The fixed
# random_state keeps the split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, train_size=0.5, random_state=42
)

print('Validation data shape X: ', X_valid.shape)
print('Validation data shape y: ', y_valid.shape)

In [None]:
# Convert every split to a float32 NumPy array.
X_train, y_train, X_test, y_test, X_valid, y_valid = (
    np.asarray(part).astype(np.float32)
    for part in (X_train, y_train, X_test, y_test, X_valid, y_valid)
)

In [None]:
# Rescale the data to the 0-1 range.
# Features and targets get their own scaler, and each scaler is fitted on the
# training split only; validation and test data are transformed with those
# training ranges. Refitting on every split would leak test statistics and
# put each split on a different scale.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))   # feature scaler
target_scaler = MinMaxScaler(feature_range=(0, 1))

X_train = min_max_scaler.fit_transform(X_train.reshape(-1, 1))
X_valid = min_max_scaler.transform(X_valid.reshape(-1, 1))
X_test = min_max_scaler.transform(X_test.reshape(-1, 1))

y_train = target_scaler.fit_transform(y_train.reshape(-1, 1))
y_valid = target_scaler.transform(y_valid.reshape(-1, 1))
y_test = target_scaler.transform(y_test.reshape(-1, 1))

In [None]:
# Insert a length-T axis in the middle of every split:
# (rows, cols) -> (rows, T, cols).
# NOTE(review): the targets become 3-D as well, while the model defined below
# ends in Dense(HORIZON) -- confirm that the target rank is intended.
X_train, y_train, X_test, y_test, X_valid, y_valid = (
    np.reshape(part, (part.shape[0], T, part.shape[1]))
    for part in (X_train, y_train, X_test, y_test, X_valid, y_valid)
)

**Univariate models**

In [None]:
# Hyper-parameters
LATENT_DIM = 5    # number of units in the RNN layer
BATCH_SIZE = 32   # number of samples per mini-batch
EPOCHS = 15       # maximum number of passes over all training samples

# Sequential model: one GRU layer followed by a dense output of size HORIZON
model = Sequential([
    GRU(LATENT_DIM, input_shape=(T, 1)),
    Dense(HORIZON),
])

model.compile(optimizer='RMSprop', loss='mse')
model.summary()

In [None]:
# stop early once val_loss has not improved for 5 consecutive epochs
GRU_earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)

# train the model, evaluating on the validation split after every epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_valid, y_valid),
    callbacks=[GRU_earlystop],
    verbose=1,
)

In [None]:
# training and validation loss per epoch, on a logarithmic y axis
loss_curves = {
    'train_loss': history.history['loss'],
    'val_loss': history.history['val_loss'],
}
plot_df = pd.DataFrame.from_dict(loss_curves)
plot_df.plot(logy=True, figsize=(10, 10), fontsize=12)
plt.xlabel('epoch', fontsize=12)
plt.ylabel('loss', fontsize=12)
plt.show()

*The loss decreases as the number of epochs increases.*

In [None]:
# Predict on X_test and collect the predictions next to the y_test values.
# ev_ts_data starts as a frame built directly from the prediction array, then
# gains a 'prediction' column (the same values again) and an 'actual' column
# holding y_test flattened to one dimension.
ts_predictions = model.predict (X_test)
ev_ts_data = pd.DataFrame (ts_predictions)
ev_ts_data ['prediction' ]= pd.DataFrame (ts_predictions)
ev_ts_data ['actual'] = np.transpose (y_test).ravel ()

# evaluate our model and compute MAPE (mean absolute percentage error)  
def mape(ts_predictions, actuals):
    """Return the mean absolute percentage error of the predictions.

    Parameters
    ----------
    ts_predictions : pandas.Series
        Predicted values.
    actuals : pandas.Series
        Observed values, indexed like ``ts_predictions``.

    Returns
    -------
    float
        Mean of ``|prediction - actual| / |actual|`` over the rows whose
        actual value is non-zero; ``nan`` when no such row exists.
    """
    # A percentage error is undefined for a zero actual. Min-max scaled
    # targets contain exact zeros, and dividing by them would turn the whole
    # mean into inf/nan, so those rows are left out.
    nonzero = actuals != 0
    abs_error = (ts_predictions[nonzero] - actuals[nonzero]).abs()
    # divide by the magnitude so negative actuals cannot yield a negative error
    return (abs_error / actuals[nonzero].abs()).mean()

mape (ev_ts_data ['prediction'], ev_ts_data ['actual'])

In [None]:
# Plot the rows whose actual value is below 0.3, with 'prediction' on the
# x axis. The filter has to be a boolean mask over the 'actual' column; the
# expression 'actual' < '0.3' compares two string literals and is always
# False, so it never selected rows.
ev_ts_data[ev_ts_data['actual'] < 0.3].plot(x='prediction', style=['r', 'b'], figsize=(15, 8))
plt.xlabel('prediction', fontsize=12)
plt.ylabel('count', fontsize=12)
plt.show()

***Let's try to apply machine learning methods to the transactions dataset***

In [None]:
# print the column dtypes and non-null counts of the transactions frame
transactions.info ()

In [None]:
from sklearn.model_selection import train_test_split

# feature: the date column; target: the transactions count
X, y = transactions['date'], transactions['transactions']

# 50/50 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=21
)

In [None]:
# convert every split to a float32 NumPy array
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

# Rescale to the 0-1 range. Features and targets get their own scaler, each
# fitted on the training split only and then applied to the test split, so no
# test statistics leak into the preprocessing and both splits share one scale.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))   # feature scaler
target_scaler = MinMaxScaler(feature_range=(0, 1))

X_train = min_max_scaler.fit_transform(X_train.reshape(-1, 1))
X_test = min_max_scaler.transform(X_test.reshape(-1, 1))
y_train = target_scaler.fit_transform(y_train.reshape(-1, 1))
y_test = target_scaler.transform(y_test.reshape(-1, 1))

# The scalers already return (n_samples, 1) arrays, which is the 2-D shape
# scikit-learn estimators expect, so no further reshape is needed.

In [None]:
# report the shape of every split
for label, split in (
    ('Training data shape X: ', X_train),
    ('Test data shape X: ', X_test),
    ('Training data shape y: ', y_train),
    ('Test data shape y: ', y_test),
):
    print(label, split.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of the transactions target on the date feature.
# The second argument of fit() must be the training targets (y_train); passing
# X_test trained the model to reproduce test features instead.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

lr_y = y_test
lr_y_fit = lr_model.predict(X_train)
lr_y_pred = lr_model.predict(X_test)

# root mean squared error on the test split
lr_residuals = lr_y_pred - lr_y
lr_rmse = np.sqrt(np.mean(np.power(lr_residuals, 2)))
print('RMSE = %.2f' % lr_rmse)