Let's use an RNN (LSTM) to forecast the data.

In [None]:
import yaml
import pandas as pd 
import os
import matplotlib.pyplot as plt

def get_config():
    """Load the notebook settings from ``config.yaml`` in the working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's contents.
    """
    with open("config.yaml", 'r') as config_file:
        return yaml.safe_load(config_file)

# Read the settings once; the two keys below must be present in config.yaml.
config = get_config()
dir_to = config['directory_to']
file_name_to = config['cleared_data_to']
# Load the CSV found at <directory_to>/<cleared_data_to>, parsing 'Event Date' as datetimes.
cleared_data = pd.read_csv(os.path.join(dir_to, file_name_to), parse_dates=['Event Date'])

Let's prepare data for the prediction of user demand on the regional level.

In [None]:
# Keep only the columns needed for demand counting and drop rows with any missing value.
products_data = cleared_data[['Event Date', 'Sub-Category', 'Event', 'Region']].dropna(how='any')
products_data.head()

Let's prepare data for the prediction of average category prices.

In [None]:
# Keep only the columns needed for price averaging and drop rows with any missing value.
data_price_prediction = cleared_data[['Event Date', 'Sub-Category', 'Event', 'Euro Price/Kg']].dropna(how='any')
data_price_prediction.head()

Let's create a function which returns the dataset used for making predictions. There are two possible types of prediction (the `prediction` parameter): demand and price.

In [None]:
def get_data_for_time_series_analysis(cleared_data, event_type='all', category='all', region='all', prediction='demand'):
    """Aggregate event rows into one value per 'Event Date' for forecasting.

    Args:
        cleared_data: DataFrame with 'Event Date', 'Event' and 'Sub-Category'
            columns, plus 'Region' (demand) or 'Euro Price/Kg' (price).
            The caller's frame is not modified.
        event_type: Value of 'Event' to keep, or 'all' for no filtering.
        category: Value of 'Sub-Category' to keep, or 'all' for no filtering.
        region: Value of 'Region' to keep, or 'all' for no filtering.
            Only applied when prediction == 'demand'.
        prediction: 'demand' counts rows per date (column 'Launches');
            'price' averages the numeric columns per date and renames
            'Euro Price/Kg' to 'Prices'.

    Returns:
        A tuple ``(frame, dates)``: the aggregated DataFrame and its
        'Event Date' column, both without the last two rows.

    Raises:
        ValueError: if prediction is neither 'demand' nor 'price'.
    """
    # Fail fast on a bad mode, before any filtering work is done.
    # ValueError is still an Exception, so existing broad handlers keep working.
    if prediction not in ('demand', 'price'):
        raise ValueError("Wrong prediction type. Use prediction=demand or prediction=price")

    if event_type != 'all':
        cleared_data = cleared_data[cleared_data['Event'] == event_type]
    if category != 'all':
        cleared_data = cleared_data[cleared_data['Sub-Category'] == category]
    cleared_data = cleared_data.drop(columns=['Event', 'Sub-Category'])

    if prediction == 'demand':
        if region != 'all':
            cleared_data = cleared_data[cleared_data['Region'] == region]
        # One row per event, so counting rows per date gives the number of launches.
        aggregated = cleared_data.groupby('Event Date')['Region'].count()
        aggregated = aggregated.reset_index().rename(columns={'Region': 'Launches'})
    else:
        # numeric_only=True keeps text columns (e.g. 'Region') from making
        # mean() raise on recent pandas versions.
        aggregated = cleared_data.groupby('Event Date').mean(numeric_only=True)
        aggregated = aggregated.reset_index().rename(columns={'Euro Price/Kg': 'Prices'})

    # The two most recent dates are always discarded
    # (NOTE(review): presumably because they are incomplete — confirm).
    trimmed = aggregated[:-2]
    return trimmed, trimmed['Event Date']

In [None]:
# Daily launch counts for one event type / sub-category / region combination.
# The commented-out call below is an alternative sub-category kept for experimentation.
# y_demand, dates_demand = get_data_for_time_series_analysis(products_data, 'New Product', 'Cakes - Pastries & Sweet Goods', 'West Europe', 'demand')
y_demand, dates_demand = get_data_for_time_series_analysis(products_data, 'New Product', 'Bread & Bread Products', 'West Europe', 'demand')
y_demand.head()

In [None]:
# Average price per date across all events and sub-categories (no filters applied).
y_price, dates_price = get_data_for_time_series_analysis(data_price_prediction, prediction='price')
y_price.head()

The number of dates and actual values should be equal.

In [None]:
# Sanity check: both lengths printed here must match.
print(len(y_demand), len(dates_demand))

In [None]:
from scipy.signal import savgol_filter  

def plot_df(x, y, title="", xlabel='Date', ylabel='Value', window_length=9, polyorder=3):
    """Plot a series together with its Savitzky-Golay smoothed version.

    Args:
        x: Values for the horizontal axis (dates in this notebook).
        y: Values to plot; same length as x.
        title: Figure title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        window_length: Window size passed to ``savgol_filter``.
        polyorder: Polynomial order passed to ``savgol_filter``.

    The smoothed curve is skipped when the series has fewer than
    ``window_length`` points, because the filter cannot be applied then.
    """
    plt.figure(figsize=(16,5))
    plt.plot(x, y, color='tab:red', label='Actual')

    # savgol_filter (default mode='interp') rejects a window longer than the
    # data, so short series are plotted without the smoothing overlay.
    if len(y) >= window_length:
        yhat = savgol_filter(y, window_length, polyorder)
        plt.plot(x, yhat, color='green', linestyle='dashed', label='Smoothing')

    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.legend(loc='upper right')
    plt.show()

# Visual check of both series (actual values plus smoothed curve).
plot_df(dates_demand, y_demand['Launches'], title='Launches between 2019 and 2023')  
plot_df(dates_price, y_price['Prices'], title='Average prices between 2019 and 2023')  

Let's create a Forecaster object. We can see that the start date for the analysis is 01.06.2019, the end date is 01.01.2023, and the frequency is monthly. There are 44 data points.

In [None]:
from scalecast.Forecaster import Forecaster

# Launch counts as a plain NumPy array.
y = y_demand['Launches'].values

# Forecaster over the demand series, indexed by the matching dates.
# NOTE(review): cis=True presumably enables confidence intervals — confirm in the scalecast docs.
f_demand = Forecaster(
    y=y,
    current_dates=dates_demand,
    cis=True
)
# Last expression of the cell: displays the object's summary.
f_demand

The same characteristics are observed for the average prices dataset.

In [None]:
from scalecast.Forecaster import Forecaster

# Forecaster over the average-price series.
# NOTE(review): unlike f_demand, y is passed as a pandas Series and cis is left at its default.
f_price = Forecaster(
    y=y_price['Prices'],
    current_dates=dates_price
)
# Last expression of the cell: displays the object's summary.
f_price

We use the PACF plot to inspect partial correlations between the actual values and their lags (lags are previous observations; "partial" means without considering any lags in between). The plot_pacf function can only compute partial correlations for lags up to 50% of the sample size, so with 44 data points the lags parameter in the function call should be less than 22.

In [None]:
# Partial autocorrelation of the demand series for lags up to 21 (below half the sample size).
f_demand.plot_pacf(lags=21, method='ywm')
plt.show()

From this plot, it looks like some statistically significant correlations exist between the current and previous observations in the user demand dataset (for lags 1,4,5,14).

In [None]:
# Partial autocorrelation of the price series for lags up to 21 (below half the sample size).
f_price.plot_pacf(lags=21, method='ywm')
plt.show()

No statistically significant correlation exists between the current and previous observations in the price dataset. 

In [None]:
# Decompose the demand series; the result exposes .trend, .resid and .seasonal (plotted below).
res_demand = f_demand.seasonal_decompose()

def plot_seasonal_decompose(res):
    """Draw the trend, residual and seasonal components of *res* in three stacked panels.

    Args:
        res: Decomposition result exposing ``trend``, ``resid`` and
            ``seasonal`` attributes, each with a ``plot`` method.
    """
    _, axes = plt.subplots(3,1, figsize=(18,10))
    panels = (
        ('trend', 'Trend'),
        ('resid', 'Residuals'),
        ('seasonal', 'Seasonal pattern'),
    )
    # Components are drawn top to bottom in the order listed above.
    for axis, (component_name, heading) in zip(axes, panels):
        getattr(res, component_name).plot(ax=axis, title=heading)
    plt.show()
    
# Show trend, residuals and seasonal pattern of the demand series.
plot_seasonal_decompose(res_demand)

There is no linear trend, but we do see a strong seasonality in the user demand dataset! The residuals do not follow any pattern.

In [None]:
# Same decomposition and plot for the price series.
res_price = f_price.seasonal_decompose()
plot_seasonal_decompose(res_price)

We don't see a clear linear trend (but overall there is an upward trend), but we do see a strong seasonality in the price dataset! The residuals appear to be following a pattern too.

Let's normalize the data (rescale it to the range 0 to 1) and split it into train and test datasets. We cannot use standard cross-validation, because the order of the observations matters in a time series.

For each data point starting with the (N+1)-th, the function "create_dataset" returns the list of the N previous observations (where N = look_back) as the first output and the current observation as the second output. For instance, for look_back = 3 the first sample is (x0, x1, x2) and its target is x3.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import math
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

def create_dataset(dataset, look_back=1):
    """Turn a single-column series into supervised (window, next value) pairs.

    Args:
        dataset: 2-D array; only column 0 is read.
        look_back: Number of consecutive observations in each input window.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays: ``X[i]`` holds
        ``dataset[i:i+look_back, 0]`` and ``Y[i]`` is the value that follows
        it, ``dataset[i+look_back, 0]``.
    """
    sample_count = len(dataset) - look_back
    windows = [dataset[start:start + look_back, 0] for start in range(sample_count)]
    targets = [dataset[start + look_back, 0] for start in range(sample_count)]
    return np.array(windows), np.array(targets)

# Seed TensorFlow's random number generator.
tf.random.set_seed(7)
# Number of previous observations fed to the model for each prediction.
look_back = 15

# Series to model; swap in the commented line to model prices instead.
dataset = y_demand['Launches'].values
# dataset = y_price['Prices']. values
dataset = dataset.astype('float32')
# MinMaxScaler works on 2-D input, so make the series a single column.
reshaped_dataset = dataset.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1))
# NOTE(review): the scaler is fitted on the whole series before the train/test split below,
# so the test period's min/max influence the scaling of the training data.
dataset = scaler.fit_transform(reshaped_dataset)

# Chronological split: first 67% for training, the rest for testing.
train_size = int(len(dataset) * 0.67)
# NOTE(review): test_size is not used anywhere in the visible code.
test_size = len(dataset) - train_size
# The test slice starts look_back rows early so that the first test target is dataset[train_size].
train, test = dataset[0:train_size,:], dataset[train_size-look_back:len(dataset),:]

trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# Reshape to (samples, 1, look_back): one time step whose features are the look_back window.
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))


# One LSTM layer with 4 units followed by a single-output dense layer.
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=110, batch_size=1, verbose=2)

trainPredict = model.predict(trainX)
testPredict = model.predict(testX)


# Undo the scaling; the 1-D targets are wrapped in a list to give inverse_transform 2-D input.
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

# RMSE on the un-scaled values.
trainScore = np.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

# NaN-filled arrays the size of the full series; predictions are written at the
# positions of the observations they predict (the first look_back points have none).
trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict

testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+(look_back):len(dataset), :] = testPredict

# Two-point segment joining the last train prediction to the first test prediction,
# so the plotted curve has no gap at the split.
inbetween = np.empty_like(dataset)
inbetween[:, :] = np.nan
inbetween[len(trainPredict)+(look_back)-1:len(trainPredict)+(look_back)+1, :] = [trainPredict[-1], testPredict[0]]

plt.plot(scaler.inverse_transform(dataset), label='Actual')
plt.plot(trainPredictPlot, label='Train set')
plt.plot(testPredictPlot, label='Test set')
plt.plot(inbetween, linestyle='dashed', color = 'blue', alpha=0.5)
plt.legend(loc='upper right')
plt.gca().set(title='LSTM user demand prediction', xlabel='Months', ylabel='Launches')
plt.show()

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def get_data_times(number, dates):
    """Return *dates* extended by *number* further monthly dates.

    Args:
        number: How many future months to append after the last date.
        dates: Non-empty iterable of date values that support
            ``+ relativedelta`` (e.g. pandas Timestamps or datetimes).

    Returns:
        NumPy array holding the original dates followed by the new ones.

    Raises:
        ValueError: if dates is empty.
    """
    known_dates = list(dates)
    if not known_dates:
        raise ValueError("dates must contain at least one value")
    # Take the last element by position: indexing a pandas Series with
    # len(dates)-1 is a label lookup and fails when the index is not 0..n-1.
    last_date = known_dates[-1]
    future_dates = [last_date + relativedelta(months=+offset) for offset in range(1, number + 1)]
    return np.append(known_dates, future_dates)

def make_predictions(number, look_back, dataset, dates=None):
    """Forecast *number* future steps by feeding predictions back into the model.

    Uses the module-level ``model`` and ``scaler`` defined by the training cell.

    Args:
        number: How many steps ahead to forecast.
        look_back: Window size the model was trained with.
        dataset: Scaled series as a 2-D single-column array; its last
            ``look_back`` rows seed the first prediction.
        dates: Dates matching the series. Defaults to the module-level
            ``dates_demand`` so existing calls behave as before; pass the
            matching dates when forecasting a different series.

    Returns:
        A tuple ``(actuals, dates)``: the un-scaled forecasts as a list and the
        input dates extended by ``number`` months (see ``get_data_times``).

    Raises:
        ValueError: if dataset holds fewer than look_back observations.
    """
    if len(dataset) < look_back:
        raise ValueError("dataset must contain at least look_back observations")

    # Model input shape is (samples, 1, look_back); here there is a single sample.
    window = np.asarray(dataset[-look_back:]).reshape(1, 1, -1)
    actuals = []
    for _ in range(number):
        new_predict = model.predict(window)
        new_actual = scaler.inverse_transform(new_predict)
        actuals.append(new_actual.ravel()[0])
        # Slide the window: drop the oldest value, append the (scaled) prediction.
        window = np.append(window[:, :, 1:], new_predict).reshape(1, 1, -1)

    if dates is None:
        dates = dates_demand
    return actuals, get_data_times(number, dates)
        

In [None]:
# Forecast six steps beyond the end of the series.
predictions, datetimes = make_predictions(6, look_back, dataset)

# NaN placeholders for every historical point except the last one ...
predictoinsPlot = np.empty_like(dataset[:-1])
predictoinsPlot[:, :] = np.nan
# ... then the last actual value (un-scaled) followed by the forecasts, so the
# forecast line starts from the final observation.
first_val = scaler.inverse_transform(dataset[-1].reshape(-1, 1))
predictoinsPlot = np.append(predictoinsPlot, np.append(first_val, predictions))

In [None]:
# Draw actuals, fitted train/test values and the forecast on one shared date axis.
plt.figure(figsize=(20,8))

# Every historical series is padded with NaN up to the extended date range,
# so nothing is drawn for it over the forecast months.
horizon = len(datetimes)
observed = scaler.inverse_transform(dataset)

actuals = np.append(observed, np.full(horizon - len(observed), np.nan))
trains = np.append(trainPredictPlot, np.full(horizon - len(trainPredictPlot), np.nan))
tests = np.append(testPredictPlot, np.full(horizon - len(testPredictPlot), np.nan))
inbetweens = np.append(inbetween, np.full(horizon - len(inbetween), np.nan))

plt.plot(datetimes, actuals, label='Actual')
plt.plot(datetimes, trains, label='Train set: RMSE = ' + str(round(trainScore, 2)))
plt.plot(datetimes, tests, label='Test set: RMSE = ' + str(round(testScore, 2)))
# Dashed connector between the last train and the first test prediction.
plt.plot(datetimes, inbetweens, linestyle='dashed', color = 'blue', alpha=0.5)
plt.plot(datetimes, predictoinsPlot, color = 'blue', linewidth=3, label='Predictions')

plt.legend(loc='lower right')
plt.gca().set(title='LSTM user demand prediction', xlabel='Months', ylabel='Launches')
plt.show()