# DeepAR Only

This is part of a modified version of "Chris Tegho 220111.ipynb".

https://ts.gluon.ai/ 

I should also refer to "Chris Tegho via Slack 220110.docx"

In [None]:
#Import libraries
from helpers import *  #kept first so the explicit imports below take precedence on any name clash

import datetime as dt
import os
from urllib.parse import quote

from sqlalchemy import create_engine
import psycopg2

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from gluonts.dataset.common import ListDataset
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.util import to_pandas
from gluonts.model import deepar
from gluonts.mx.trainer import Trainer
from gluonts.evaluation import make_evaluation_predictions

import matplotlib.pyplot as plt
%matplotlib inline

#This suppresses all warnings in the notebook - turn it on once happy that the code works
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
#Redshift user credentials - read from the environment so secrets are never saved in the notebook
#Set REDSHIFT_USER and REDSHIFT_PASSWORD before starting Jupyter; a missing value is left as ''
USER = os.environ.get("REDSHIFT_USER", "")
PASSWORD = os.environ.get("REDSHIFT_PASSWORD", "")

FCST_PERIOD = 9   #How many months I want to forecast ahead

In [None]:
#Create SQLAlchemy engine for Redshift database
user = USER
password = PASSWORD
host = os.environ.get("REDSHIFT_HOST", "")   #cluster endpoint - read from the environment, not stored here
port='5439'
dbname='prod'

#Fail here with a clear message rather than later with an opaque connection error
if not (user and password and host):
    raise ValueError("Redshift user, password and host must all be set before creating the engine")

#Percent-encode the credentials so characters such as '@', ':' or '/' cannot corrupt the URL
url = "postgresql+psycopg2://{0}:{1}@{2}:{3}/{4}".format(
    quote(user, safe=""), quote(password, safe=""), host, port, dbname)
engine = create_engine(url)

# A. Get data from Redshift

In [None]:
#This is all hardcoded for the moment
#ALSO I have a single database read version of this code that I can use instead of this
#The df_catalog concept was from when I wanted to downselect based on product attributes
#e.g. the Business clustering rules

#NOTE(review): the total is aliased qty_12m but the window below is FCST_PERIOD months - confirm which is intended
query = f"""
select
    isbn + ship_to_country_key as key,
    isbn,
    isbn_short,
    subject_2_key,
    series_key,
    series_short,
    family_key,
    family_name,
    ship_to_country_key as country,
    sum(quantity_demanded) as qty_12m
from r2ibp.f_demand_actual t1
left join r2ibp.lu_product t2
on t1.isbn = t2.isbn13
where last_day(date) <= current_date
and last_day(date) > dateadd(month, -{FCST_PERIOD}, current_date)
and ship_to_country_key = 'ES'
group by isbn, isbn_short, subject_2_key, series_key, series_short, family_key, family_name, ship_to_country_key
order by qty_12m desc
"""

#The context manager closes the connection even if the query raises
with engine.connect() as conn:
    df_catalog = pd.read_sql_query(query, conn)

In [None]:
#Hand every catalogue key to get_demand (not defined in this notebook - presumably from helpers)
key_list = df_catalog['key'].tolist()

df_demand = get_demand(key_list, engine)

# B. Pivot into a dataframe

NB Drop negative values and replace NaNs (i.e. missing values) with zeroes

Also simplify the columns index

In [None]:
#Only key, month and qty are needed from df_demand
demand_long = df_demand.loc[:, ['key', 'month', 'qty']]
has_demand = demand_long['qty'] > 0

#Keep the rows with positive qty, spread the months across the columns and zero-fill the gaps.
#Passing values='qty' gives a single-level columns index straight away.
df_pivoted = (
    demand_long.loc[has_demand]
    .pivot(index='key', columns='month', values='qty')
    .fillna(0)
)

del demand_long, has_demand

# C. Prepare the data for modelling

The prediction will be for FCST_PERIOD months


Scale/normalise the data - start by just scaling based on the max value, as I did for clustering

Need to split the data into X and y (the last FCST_PERIOD months of each window), and into train (ending FCST_PERIOD months ago) and test (ending with the last full month). A validation set is also carved out of the train set to check training performance.

Finally convert dataframes into numpy arrays to input into DeepAR

In [None]:
#Normalise each key (row) by its own maximum monthly demand
dfMax = df_pivoted.max(axis='columns')
df_scaled = df_pivoted.div(dfMax, axis='index')
df_scaled.tail()

In [None]:
#Set key parameters for data prep and modelling
n_features = 1 #i.e. a single quantity for each month

n_total_steps = df_scaled.shape[1] #i.e. the total number of months
n_steps_out = FCST_PERIOD
n_steps_in = n_total_steps - 2*n_steps_out # Need to chop off both the train and test FCST_PERIOD months!

#Fail early: the negative slices below silently return empty frames when the history is too short
if n_steps_out <= 0 or n_steps_in <= 0:
    raise ValueError(
        f"Need more than {2*n_steps_out} months of history for FCST_PERIOD={n_steps_out}, got {n_total_steps}")

#Split into train and test X and y
df_X = df_scaled.iloc[:, :-(2*n_steps_out)]
df_y = df_scaled.iloc[:, -(2*n_steps_out):-n_steps_out]

#Fixed random_state so the train/validation split is identical on every run
df_X_train, df_X_val, df_y_train, df_y_val = train_test_split(
    df_X, df_y, train_size=0.92, random_state=42) #default is 75:25 split

#The test window is the train window moved forward by n_steps_out months
df_X_test = df_scaled.iloc[:, n_steps_out:-n_steps_out]
df_y_test = df_scaled.iloc[:, -n_steps_out:]


In [None]:
%%capture --no-display

#Format data to be used by DeepAR using ListDataset()

def _to_list_dataset(df_X, df_y, freq='M'):
    """Build a ListDataset with one entry per row of df_X.

    Each entry's target is the row of df_X followed by the same key's row of df_y.
    Every entry starts on the first column label of df_X.

    Parameters:
        df_X: wide frame (index = key, columns = months) with the input months.
        df_y: wide frame with the same index holding the months that follow df_X.
        freq: frequency string passed to ListDataset.

    Returns:
        The ListDataset built from the rows.
    """
    #No freq on the Timestamp: ListDataset is given freq and that kwarg is deprecated in pandas
    start = pd.Timestamp(df_X.columns[0])
    entries = [
        {
            FieldName.TARGET: df_X.loc[key].to_list() + df_y.loc[key].to_list(),
            FieldName.START: start,
        }
        for key in df_X.index
    ]
    return ListDataset(entries, freq=freq)

train_ds = _to_list_dataset(df_X_train, df_y_train)

#NB df_X_test begins FCST_PERIOD months after df_X_train, so its start must come from
#df_X_test itself; using df_X_train's first month would mislabel every test month
test_ds = _to_list_dataset(df_X_test, df_y_test)

In [None]:
%%capture --no-display

#Sanity check: peek at the first series held in train_ds

#Take the first entry straight off a fresh iterator
train_entry = next(iter(train_ds))

#Convert the entry with to_pandas - the result is a series object without headings
train_series = to_pandas(train_entry)

train_series.head()
#To eyeball it as a chart instead:
# train_series.plot();
# plt.show()


# D. Model and Predict

## D.1 Train Model

In [None]:
#Fit DeepAR on the training series: 30 epochs at a learning rate of 0.01
trainer = Trainer(learning_rate=1e-2, epochs=30)
estimator = deepar.DeepAREstimator(
    trainer=trainer,
    prediction_length=FCST_PERIOD,
    freq="M",
)
predictor = estimator.train(training_data=train_ds)

## D.2 Predict

In [None]:
#make_evaluation_predictions hands back two single-use iterators. Materialise them
#once here so that every later cell can read them (a second pass over an exhausted
#iterator would silently yield nothing).
forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_ds,  # test dataset
    predictor=predictor,  # predictor
)
forecast_it = list(forecast_it)
ts_it = list(ts_it)


Help on how to analyse the output can be found in https://stackoverflow.com/questions/61416951/export-multiple-gluonts-forecasts-to-pandas-dataframe

The code below reformats the forecast_it to the same format as used in the KERAS models for simple comparison

In [None]:
%%capture --no-display

def create_sample_df(forecast):
    """Return the forecast's samples as a DataFrame: one row per forecast date, one column per sample."""
    sample_paths = forecast.samples
    _, horizon = sample_paths.shape   #(number of samples, number of forecast periods)
    forecast_dates = pd.date_range(start=forecast.start_date, periods=horizon, freq=forecast.freq)
    return pd.DataFrame(data=sample_paths.transpose(), index=forecast_dates)

#Iterate forecast_it and collapse each entry's samples to their per-month median.
#NB create_sample_df has one COLUMN PER SAMPLE PATH (samples.T), not one per percentile,
#so picking column 50 would return a single random path rather than the median.
median_rows = [create_sample_df(entry).median(axis=1).to_numpy()
               for entry in forecast_it]

#Same structure as the KERAS forecasts (to reuse code): one row per entry, one column per month
yhat = np.vstack(median_rows)

#Set up the rescaled df for plotting at the same time
df_yhat = pd.DataFrame(data=yhat, index=df_y_test.index, columns=df_y_test.columns).mul(dfMax, axis = 0)

del median_rows


# E. Calculate metrics and plot results

## E.1 Calculate metrics

Calculate total demand and RMSE metrics

In [None]:
#This function works across the whole arrays

def calc_prediction_metrics_from_array(y_test, yhat, y_naive1):
    """Compare the model forecast and the naive-1 forecast against the actuals, row by row.

    Parameters:
        y_test: 2-D array of actuals, one row per series and one column per month.
        yhat: 2-D array of model predictions, same shape as y_test.
        y_naive1: 2-D array of naive-1 predictions, same shape as y_test.

    Returns:
        A list of ten 1-D arrays (one value per row), in this order:
        sum_naive1, sum_pred, sum_act, diff_naive1_act, diff_pred_act, abs_pred_closer,
        rmse_naive1, rmse_pred, pred_rmse_lower, rmse_pc_diff.
        rmse_pc_diff is inf or nan for rows where the naive-1 RMSE is zero.

    Raises:
        ValueError: if the three inputs do not share the same shape.
    """
    y_test = np.asarray(y_test, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    y_naive1 = np.asarray(y_naive1, dtype=float)
    if not (y_test.shape == yhat.shape == y_naive1.shape):
        raise ValueError(
            f"Shape mismatch: y_test {y_test.shape}, yhat {yhat.shape}, y_naive1 {y_naive1.shape}")

    #Total demand over the forecast horizon
    sum_pred = np.sum(yhat, axis=1)
    sum_naive1 = np.sum(y_naive1, axis=1)
    sum_act = np.sum(y_test, axis=1)

    diff_pred_act = sum_pred - sum_act
    diff_naive1_act = sum_naive1 - sum_act

    abs_pred_closer = (abs(diff_pred_act) < abs(diff_naive1_act))

    #Per-row RMSE computed directly with numpy, so no scikit-learn metric is needed here
    rmse_pred = np.sqrt(np.mean(np.square(y_test - yhat), axis=1))
    rmse_naive1 = np.sqrt(np.mean(np.square(y_test - y_naive1), axis=1))

    pred_rmse_lower = (rmse_pred < rmse_naive1)
    #A zero naive-1 RMSE gives inf/nan; keep those values but silence the numpy warning
    with np.errstate(divide='ignore', invalid='ignore'):
        rmse_pc_diff = ((rmse_pred - rmse_naive1)/rmse_naive1)*100

    return [sum_naive1, sum_pred, sum_act, diff_naive1_act, diff_pred_act, abs_pred_closer,
                                rmse_naive1, rmse_pred, pred_rmse_lower, rmse_pc_diff]

In [None]:
X_test = df_X_test.to_numpy()
y_test = df_y_test.to_numpy()

#Naive-1 takes, from the last 12 months of X_test, the first FCST_PERIOD months
#(i.e. back 12 months and then FCST_PERIOD forward). That only exists inside X_test
#when the horizon is at most 12 months and X_test holds at least 12 months.
if not (0 < FCST_PERIOD <= 12) or X_test.shape[1] < 12:
    raise ValueError(
        f"naive-1 needs 0 < FCST_PERIOD <= 12 and at least 12 input months "
        f"(FCST_PERIOD={FCST_PERIOD}, input months={X_test.shape[1]})")

naive_start = X_test.shape[1] - 12
y_naive1 = X_test[:, naive_start:naive_start + FCST_PERIOD]

In [None]:
%%capture --no-display
#This suppresses all warnings - in this case divide by zero
    
metrics = calc_prediction_metrics_from_array(y_test, yhat, y_naive1)

#Column names for the metric arrays, in the order the function returns them
metric_names = ['sum_naive1', 'sum_pred', 'sum_act', 'diff_naive1_act', 'diff_pred_act', 'abs_pred_closer',
                'rmse_naive1', 'rmse_pred', 'pred_rmse_lower', 'rmse_pc_diff']

#One row per key and one named column per metric, with all values rounded to 2 dp
df_metrics = (
    pd.DataFrame(df_X_test.index, columns=['key'])
    .assign(**dict(zip(metric_names, metrics)))
    .round(2)
)

In [None]:
#plot_pred_naive1 is not defined in this notebook - presumably it is provided by
#`from helpers import *` and plots the prediction vs naive-1 metrics (TODO confirm)
plot_pred_naive1(df_metrics)

## E.2 Plot selected ISBN countries

In [None]:
#ISBNs to inspect; each key is the ISBN followed by the country code
_isbns_to_plot = [
    '9780521148597',
    '9780521148559',
    '9781108457651',
    '9781108794091',
    '9781108381208',
    '9788490365809',
    '9788490369883',
    '9788490361078',
    '9788490369975',
    '9780521221689',
]
plot_list = [isbn + 'ES' for isbn in _isbns_to_plot]

print(plot_list)

In [None]:
#Two-column grid with enough rows for every key (rounding up)
n_rows = int(np.ceil(len(plot_list) / 2))
fig, axes = plt.subplots(n_rows, 2, figsize=(16, n_rows * 4))

#Needed to get the period of pred (month values)
x_pred = df_y_test.columns


def _row_as_ts(df_wide, key):
    """Return the row(s) of df_wide labelled key as a long 'qty' frame indexed by datetime month."""
    row = df_wide[df_wide.index == key]
    ts = pd.melt(row, var_name='month', value_name='qty').set_index('month')
    ts.index = pd.to_datetime(ts.index)
    return ts


#axes.flat walks the grid row by row, pairing one subplot with each key
for ax, key in zip(axes.flat, plot_list):

    #Full history of actuals and the rescaled predictions, both as time series
    ts_actuals = _row_as_ts(df_pivoted, key)
    ts_pred = _row_as_ts(df_yhat, key)

    #naive-1: the FCST_PERIOD rows ending 12 rows before the end, moved forward 12 months
    naive_rows = slice(-(12 + FCST_PERIOD), -12)
    ts_naive1 = ts_actuals[naive_rows].shift(periods=12, freq='M')

    ax.plot(ts_actuals[-24:], '-o', label="actuals")   #only the last 2 years
    ax.plot(ts_pred, '-o', label="predicted")
    ax.plot(ts_naive1, '-o', label="naive-1")
    ax.grid()
    ax.legend(fontsize=12)
    ax.set_title(key)

plt.tight_layout()
plt.show()

df_metrics.loc[df_metrics['key'].isin(plot_list)]

#These are all significantly worse than naive-1
#NB It doesn't help that we are now forecasting over a quiet part of the year

* Imbalance - try more balanced training examples (select them manually if possible) - check distribution of data
* Take more conventional ML - CatBoost - lightGBM
* Predict median of next 12 months - try to figure out conv ML makes sense
* Standardisation instead of normalisation

# Extra CT Code

This is some extra plotting code that I don't need

In [None]:
def plot_prob_forecasts(ts_entry, forecast_entry):
    """Plot the last 48 points of ts_entry together with forecast_entry and its 50%/90% intervals.

    The legend labels assume forecast_entry.plot draws the median first and then the
    intervals from widest to narrowest - TODO confirm against the GluonTS version in use.
    """
    n_points_shown = 12*4
    prediction_intervals = (50.0, 90.0)
    #Interval labels are listed widest first
    interval_labels = [f"{k}% prediction interval" for k in reversed(prediction_intervals)]
    legend_labels = ["observations", "median prediction", *interval_labels]

    fig, ax = plt.subplots(1, 1, figsize=(10, 7))
    observed_tail = ts_entry[-n_points_shown:]
    observed_tail.plot(ax=ax)  # the observed time series
    forecast_entry.plot(prediction_intervals=prediction_intervals, color='g')
    plt.grid(which="both")
    plt.legend(legend_labels, loc="upper left")
    plt.show()

In [None]:
#Turn both iterators into lists so that a single element can be picked by position
forecasts = [*forecast_it]
tss = [*ts_it]

#Let's take element 20
gh = 20
forecast_entry, ts_entry = forecasts[gh], tss[gh]

plot_prob_forecasts(ts_entry, forecast_entry)