
# Note: this notebook was not made to be run in the given order.

A cell with just the name of the method, such as 
##### GP
precedes cells which are only meant to be run for a specific method(/and on a specific subset).

The idea is that to run this notebook for the gp method (for example), you run only the `all method` and `gp` cells

##### All methods:

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import pickle
import utils.m5_helpers
import os

from reconciliation import get_rollup, get_series_df

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [3]:
DATA_PATH = '../../data/'

## Load and prepare data

#### Load 305 data

##### All methods:

In [4]:
ste = pd.read_csv(f'{DATA_PATH}external/sales_train_evaluation.csv')
cal = pd.read_csv(f'{DATA_PATH}external/calendar.csv')
prices = pd.read_csv(f'{DATA_PATH}external/sell_prices.csv')

subset data

In [6]:
sampled_ids = np.load(f'{DATA_PATH}interim/sampled_ids.npy').tolist()

ste = ste.set_index('id').loc[sampled_ids].reset_index()
train_df = ste

In [7]:
ste

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,FOODS_3_180_CA_1_evaluation,FOODS_3_180,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,HOUSEHOLD_2_383_CA_3_evaluation,HOUSEHOLD_2_383,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,2,0,2,0,...,0,0,0,0,0,0,0,0,0,2
2,FOODS_3_409_CA_3_evaluation,FOODS_3_409,FOODS_3,FOODS,CA_3,CA,0,0,0,0,...,0,0,0,0,0,1,0,2,1,0
3,FOODS_1_097_CA_2_evaluation,FOODS_1_097,FOODS_1,FOODS,CA_2,CA,0,0,0,0,...,0,3,0,0,0,0,0,3,3,4
4,HOBBIES_1_272_TX_2_evaluation,HOBBIES_1_272,HOBBIES_1,HOBBIES,TX_2,TX,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,HOUSEHOLD_1_505_TX_2_evaluation,HOUSEHOLD_1_505,HOUSEHOLD_1,HOUSEHOLD,TX_2,TX,0,0,0,0,...,2,0,2,4,0,1,0,3,0,3
301,HOBBIES_1_120_WI_3_evaluation,HOBBIES_1_120,HOBBIES_1,HOBBIES,WI_3,WI,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
302,FOODS_3_075_TX_1_evaluation,FOODS_3_075,FOODS_3,FOODS,TX_1,TX,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
303,HOUSEHOLD_2_057_TX_2_evaluation,HOUSEHOLD_2_057,HOUSEHOLD_2,HOUSEHOLD,TX_2,TX,0,0,0,0,...,0,0,0,3,0,1,0,1,2,0


In [8]:
# sampled_ids = np.load(f'../../data/interim/sampled_ids.npy')
# sampled_ids = [ID + '_evaluation' for ID in sampled_ids]
# train_df = ste.set_index('id').loc[sampled_ids].reset_index()

GP_DP = '../../models/'
GP_NAME = 'GP_305'
DEEPAR_DP = '../../models/'
DEEPAR_NAME = 'deepar_305'

##### DeepAR:

In [9]:
forecasts_folder = f'{DEEPAR_DP}{DEEPAR_NAME}_predictions/'
# os.makedirs(forecasts_folder, exist_ok=True)
deepar_forecasts = load_pickle(f'{forecasts_folder}forecasts.pkl')
forecasts = deepar_forecasts

##### GP:

In [21]:
forecasts_folder = GP_DP + GP_NAME + '/'
# os.makedirs(forecasts_folder, exist_ok=True)

In [22]:
means = np.load(f'{forecasts_folder}means.npy') \
                .transpose()
variances = np.load(f'{forecasts_folder}variances.npy') \
                .transpose()
pmfs = np.load(f'{forecasts_folder}pmfs.npy') \
                .transpose()

In [11]:
# np.argwhere(np.sum(pmfs, axis=-1) != 1).shape
# pmfs.shape

#### Create df_list, a list of num_samples dataframes, where each dataframe corresponds to that of a single sample trace of aggregated forecasts 

#### Extract sample predictions

##### DeepAR

In [10]:
np_predictions = np.array([fcast.samples for fcast in forecasts])

##### GP

In [23]:
norm_const = np.sum(pmfs, axis=2, keepdims=True)
norm_pmfs = pmfs / norm_const

In [24]:
%%time
num_samples = 1000
np_pred_shape = (pmfs.shape[0], num_samples, pmfs.shape[1])
np_predictions = -1 * np.ones(np_pred_shape)
    
for ts_idx in range(np_pred_shape[0]):
    for timestep_idx in range(np_pred_shape[2]):
        # print(ts_idx, timestep_idx)
        np_predictions[ts_idx, :, timestep_idx] = np.random.choice(np.arange(pmfs.shape[2]), size=(num_samples), p=norm_pmfs[ts_idx, timestep_idx, :])

Wall time: 793 ms


In [25]:
np.save(f'{forecasts_folder}np_predictions.npy', np_predictions)

##### All methods:

In [26]:
%%time

num_columns = [f'd_{i}' for i in range(1942-28, 1942)]
df_list = []
for i in range(np_predictions.shape[1]): #np_predictions.shape[1]):
    pred_df = pd.DataFrame(data=np_predictions[:, i, :],
                index=sampled_ids,
                columns=num_columns)
    rollup_matrix_csr, rollup_index = get_rollup(train_df)
    agg_pred_df = pd.DataFrame(data=rollup_matrix_csr * np_predictions[:, i, :],
                               index=rollup_index,
                               columns=num_columns)
    # agg_pred_df = get_series_df(pred_df,
    #                             rollup_matrix_csr,
    #                             rollup_index,num_columns=num_columns,
    #                             prediction=True)
    df_list.append(agg_pred_df)

Wall time: 18.3 s


The above takes 15 seconds for 305 samples. Therefore it will take *at least* 1500 seconds for all the time series (actually nevermind, I thought this because there would be more groups (aggregations) but there wouldn't be more groups *per time series*. I think it might actually take *less than* 1500 seconds!

#### Convert df_list to a numpy array, so that each row (dimension 0) corresponds to all the sample traces of the row of the same index in a df otf agg_pred_df. 
That is, the numpy array should have shape `( len(agg_pred_df), num_samples, prediction_length )`

Call this array `agg_pred_arr`

##### All methods:

In [27]:
%%time
agg_pred_arr = -1 * np.ones( (len(df_list[0]),
                              len(df_list),
                              len(df_list[0].columns))
                            )
# for loop is over items
for idx, i in enumerate(df_list[0].index):
    # list comprehension acts as for loop over samples 
    # i.e. over [0,...,num_samples-1]
    # samples is an array with all the samples for all days
    # for a given item. it has shape (num_samples, prediction_length)
    samples = np.array([df_list[j].loc[i].to_numpy() for j in range(len(df_list))])
    agg_pred_arr[idx, :, :] = samples

Wall time: 2min 26s


In [28]:
agg_pred_arr.shape

(1047, 1000, 28)

#### Extract desired quantities

##### Quantile forecasts
Want to put the quantile forecasts in an array where first dimension corresponds to a quantile, the rest correspond to the quantile forecasts for each item `(num_items, prediction_length)`

In [29]:
%%time
quantiles = [.005, .025, .165, .25, .5, .75, .835, .975, .995]
# df_quantiles = np.array([np.quantile(agg_pred_arr, q, axis=1) for q in quantiles])
# note that as of numpy 1.22, the interpolation argument of np.quantile is deprecated
df_quantiles = np.array([np.quantile(agg_pred_arr, q, axis=1, interpolation='lower') for q in quantiles])
df_quantiles = df_quantiles.reshape((-1, 28))

Wall time: 4.3 s


In [30]:
import itertools

In [31]:
%%time
f_index = list(itertools.product(quantiles, df_list[0].index.to_list()))
# f stands for "final"
f_index = [(q, x, y) for q in quantiles for (x, y) in df_list[0].index.to_list()]
f_index_names = ['Quantile', 'Level', 'id']
f_index = pd.MultiIndex.from_tuples(f_index, names=f_index_names)

Wall time: 9 ms


In [32]:
forecasts_folder

'../../models/GP_305/'

In [33]:
quantile_forecasts = pd.DataFrame(data=df_quantiles,
                                  index=f_index,
                                  columns=[f'd_{i}' for i in range(1914, 1942)])

In [34]:
quantile_forecasts.to_pickle(f'{forecasts_folder}quantile_forecasts.pkl')

In [35]:
quantile_forecasts.loc[0.995]

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
Level,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,Total,520.0,516.0,521.0,517.0,517.0,529.0,501.0,512.0,543.0,552.0,...,546.0,553.0,556.0,552.0,534.0,554.0,551.0,558.0,580.0,553.0
2,CA,296.0,296.0,293.0,315.0,301.0,273.0,289.0,281.0,293.0,303.0,...,308.0,301.0,325.0,328.0,302.0,309.0,293.0,327.0,328.0,321.0
2,TX,234.0,230.0,253.0,221.0,230.0,261.0,223.0,218.0,248.0,258.0,...,267.0,264.0,258.0,270.0,216.0,304.0,283.0,272.0,246.0,238.0
2,WI,136.0,133.0,134.0,140.0,139.0,134.0,132.0,135.0,131.0,134.0,...,148.0,134.0,140.0,136.0,146.0,147.0,159.0,133.0,152.0,162.0
3,CA_1,53.0,47.0,46.0,46.0,46.0,45.0,47.0,54.0,55.0,50.0,...,53.0,57.0,59.0,53.0,63.0,56.0,55.0,55.0,54.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,HOUSEHOLD_2_424_TX_2,4.0,4.0,5.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,...,5.0,4.0,4.0,6.0,4.0,4.0,4.0,4.0,5.0,4.0
12,HOUSEHOLD_2_469_TX_2,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,...,4.0,5.0,4.0,5.0,4.0,4.0,4.0,4.0,5.0,4.0
12,HOUSEHOLD_2_469_TX_3,5.0,4.0,4.0,4.0,4.0,6.0,4.0,5.0,5.0,4.0,...,5.0,5.0,5.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0
12,HOUSEHOLD_2_485_TX_1,5.0,6.0,5.0,4.0,6.0,6.0,6.0,5.0,6.0,5.0,...,7.0,6.0,8.0,6.0,8.0,7.0,7.0,7.0,6.0,7.0
