In [None]:
import gc
import pandas as pd
import numpy as np
import scipy.stats as stats
import math
import plotly.graph_objects as go
import random
import os
import copy
import matplotlib.pyplot as plt

from time import time
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F

#!pip install properscoring
import properscoring as ps


#!pip install -U "gluonts[torch]"
from gluonts.dataset.pandas import PandasDataset
from gluonts.dataset.util import to_pandas


import json

# Data loading & data preprocessing

In [None]:
# Load the M4 monthly data.
# Turn it into lists of arrays, where each inner array is one time series,
# and strip the NaN padding from every series.

def _drop_nan(values):
    """Return the non-NaN entries of a 1-D float array (removes the CSV padding)."""
    return values[~np.isnan(values)]

# M4
trainset = pd.read_csv(
    'https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/Train/Monthly-train.csv'
).set_index('V1')
testset = pd.read_csv(
    'https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/Test/Monthly-test.csv'
).set_index('V1')

# Append the test columns to the right of the train columns (rows matched on the series id 'V1'),
# so that after removing the NaN padding each row is the train series followed by its test values.
testset_merge = trainset.merge(testset, on='V1', how='inner')

# All observed (non-NaN) values of every training series
trainset_clean = [_drop_nan(row) for row in trainset.values]

# Train/validation/test --------------------------------- NBeats paper validation strategy
#   test set       : train + test values of every series
#   validation set : the complete training series
#   training set   : the training series without their last 18 observations
testset_m4m = [_drop_nan(row) for row in testset_merge.values]
valset_m4m = list(trainset_clean)
trainset_m4m = [series[:-18] for series in trainset_clean]

# Release the large intermediate frames; only the three *_m4m lists are used from here on.
del trainset, testset, testset_merge, trainset_clean

In [None]:
"""
# For debugging & testing
trainset_m4m = trainset_m4m[0:3]
valset_m4m = valset_m4m[0:3]
testset_m4m = testset_m4m[0:3]
"""

# DeepAR data & estimator config

In [None]:
# All tunable settings of the DeepAR run, gathered in one place.
configDeepAR = {
    # --- run identification ---
    "type": 'test_run',  # 'test_run' or 'validation_run'
    "run_name": '32_context length',

    # --- reproducibility ---
    "rndseed": 1500,  # 2500, 3500,4500,5500,6500,7500,8500,9500,10500

    # --- network architecture ---
    "context_length": 32,
    "hidden_layers": 3,
    "hidden_layer_units": 40,  # 40 instead of 120 because of smaller decoder length!

    # --- optimisation ---
    "learning_rate": 1e-3,
    "batch_size": 32,
    "epochs": 1000,
    "num_batches_per_epoch": 100,

    # --- probabilistic forecast ---
    "sample_paths": 100,
}

In [None]:
from gluonts.dataset.common import ListDataset
from gluonts.torch.model.deepar import DeepAREstimator
from gluonts.torch.distributions.distribution_output import NormalOutput

In [None]:
# Every dataset entry needs a "target" and a "start" field (original note: a start of 0
# doesn't work anymore). The same placeholder start date is used for all series.
start = "01-01-2023"
frequency = '1M'

# Pick the train / evaluation split that belongs to the configured run type.
# An unknown run type fails here with a clear message instead of surfacing later
# as a NameError on `training_data`.
run_type = configDeepAR["type"]
if run_type == 'validation_run':
    training_data = trainset_m4m
    test_data = valset_m4m
elif run_type == 'test_run':
    training_data = valset_m4m
    test_data = testset_m4m
else:
    raise ValueError(
        "configDeepAR['type'] must be 'validation_run' or 'test_run', got "
        + repr(run_type)
    )


def _as_list_dataset(series_list):
    """Wrap a list of 1-D arrays as a GluonTS ListDataset using the shared start and frequency."""
    return ListDataset(
        [{'target': series, 'start': start} for series in series_list],
        freq=frequency
    )


# train dataset
train_ds = _as_list_dataset(training_data)

# test datasets
# Rolling-origin evaluation: fold k (1..12) drops the last 13-k observations of every
# evaluation series, fold 13 keeps the series complete.
test_ds = dict()
for fold in range(1, 13):
    test_ds['test' + str(fold) + '_ds'] = _as_list_dataset(
        [x[:-(13 - fold)] for x in test_data]
    )
# Fold 13 is handled separately because a slice ending at -0 would be empty.
test_ds['test13_ds'] = _as_list_dataset(test_data)

In [None]:
from gluonts.torch.modules.loss import DistributionLoss
# Gather the DeepAR settings first, then build the estimator with a single call.
deepar_kwargs = dict(
    # data layout
    freq=frequency,
    prediction_length=6,  # forecast horizon: six steps ahead
    context_length=configDeepAR["context_length"],
    # network
    num_layers=configDeepAR["hidden_layers"],
    hidden_size=configDeepAR["hidden_layer_units"],
    dropout_rate=0.0,
    distr_output=NormalOutput(),
    scaling=True,
    # optimisation
    batch_size=configDeepAR["batch_size"],
    lr=configDeepAR["learning_rate"],
    patience=20,
    num_batches_per_epoch=configDeepAR["num_batches_per_epoch"],
    trainer_kwargs={"max_epochs": configDeepAR["epochs"]},
)
estimator = DeepAREstimator(**deepar_kwargs)

# Training and evaluation

In [None]:
# Seed every random number generator involved (Python, NumPy, PyTorch) with the configured seed.
seed_value = configDeepAR["rndseed"]

# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel — confirm if hash order matters.
os.environ["PYTHONHASHSEED"] = str(seed_value)

random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)

In [None]:
# Fit the DeepAR estimator on the selected training series; the returned predictor
# is what produces the forecasts for every evaluation fold.
predictor = estimator.train(train_ds)

In [None]:
from gluonts.evaluation import make_evaluation_predictions
from gluonts.model.forecast import SampleForecast

In [None]:
# For each of the 13 evaluation folds, store the full series and the forecast sample paths.
#   ts_dict['ts_list_<fold>']             -> list of 1-D arrays (one per series)
#   forecast_dict['forecast_list_<fold>'] -> list of sample arrays (one per series)
ts_dict = {}
forecast_dict = {}

for fold in range(1, 14):
    print(f"fold: {fold}")
    fold_ds = test_ds[f'test{fold}_ds']

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=fold_ds,
        predictor=predictor,
        num_samples=configDeepAR["sample_paths"],
    )

    ts_list = []
    forecast_list = []

    # Walk both iterators in lock-step: one series and its forecast per step.
    for series, forecast in zip(ts_it, forecast_it):
        ts_list.append(series.values.squeeze())
        forecast_list.append(forecast.samples)

    ts_dict[f'ts_list_{fold}'] = ts_list
    forecast_dict[f'forecast_list_{fold}'] = forecast_list

In [None]:
# Pre-allocate the result arrays for actuals and forecast summaries.
# Axes: (n_series, forecast origin, forecast step) -> 13 origins, 6 steps each.
# np.empty leaves the memory uninitialised; every entry is overwritten when the arrays are filled.
container_shape = (len(test_data), 13, 6)

actuals_np = np.empty(shape=container_shape)
mean_forecasts_np = np.empty(shape=container_shape)
sd_forecasts_np = np.empty(shape=container_shape)
csd_forecasts_np = np.empty(shape=container_shape)

In [None]:
# Fill the containers origin by origin (origins are 1-based, array columns 0-based).
for origin in range(1, 14):
    col = origin - 1

    # Actuals: the last six observations of every series of this fold.
    fold_series = ts_dict[f'ts_list_{origin}']
    actuals_np[:, col, :] = np.array([series[-6:] for series in fold_series])

    # Sample paths of this fold — assumed layout (n_series, n_sample_paths, 6); TODO confirm.
    samples = np.array(forecast_dict[f'forecast_list_{origin}'])

    # Mean and standard deviation across the sample paths, per forecast step.
    mean_forecasts_np[:, col, :] = samples.mean(axis=1)
    sd_forecasts_np[:, col, :] = samples.std(axis=1)
    # Standard deviation of the cumulated forecasts (running sum over the forecast steps).
    csd_forecasts_np[:, col, :] = samples.cumsum(axis=2).std(axis=1)

In [None]:
# Flatten the actuals from (n_series, forigin, forecast_length) into a long table:
# one row per (series, forecast origin), one column per forecast step.
fc_colnames = [str(step) for step in range(1, 7)]

n_items, n_origins, _ = actuals_np.shape
item_ids = np.repeat(np.arange(1, n_items + 1), n_origins)    # 1,1,...,2,2,...
origin_ids = np.tile(np.arange(1, n_origins + 1), n_items)    # 1,2,...,1,2,...

actuals_arr = np.column_stack(
    (item_ids, origin_ids, actuals_np.reshape(n_items * n_origins, -1))
)
actuals_df = pd.DataFrame(actuals_arr, columns=['item_id', 'fc_origin', *fc_colnames])
# Label every row so the blocks stay distinguishable once they are stacked.
actuals_df['type'] = 'actual'

In [None]:
# Flatten the mean forecasts from (n_series, forigin, forecast_length) into a long table:
# one row per (series, forecast origin), one column per forecast step.
fc_colnames = [str(step) for step in range(1, 7)]

n_items, n_origins, _ = mean_forecasts_np.shape
item_ids = np.repeat(np.arange(1, n_items + 1), n_origins)    # 1,1,...,2,2,...
origin_ids = np.tile(np.arange(1, n_origins + 1), n_items)    # 1,2,...,1,2,...

mean_forecasts_arr = np.column_stack(
    (item_ids, origin_ids, mean_forecasts_np.reshape(n_items * n_origins, -1))
)
mean_forecasts_df = pd.DataFrame(mean_forecasts_arr, columns=['item_id', 'fc_origin', *fc_colnames])
# Label every row so the blocks stay distinguishable once they are stacked.
mean_forecasts_df['type'] = 'mean_forecast'

In [None]:
# Flatten the forecast standard deviations from (n_series, forigin, forecast_length)
# into a long table: one row per (series, forecast origin), one column per forecast step.
fc_colnames = [str(step) for step in range(1, 7)]

n_items, n_origins, _ = sd_forecasts_np.shape
item_ids = np.repeat(np.arange(1, n_items + 1), n_origins)    # 1,1,...,2,2,...
origin_ids = np.tile(np.arange(1, n_origins + 1), n_items)    # 1,2,...,1,2,...

sd_forecasts_arr = np.column_stack(
    (item_ids, origin_ids, sd_forecasts_np.reshape(n_items * n_origins, -1))
)
sd_forecasts_df = pd.DataFrame(sd_forecasts_arr, columns=['item_id', 'fc_origin', *fc_colnames])
# Label every row so the blocks stay distinguishable once they are stacked.
sd_forecasts_df['type'] = 'sd_forecast'

In [None]:
# Flatten the cumulative-forecast standard deviations from (n_series, forigin, forecast_length)
# into a long table: one row per (series, forecast origin), one column per forecast step.
fc_colnames = [str(step) for step in range(1, 7)]

n_items, n_origins, _ = csd_forecasts_np.shape
item_ids = np.repeat(np.arange(1, n_items + 1), n_origins)    # 1,1,...,2,2,...
origin_ids = np.tile(np.arange(1, n_origins + 1), n_items)    # 1,2,...,1,2,...

csd_forecasts_arr = np.column_stack(
    (item_ids, origin_ids, csd_forecasts_np.reshape(n_items * n_origins, -1))
)
csd_forecasts_df = pd.DataFrame(csd_forecasts_arr, columns=['item_id', 'fc_origin', *fc_colnames])
# Label every row so the blocks stay distinguishable once they are stacked.
csd_forecasts_df['type'] = 'csd_forecast'

In [None]:
# Stack the actuals and the three forecast summaries row-wise into one long table;
# the 'type' column tells the four blocks apart. The row index is not reset here,
# so index labels repeat across the blocks.
output_df_mm = pd.concat([actuals_df, mean_forecasts_df, sd_forecasts_df, csd_forecasts_df])


In [None]:
# Write the combined table to a CSV file in the working directory (row index omitted).
# NOTE(review): the run name is joined to "gluonts_pytorch_1000j" without a separator and the
# configured run name contains a space — kept as is so existing consumers still find the file.
output_filename = "".join([
    'deepAR_prob_',
    configDeepAR["type"],
    '_',
    configDeepAR["run_name"],
    "gluonts_pytorch_1000j",
    '.csv',
])
output_df_mm.to_csv(output_filename, index=False)