In [1]:
# Reload imported modules automatically before each cell runs, so edits to local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium` and `geckodriver`.
* You also need to set up an AWS account and install `awscli`, `s3fs` and `sagemaker-python-sdk`.

In [2]:
import itertools
import json
import os
import sklearn
import time
import warnings
import boto3
import category_encoders
import s3fs
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource, HoverTool, Span
from bokeh.plotting import figure, show
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.metrics import (
    mean_absolute_error, 
    mean_absolute_percentage_error,
    mean_squared_error, 
)
from category_encoders import HelmertEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
from utils.evaluation import WRMSSEEvaluator

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG; the random ID sampling further down relies on it.
np.random.seed(42)
# Render Bokeh figures inline in the notebook.
output_notebook()
# Record the library versions this run was produced with.
print(f"<VERSION>\ncategory_encoders: {category_encoders.__version__}, sagemaker: {sagemaker.__version__}, sklearn: {sklearn.__version__}")

<VERSION>
category_encoders: 2.2.2, sagemaker: 2.50.0, sklearn: 0.24.1


In [3]:
def write_dicts_to_file(path, data):
    """Write an iterable of JSON-serialisable objects to ``path`` as JSON Lines.

    Every item of ``data`` is encoded with ``json.dumps`` and written as one
    UTF-8 line terminated by a newline. An existing file is overwritten.
    """
    with open(path, "wb") as sink:
        sink.writelines(
            (json.dumps(record) + "\n").encode("utf-8") for record in data
        )

#### Data Loading from Local Directory
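The Kaggle dataset was saved in the local directory `~/data/mofc-demand-forecast` in advance. The next cell reads it through the relative path `../../data/mofc-demand-forecast` (`RAW_DATA_PATH`), so adjust that constant if your directory layout differs.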

In [4]:
# Directory holding the raw Kaggle CSV files (relative to the working directory).
RAW_DATA_PATH = "../../data/mofc-demand-forecast"
# Directory where the generated JSON Lines datasets are written.
PROC_DATA_PATH = "proc_data"

calendar = pd.read_csv(os.path.join(RAW_DATA_PATH, "calendar.csv"))
selling_prices = pd.read_csv(os.path.join(RAW_DATA_PATH, "sell_prices.csv"))
# Not loaded: the evaluation file below is the one used throughout the notebook.
# df_train_valid = pd.read_csv(os.path.join(RAW_DATA_PATH, "sales_train_validation.csv"))
df_train_eval = pd.read_csv(os.path.join(RAW_DATA_PATH, "sales_train_evaluation.csv"))
# Not loaded: the submission frame is built from scratch at the end of the notebook.
# sample_submission = pd.read_csv(os.path.join(RAW_DATA_PATH, "sample_submission.csv"))

In [5]:
# Identifier columns of the sales table.
key_names = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
all_ids = df_train_eval["id"].unique()
# Day columns "d_1" ... "d_1941" selected from the sales table.
date_names = ["d_" + str(i) for i in range(1, 1942)]
dates = calendar["date"].unique()
# Forecast horizon; also used as the size of each hold-out window.
test_steps = 28

# Every (id, date) combination, with ids varying slowest. Built directly in pandas
# so that no Python tuple has to be materialised per pair (as itertools.product
# followed by pd.DataFrame would do).
key_pairs = pd.MultiIndex.from_product(
    [all_ids, dates], names=["id", "date"]
).to_frame(index=False)

# Fraction of series used for hyperparameter tuning (1.0 keeps all of them).
sample_ratio = 0.1

if sample_ratio >= 1.0:
    # Same container type as the sampled branch, so downstream code sees a list.
    sampled_ids = all_ids.tolist()
else:
    sampled_ids = np.random.choice(
        all_ids, round(sample_ratio * len(all_ids)), replace=False
    ).tolist()

print(
    f"{len(sampled_ids)} out of {len(all_ids)} IDs were randomly selected."
)

3049 out of 30490 IDs were randomly selected.


# Data Preprocessing

In [6]:
# Lookup from day label ("d_1", ...) to calendar date.
date_dict = dict(zip(calendar["d"], calendar["date"]))

# One row per day, one column per series id; the index holds the calendar date.
target = df_train_eval.set_index("id")[date_names].T
target.index = pd.Index(
    [date_dict.get(day_label, day_label) for day_label in target.index],
    name="date",
)
# Plain (unnamed) column index.
target.columns = target.columns.tolist()

In [7]:
feature_names = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
# Days without an event get the explicit category "NA".
events = calendar[["date"] + feature_names].fillna("NA")
# Fit the encoder only on rows before the last 3 * test_steps calendar dates.
# NOTE(review): the comparison is on date strings; this assumes ISO "YYYY-MM-DD" values.
train = events[events["date"] < dates[-3 * test_steps]][feature_names]

encoder = HelmertEncoder(drop_invariant=True)
_ = encoder.fit(train)
encoded = encoder.transform(events[feature_names])
events = pd.concat([events[["date"]], encoded], axis=1)

# .copy() makes this an independent frame, so adding "day" below is a plain
# column assignment rather than a write to a slice of `calendar`.
time_related = calendar[["date", "wday", "month"]].copy()
# Day of month taken from the third "-"-separated field of the date string.
time_related["day"] = time_related["date"].map(lambda x: int(x.split("-")[2]))

feat_dynamic_cat = events.merge(time_related).set_index("date")

# Rescale every calendar feature to [0, 1]; the scaler is fitted on all calendar rows.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(feat_dynamic_cat)
feat_dynamic_cat = pd.DataFrame(
    scaled, columns=feat_dynamic_cat.columns, index=feat_dynamic_cat.index
)
n_feat_dynamic_cat = feat_dynamic_cat.shape[1]

In [8]:
# Weekly selling price per series id.
prices = (
    df_train_eval[["id", "store_id", "item_id"]]
    .merge(selling_prices, how="left")
    .drop(["store_id", "item_id"], axis=1)
)
# Expand weekly prices to one row per calendar date.
week_to_date = calendar[["date", "wm_yr_wk"]].drop_duplicates()
prices = week_to_date.merge(prices, how="left").drop(
    ["wm_yr_wk"], axis=1
)

# Fit the price scaler only on rows before the last 3 * test_steps calendar dates.
scaler = MinMaxScaler()
train = prices[prices["date"] < dates[-3 * test_steps]][["sell_price"]]

_ = scaler.fit(train)
prices["sell_price"] = scaler.transform(prices[["sell_price"]])
prices = prices.pivot(index="date", columns="id", values="sell_price")
# Fill missing prices with the next available one.
# (`.bfill()` replaces the deprecated `fillna(method="bfill")`.)
prices = prices.bfill()

# SNAP flag per state, reshaped to long form and then to one column per series id.
snap = calendar[["date", "snap_CA", "snap_TX", "snap_WI"]].rename(
    columns={"snap_CA": "CA", "snap_TX": "TX", "snap_WI": "WI"}
)
snap = pd.melt(
    snap,
    id_vars="date",
    value_vars=["CA", "TX", "WI"],
    var_name="state_id",
    value_name="snap",
)
snap = key_pairs.merge(df_train_eval[["id", "state_id"]], how="left").merge(
    snap, on=["date", "state_id"], how="left"
)
snap = snap.pivot(index="date", columns="id", values="snap")

# Price and SNAP columns side by side; each id therefore appears once per feature.
feat_dynamic_real = pd.concat([prices, snap], axis=1)
# Number of real-valued dynamic features per series.
n_feat_dynamic_real = feat_dynamic_real.shape[1] // target.shape[1]

In [9]:
feature_names = ["dept_id", "cat_id", "store_id", "state_id"]
# .copy() so the encoded columns are written to an independent frame instead of
# a slice of `df_train_eval`.
feat_static_cat = df_train_eval[["id"] + feature_names].copy()

# The encoder is fitted on the sampled ids only; categories it has not seen are
# mapped to -1 when the full table is transformed.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
_ = encoder.fit(feat_static_cat.set_index("id").loc[sampled_ids][feature_names])
feat_static_cat[feature_names] = encoder.transform(feat_static_cat[feature_names])
feat_static_cat[feature_names] = feat_static_cat[feature_names].astype(int)
# One column per series id, one row per static feature.
feat_static_cat = feat_static_cat.set_index("id").T

# Number of categories learned for each static feature.
cardinality = [len(category) for category in encoder.categories_]

In [10]:
%%time
def split_into_n_array(x, n):
    """Flatten ``x`` column by column and cut the result into ``n`` equal lists.

    ``x`` is transposed and raveled, then split with ``np.hsplit`` into ``n``
    parts; each part is returned as a plain Python list.
    """
    flattened = x.T.ravel()
    return [chunk.tolist() for chunk in np.hsplit(flattened, n)]


def make_and_write_dateset(
    dataset_name,
    target,
    feat_dynamic_cat,
    feat_dynamic_real,
    n_feat_dynamic_cat,
    n_feat_dynamic_real,
    all_ids,
    ts_lengths,
):
    """Build one JSON Lines dataset and write it under ``PROC_DATA_PATH``.

    One record is written per id with the keys ``target``, ``start``,
    ``dynamic_feat`` and ``cat``. The output file is
    ``<PROC_DATA_PATH>/<dataset_name>/<dataset_name>.json``.

    Parameters
    ----------
    dataset_name : str
        Name of the sub-directory and of the JSON file.
    target : DataFrame
        Target values, one column per id; its first index value becomes ``start``.
    feat_dynamic_cat : DataFrame
        Dynamic features shared by every series.
    feat_dynamic_real : DataFrame
        Per-series dynamic features, selected with ``feat_dynamic_real[each_id]``.
    n_feat_dynamic_cat, n_feat_dynamic_real : int
        Number of feature arrays to split the respective values into.
    all_ids : iterable
        Ids to write, in output order.
    ts_lengths : sequence of three slice bounds
        End positions (``.iloc[:bound]``) for ``target``, ``feat_dynamic_cat``
        and ``feat_dynamic_real`` respectively; ``None`` keeps everything.

    Notes
    -----
    The static categories are read from the module-level ``feat_static_cat``.
    Records are generated lazily and streamed to disk, so the whole dataset is
    never held in memory at once.
    """
    # Identical for every series: compute once instead of once per id.
    shared_dynamic_feat = split_into_n_array(
        feat_dynamic_cat.iloc[: ts_lengths[1]].values,
        n_feat_dynamic_cat,
    )
    start = target.index[0]

    def iter_records():
        for each_id in all_ids:
            yield {
                "target": target[each_id].iloc[: ts_lengths[0]].values.tolist(),
                "start": start,
                "dynamic_feat": shared_dynamic_feat
                + split_into_n_array(
                    feat_dynamic_real[each_id].iloc[: ts_lengths[2]].values,
                    n_feat_dynamic_real,
                ),
                "cat": feat_static_cat[each_id].values.tolist(),
            }

    os.makedirs(os.path.join(PROC_DATA_PATH, dataset_name), exist_ok=True)
    write_dicts_to_file(
        os.path.join(PROC_DATA_PATH, dataset_name, f"{dataset_name}.json"),
        iter_records(),
    )

# Slice end positions per dataset, in the order
# [target, feat_dynamic_cat, feat_dynamic_real].
ts_lengths_by_dataset = [
    [-2 * test_steps, -3 * test_steps, -3 * test_steps],
    [-test_steps, -2 * test_steps, -2 * test_steps],
    [-test_steps, -test_steps, -test_steps],
]

for dataset_name, ts_lengths in zip(
    ["train", "valid", "test"], ts_lengths_by_dataset
):
    make_and_write_dateset(
        dataset_name=dataset_name,
        target=target,
        feat_dynamic_cat=feat_dynamic_cat,
        feat_dynamic_real=feat_dynamic_real,
        n_feat_dynamic_cat=n_feat_dynamic_cat,
        n_feat_dynamic_real=n_feat_dynamic_real,
        all_ids=sampled_ids,
        ts_lengths=ts_lengths,
    )

CPU times: user 2min 27s, sys: 8.34 s, total: 2min 35s
Wall time: 2min 36s


# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [11]:
# SageMaker session and its default S3 bucket, used for all uploads and job outputs.
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
# Common prefix for S3 keys and tuning-job names.
BASE_JOB_PREFIX = "mofc-demand-forecast"
region = boto3.Session().region_name
# NOTE(review): get_execution_role() presumably requires running with a SageMaker
# execution role attached — confirm for local runs.
role = sagemaker.get_execution_role()

In [12]:
%%time
# Upload the three generated datasets to s3://{BUCKET}/{BASE_JOB_PREFIX}/<name>/.
!aws s3 cp {PROC_DATA_PATH}/train/train.json s3://{BUCKET}/{BASE_JOB_PREFIX}/train/train.json --quiet
!aws s3 cp {PROC_DATA_PATH}/valid/valid.json s3://{BUCKET}/{BASE_JOB_PREFIX}/valid/valid.json --quiet
!aws s3 cp {PROC_DATA_PATH}/test/test.json s3://{BUCKET}/{BASE_JOB_PREFIX}/test/test.json --quiet

CPU times: user 19.9 s, sys: 7.4 s, total: 27.3 s
Wall time: 17min 26s


In [13]:
s3_file_system = s3fs.S3FileSystem()

# Sanity check: print the first 100 characters of the first uploaded training record.
with s3_file_system.open(f"s3://{BUCKET}/{BASE_JOB_PREFIX}/train/train.json", "rb") as fp:
    print(fp.readline().decode("utf-8")[:100] + ", ...")

{"target": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...


## Defining DeepAR Estimator

In [14]:
# Container image of the built-in "forecasting-deepar" algorithm for this region.
training_image_uri = sagemaker.image_uris.retrieve(
    framework="forecasting-deepar",
    region=region,
    version="1",
    py_version="py3",
    instance_type="ml.c4.4xlarge",
)
# S3 location where trained model artifacts are stored.
model_output_uri = f"s3://{BUCKET}/{BASE_JOB_PREFIX}/models"

# NOTE(review): the instance type is repeated here and in the image lookup above;
# keep the two in sync when changing it.
estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
    sagemaker_session=sagemaker_session,
)

# Hyperparameters held fixed during tuning; the tuned ones are defined separately.
params = {
    "prediction_length": test_steps,
    "time_freq": "D",
    # Category counts of the static features, taken from the fitted OrdinalEncoder.
    "cardinality": cardinality,
    "early_stopping_patience": 10,
    "likelihood": "negative-binomial",
    # Shared calendar features plus per-series real-valued features.
    "num_dynamic_feat": n_feat_dynamic_cat + n_feat_dynamic_real,
    "num_eval_samples": 100,
    "num_layers": 2,
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [15]:
# Search space of the tuning job; context_length is expressed in multiples of the
# forecast horizon.
hyperparameter_ranges = {
    "context_length": IntegerParameter(2 * test_steps, 6 * test_steps, scaling_type="Auto"),
    "epochs": IntegerParameter(10, 500, scaling_type="Auto"),
    "dropout_rate": ContinuousParameter(0.0, 0.2, scaling_type="Auto"),
    "embedding_dimension": IntegerParameter(1, 10, scaling_type="Auto"),
    "learning_rate": ContinuousParameter(1e-4, 1e-2, scaling_type="Auto"),
    "mini_batch_size": IntegerParameter(128, 256, scaling_type="Auto"),
    "num_cells": IntegerParameter(40, 80, scaling_type="Auto"),
}

# Minimise the "test:RMSE" metric over at most 30 training jobs, 3 at a time.
tuner = HyperparameterTuner(
    estimator,
    "test:RMSE",
    hyperparameter_ranges,
    objective_type="Minimize",
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name=f"{BASE_JOB_PREFIX}-deepar-hpo",
    early_stopping_type="Auto",
)

In [16]:
%%time
train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/train/", content_type="json"
)
valid_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/valid/", content_type="json"
)

# The validation dataset is supplied on the "test" channel, which is the channel
# the tuner's "test:RMSE" objective refers to.
tuner.fit({"train": train_input, "test": valid_input})

# Keep the best job's estimator, its hyperparameters and the tuning job name for
# the analysis and evaluation cells.
best_estimator = tuner.best_estimator()
best_params = best_estimator.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Analyzing Hyperparameter Tuning Results

In [17]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

# One row per training job, in chronological order with a fresh 0..n-1 index.
df_viz = (
    tuning_job_analytics.dataframe()
    .sort_values("TrainingStartTime")
    .reset_index(drop=True)
)

# Directory for exported figures.
IMAGE_PATH = "img"
os.makedirs(IMAGE_PATH, exist_ok=True)

In [18]:
# The ten training jobs with the lowest objective value.
df_viz.sort_values("FinalObjectiveValue").head(10)

Unnamed: 0,context_length,dropout_rate,embedding_dimension,epochs,learning_rate,mini_batch_size,num_cells,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
18,163.0,0.091486,4.0,14.0,0.002676,169.0,58.0,mofc-demand-forecast-210829-2356-019-618ed625,Completed,1.956622,2021-08-30 02:00:11+09:00,2021-08-30 02:11:49+09:00,698.0
12,165.0,0.197889,6.0,95.0,0.001308,176.0,52.0,mofc-demand-forecast-210829-2356-013-2660797d,Completed,1.968297,2021-08-30 01:22:25+09:00,2021-08-30 01:44:59+09:00,1354.0
19,164.0,0.091486,4.0,14.0,0.002676,167.0,58.0,mofc-demand-forecast-210829-2356-020-19bb4cc7,Completed,1.972213,2021-08-30 02:10:53+09:00,2021-08-30 02:22:23+09:00,690.0
17,142.0,0.129066,1.0,10.0,0.008069,157.0,69.0,mofc-demand-forecast-210829-2356-018-5dd271af,Completed,1.982282,2021-08-30 01:48:00+09:00,2021-08-30 01:57:11+09:00,551.0
14,143.0,0.069878,2.0,43.0,0.002256,162.0,46.0,mofc-demand-forecast-210829-2356-015-46c26a63,Completed,1.989344,2021-08-30 01:29:34+09:00,2021-08-30 01:43:01+09:00,807.0
23,168.0,0.119025,1.0,16.0,0.00835,149.0,41.0,mofc-demand-forecast-210829-2356-024-90e1d546,Completed,1.99191,2021-08-30 02:25:14+09:00,2021-08-30 02:36:31+09:00,677.0
7,136.0,0.1909,6.0,341.0,0.000398,128.0,42.0,mofc-demand-forecast-210829-2356-008-2849299a,Completed,2.015667,2021-08-30 00:44:05+09:00,2021-08-30 01:00:57+09:00,1012.0
21,163.0,0.133668,3.0,10.0,0.005902,135.0,56.0,mofc-demand-forecast-210829-2356-022-6a1b5d64,Completed,2.025702,2021-08-30 02:14:37+09:00,2021-08-30 02:23:36+09:00,539.0
27,162.0,0.14549,2.0,41.0,0.004967,129.0,47.0,mofc-demand-forecast-210829-2356-028-9ca17a38,Completed,2.026298,2021-08-30 02:44:50+09:00,2021-08-30 03:02:53+09:00,1083.0
28,163.0,0.089519,2.0,10.0,0.004999,165.0,55.0,mofc-demand-forecast-210829-2356-029-07815903,Completed,2.03117,2021-08-30 02:57:11+09:00,2021-08-30 03:06:32+09:00,561.0


In [19]:
class HoverHelper:
    """Builds Bokeh tool lists whose hover tooltips show tuning-job fields."""

    def __init__(self, tuning_job_analytics):
        """Store the analytics object whose ``tuning_ranges`` drive the tooltips."""
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        """Return a HoverTool listing job name, objective and each tuned parameter."""
        fixed_fields = [
            ("TrainingJobName", "@TrainingJobName"),
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
        ]
        tuned_fields = [
            (name, f"@{name}")
            for name in self.tuning_job_analytics.tuning_ranges.keys()
        ]
        return HoverTool(tooltips=fixed_fields + tuned_fields)

    def tools(
        self,
        standard_tools="pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset",
    ):
        """Return ``[hover tool, standard_tools]`` for a figure's ``tools`` argument."""
        hover = self.hovertool()
        return [hover, standard_tools]


def make_grid(figures, n_cols):
    """Arrange ``figures`` into a list of rows with at most ``n_cols`` items each.

    Rows are filled left to right; the last row may be shorter.
    """
    grid = []
    total = len(figures)
    for position, item in enumerate(figures):
        starts_row = position % n_cols == 0
        if starts_row:
            current_row = []
        current_row.append(item)
        ends_row = position % n_cols == n_cols - 1
        if ends_row or position == total - 1:
            grid.append(current_row)
    return grid

In [20]:
hover_helper = HoverHelper(tuning_job_analytics)

# Objective value of each training job against its start time.
p = figure(
    plot_width=800,
    plot_height=400,
    tools=hover_helper.tools(),
    title="Convergence Plot",
    x_axis_type="datetime",
    x_axis_label="Training Start Time",
    y_axis_label="RMSE",
)
_ = p.line(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    color="coral",
    line_width=1.5,
    source=df_viz,
)
_ = p.circle(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    line_color="coral",
    line_width=1.5,
    fill_color="white",
    source=df_viz,
)

p.xgrid.grid_line_color = None
p.title.align = "center"
p.title.text_font_size = "12pt"

show(p)

# Switch to the SVG backend only after showing, then export the figure to disk.
p.output_backend = "svg"
_ = export_svgs(p, filename=f"{IMAGE_PATH}/convergence_plot.svg")

In [21]:
def _is_number(value):
    """Return True when ``value`` can be converted to a float."""
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True


# Min-max scale the chronological job position to [0, 1]; it is used as the
# marker opacity below, so later jobs are drawn darker.
df_viz = df_viz.reset_index()
index_min = df_viz["index"].min()
index_max = df_viz["index"].max()
df_viz["index"] = (df_viz["index"] - index_min) / (index_max - index_min)

# One scatter plot of objective value against each tuned hyperparameter.
figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    # Ranges carrying a "Values" entry are treated as categorical.
    if param_range.get("Values"):
        values = param_range["Values"]
        if all(_is_number(x) for x in values):
            print(
                f"Hyperparameter {param_name} is tuned as categorical, but all values are numeric."
            )
        else:
            categorical_args["x_range"] = values

    p = figure(
        plot_width=400,
        plot_height=400,
        tools=hover_helper.tools(),
        x_axis_label=param_name,
        y_axis_label="RMSE",
        **categorical_args,
    )
    p.circle(
        source=df_viz,
        x=param_name,
        y="FinalObjectiveValue",
        color="black",
        alpha="index",
    )
    p.xgrid.grid_line_color = None
    figures.append(p)

grid_plot = gridplot(make_grid(figures, 3), toolbar_location="right")

show(grid_plot)

_ = export_png(grid_plot, filename=f"{IMAGE_PATH}/partial_dependence_plot.png")

# Model Evaluation
## Defining Transformer and Prediction

In [22]:
%%capture
# Batch-transform job built from the best tuning job's model.
transformer = best_estimator.transformer(
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    strategy="SingleRecord",
    assemble_with="Line",
    # Quantiles requested in the output, in addition to the "mean" read later.
    env={
        "DEEPAR_INFERENCE_CONFIG": '{"quantiles": ["0.05", "0.25", "0.5", "0.75", "0.95"]}'
    },
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/pred",
)

# Input is split line by line, i.e. one request per JSON record.
_ = transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/test/",
    split_type="Line",
)

#### Downloading Prediction Scores to Local Directory

In [23]:
s3_client = boto3.client("s3")

# Fetch the batch-transform output next to the local test dataset.
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/pred/test.json.out",
    os.path.join(PROC_DATA_PATH, "test", "test.json.out"),
)

# Parse one forecast per line. json.loads is used instead of eval: the file is
# downloaded data, and eval would execute it as Python code.
with open(os.path.join(PROC_DATA_PATH, "test", "test.json.out"), "r") as file:
    forecasts = [json.loads(line) for line in file if line.strip()]

## Measuring Predictive Performance

In [24]:
# Sales rows of the sampled ids, in the order of `sampled_ids`.
sampled_df_train_eval = df_train_eval.set_index("id").loc[sampled_ids].reset_index()

train_columns = key_names + date_names[:-test_steps]
test_columns = date_names[-test_steps:]

# Everything before the last `test_steps` days (with the key columns) versus the
# last `test_steps` days only.
sampled_df_train = sampled_df_train_eval[train_columns]
sampled_df_test = sampled_df_train_eval[test_columns]

# Mean forecast per series, labelled with the same day columns as the test frame.
predictions = [forecast["mean"] for forecast in forecasts]
sampled_df_pred = pd.DataFrame(predictions, columns=sampled_df_test.columns)

In [25]:
# Actual values per series, aligned row by row with `predictions`.
y_true_rows = sampled_df_test.values.tolist()

# Mean of the per-series MAE.
mae = np.mean(
    [
        mean_absolute_error(y_true, y_pred)
        for y_true, y_pred in zip(y_true_rows, predictions)
    ]
)
# Square root of the mean per-series MSE.
rmse = np.sqrt(
    np.mean(
        [
            mean_squared_error(y_true, y_pred)
            for y_true, y_pred in zip(y_true_rows, predictions)
        ]
    )
)
# Mean of the per-series MAPE. This is computed with
# mean_absolute_percentage_error, so it is labelled MAPE (not MASE).
# NOTE(review): this value can become extremely large when the actual values
# contain zeros; interpret it with care.
mape = np.mean(
    [
        mean_absolute_percentage_error(y_true, y_pred)
        for y_true, y_pred in zip(y_true_rows, predictions)
    ]
)

agg_metrics = {"MAE": mae, "RMSE": rmse, "MAPE": mape}

In [26]:
# One "NAME: value" line per metric that is not NaN. Joining with newlines avoids
# trimming a trailing separator, which previously cut off the last digit as well.
string = "\n".join(
    f"{key}: {value:.4f}"
    for key, value in agg_metrics.items()
    if not np.isnan(value)
)

print(string)

MAE: 1.0167
RMSE: 2.1304
MASE: 1435140187378558.000


In [27]:
# Score the mean forecasts with the project's WRMSSE evaluator.
wrmsse_evaluator = WRMSSEEvaluator(
    sampled_df_train, sampled_df_test, calendar, selling_prices, test_steps
)
wrmsse = wrmsse_evaluator.score(sampled_df_pred)

# ".6f" = six decimals (the former " 6f" was a sign/width spec that only added
# a leading space).
print(f"DeepAR WRMSSE: {wrmsse:.6f}")

  0%|          | 0/12 [00:00<?, ?it/s]

DeepAR WRMSSE:  0.777567


In [28]:
def plot_forecast(source, test_steps, plot_id=None, model_name=None, start_date=None):
    """Plot actual sales, the forecast line and its 50 % / 90 % bands.

    Parameters
    ----------
    source : DataFrame
        Must provide the columns ``time`` (datetime), ``y``, ``fcst``,
        ``fcst_lower_05``, ``fcst_lower_25``, ``fcst_upper_75`` and
        ``fcst_upper_95``.
    test_steps : int
        Number of trailing rows that form the forecast window; a vertical
        dotted rule is drawn at the first of them.
    plot_id : str, optional
        Series id appended to the title.
    model_name : str, optional
        Model name prepended to the title.
    start_date : optional
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    The configured Bokeh figure.
    """
    title = "Sales Forecast"
    if plot_id is not None:
        title += f" for '{plot_id}'"
    if model_name is not None:
        title = f"{model_name}: " + title

    p = figure(
        plot_width=1000,
        plot_height=300,
        tools="pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset",
        tooltips=[
            ("y", "@y"),
            ("fcst", "@fcst"),
        ],
        title=title,
        x_axis_label="Date",
        y_axis_label="Sales",
        x_axis_type="datetime",
    )
    # Actual values as dots, forecast as a line.
    _ = p.circle(x="time", y="y", size=1.5, color="#000000", source=source)

    _ = p.line(
        x="time",
        y="fcst",
        color="#4267B2",
        line_width=1.0,
        source=source,
    )

    # Both bands read the same columns container.
    band_source = ColumnDataSource(source)

    band_90 = Band(
        base="time",
        lower="fcst_lower_05",
        upper="fcst_upper_95",
        level="underlay",
        fill_color="#4267B2",
        fill_alpha=0.25,
        source=band_source,
    )
    p.add_layout(band_90)

    band_50 = Band(
        base="time",
        lower="fcst_lower_25",
        upper="fcst_upper_75",
        level="underlay",
        fill_color="#4267B2",
        fill_alpha=0.5,
        source=band_source,
    )
    p.add_layout(band_50)

    # Position of the rule in milliseconds since the epoch. Timestamp.timestamp()
    # is used so the result does not depend on the machine's local timezone
    # (time.mktime interprets the value as local time).
    forecast_start = source["time"].iloc[-test_steps]
    loc = forecast_start.timestamp() * 1000
    rule = Span(
        location=loc,
        dimension="height",
        line_color="red",
        line_dash="dotted",
        line_width=1.0,
    )
    p.add_layout(rule)

    p.title.align = "center"
    p.title.text_font_size = "10pt"

    return p

In [29]:
# Positions (within `sampled_ids` and `forecasts`) of the series to plot.
plot_indices = [1, 6, 10]
plots = []

for plot_index in plot_indices:
    plot_id = sampled_ids[plot_index]

    # Actual sales of the series as a (time, y) frame.
    y = target[[plot_id]].reset_index()
    y.columns = ["time", "y"]
    y["time"] = pd.to_datetime(y["time"])

    # Rows: mean, then the 0.05 / 0.25 / 0.75 / 0.95 quantiles; columns: the
    # forecast dates. Transposed so that each date becomes one row.
    forecast = pd.DataFrame(
        [forecasts[plot_index]["mean"]]
        + [
            forecasts[plot_index]["quantiles"][str(p)] for p in [0.05, 0.25, 0.75, 0.95]
        ],
        columns=dates[-2 * test_steps : -test_steps],
    ).T.reset_index()
    # Names follow the row order used above.
    forecast.columns = [
        "time",
        "fcst",
        "fcst_lower_05",
        "fcst_lower_25",
        "fcst_upper_75",
        "fcst_upper_95",
    ]

    forecast["time"] = pd.to_datetime(forecast["time"])
    # Left merge keeps all actuals; forecast columns are NaN outside the forecast dates.
    source = y.merge(forecast, how="left")
    # Only the most recent history is shown.
    source = source[source["time"] >= "2015-05-23"]

    p = plot_forecast(
        source,
        test_steps,
        plot_id=plot_id,
        model_name="DeepAR",
        start_date="2015-05-23",
    )

    plots.append(p)

# One plot per row.
grid_plot = gridplot(make_grid(plots, 1), toolbar_location="right")

show(grid_plot)

_ = export_png(grid_plot, filename=f"{IMAGE_PATH}/sales_forecast_plots.png")

# Model Re-training
### Data Loading, Preprocessing and Uploading

In [30]:
feature_names = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
events = calendar[["date"] + feature_names].fillna("NA")
# For re-training, the encoder is fitted on every row before the last
# `test_steps` calendar dates.
train = events[events["date"] < dates[-test_steps]][feature_names]

encoder = HelmertEncoder(drop_invariant=True)
_ = encoder.fit(train)
encoded = encoder.transform(events[feature_names])
events = pd.concat([events[["date"]], encoded], axis=1)

# `time_related` (date, wday, month, day) comes from the earlier preprocessing cell.
feat_dynamic_cat = events.merge(time_related).set_index("date")

scaler = MinMaxScaler()
scaled = scaler.fit_transform(feat_dynamic_cat)
feat_dynamic_cat = pd.DataFrame(
    scaled, columns=feat_dynamic_cat.columns, index=feat_dynamic_cat.index
)

feature_names = ["dept_id", "cat_id", "store_id", "state_id"]
# .copy() so the encoded columns are written to an independent frame instead of
# a slice of `df_train_eval`.
feat_static_cat = df_train_eval[["id"] + feature_names].copy()

# This time the encoder is fitted on all ids.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
feat_static_cat[feature_names] = encoder.fit_transform(feat_static_cat[feature_names])
feat_static_cat[feature_names] = feat_static_cat[feature_names].astype(int)
feat_static_cat = feat_static_cat.set_index("id").T

# Number of categories learned for each static feature.
cardinality = [len(category) for category in encoder.categories_]

In [31]:
# Weekly selling price per series id, expanded to one row per calendar date.
prices = (
    df_train_eval[["id", "store_id", "item_id"]]
    .merge(selling_prices, how="left")
    .drop(["store_id", "item_id"], axis=1)
)
week_to_date = calendar[["date", "wm_yr_wk"]].drop_duplicates()
prices = week_to_date.merge(prices, how="left").drop(
    ["wm_yr_wk"], axis=1
)

# For re-training, the scaler is fitted on every row before the last
# `test_steps` calendar dates.
scaler = MinMaxScaler()
train = prices[prices["date"] < dates[-test_steps]][["sell_price"]]

_ = scaler.fit(train)
prices["sell_price"] = scaler.transform(prices[["sell_price"]])
prices = prices.pivot(index="date", columns="id", values="sell_price")
# Fill missing prices with the next available one.
# (`.bfill()` replaces the deprecated `fillna(method="bfill")`.)
prices = prices.bfill()

# `snap` is reused unchanged from the earlier preprocessing cell.
feat_dynamic_real = pd.concat([prices, snap], axis=1)

In [32]:
%%time
# Slice end positions per dataset, in the order
# [target, feat_dynamic_cat, feat_dynamic_real]; None keeps the full length.
ts_lengths_by_dataset = [
    [None, -test_steps, -test_steps],
    [None, None, None],
]

for dataset_name, ts_lengths in zip(
    ["re_train", "re_test"], ts_lengths_by_dataset
):
    make_and_write_dateset(
        dataset_name=dataset_name,
        target=target,
        feat_dynamic_cat=feat_dynamic_cat,
        feat_dynamic_real=feat_dynamic_real,
        n_feat_dynamic_cat=n_feat_dynamic_cat,
        n_feat_dynamic_real=n_feat_dynamic_real,
        all_ids=all_ids,
        ts_lengths=ts_lengths,
    )

CPU times: user 25min 38s, sys: 17min 47s, total: 43min 25s
Wall time: 48min 12s


In [33]:
%%time
# Upload the full-size re-training and prediction datasets to S3.
!aws s3 cp {PROC_DATA_PATH}/re_train/re_train.json s3://{BUCKET}/{BASE_JOB_PREFIX}/re_train/re_train.json --quiet
!aws s3 cp {PROC_DATA_PATH}/re_test/re_test.json s3://{BUCKET}/{BASE_JOB_PREFIX}/re_test/re_test.json --quiet

CPU times: user 2min 41s, sys: 1min, total: 3min 42s
Wall time: 1h 43min 15s


## Defining and Fitting Estimator

In [34]:
# Estimator for re-training on all series, using the same image and output
# location as the tuning estimator.
full_estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.p3.16xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
    # Same session as the tuning estimator, for consistency.
    sagemaker_session=sagemaker_session,
)

# Start from the fixed hyperparameters and overlay the best tuned values.
best_params = best_estimator.hyperparameters()
# Tuner bookkeeping entry, not an algorithm hyperparameter; tolerate its absence.
_ = best_params.pop("_tuning_objective_metric", None)
params.update(best_params)
# The static features were re-encoded on all ids for re-training, so use the
# cardinality computed from that encoder rather than the value carried over
# from the tuning job.
params["cardinality"] = cardinality
full_estimator.set_hyperparameters(**params)

In [35]:
%%capture
re_train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_train/", content_type="json"
)

# Only a "train" channel is supplied for the final fit (no "test" channel).
full_estimator.fit({"train": re_train_input})

## Defining Transformer and Prediction

In [36]:
%%capture
# Batch-transform job built from the re-trained model. No DEEPAR_INFERENCE_CONFIG
# is set here; only the "mean" field of the output is read afterwards.
transformer = full_estimator.transformer(
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    strategy="SingleRecord",
    assemble_with="Line",
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_pred",
)

# Input is split line by line, i.e. one request per JSON record.
_ = transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_test/",
    split_type="Line",
)

#### Downloading Prediction Scores to Local Directory

In [37]:
s3_client = boto3.client("s3")

# Fetch the batch-transform output next to the local re_test dataset.
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/re_pred/re_test.json.out",
    os.path.join(PROC_DATA_PATH, "re_test", "re_test.json.out"),
)

# Parse one forecast per line. json.loads is used instead of eval: the file is
# downloaded data, and eval would execute it as Python code.
with open(os.path.join(PROC_DATA_PATH, "re_test", "re_test.json.out"), "r") as file:
    forecasts = [json.loads(line) for line in file if line.strip()]

In [38]:
# Submission columns F1 ... F<test_steps>.
column_names = ["F" + str(i) for i in range(1, test_steps + 1)]

# "validation" rows: the last `test_steps` observed days of every series.
# .copy() so the edits below go to an independent frame, not a slice of `df_train_eval`.
valid_submission = df_train_eval.loc[
    :, ["id"] + date_names[-test_steps:]
].copy()
valid_submission.columns = ["id"] + column_names
# Literal (non-regex) replacement of the id suffix.
valid_submission["id"] = valid_submission["id"].str.replace(
    "evaluation", "validation", regex=False
)

# "evaluation" rows: mean forecasts, assumed to be in the same order as `all_ids`
# (the order in which the re_test records were written).
eval_submission = pd.DataFrame(
    [forecast["mean"] for forecast in forecasts],
    columns=column_names,
    index=all_ids,
).reset_index()
eval_submission.columns = ["id"] + eval_submission.columns[1:].tolist()

submission = pd.concat([valid_submission, eval_submission])
submission.to_csv("submission.csv", index=False)