In [None]:
# Import Helper Functions
source("sales_exploration_helpers.R")


In [None]:
library(dplyr)
library(tsibble)
library(fable)
library(ggplot2)
library(tidyr)
library(purrr)
library(feasts)
library(GGally)
library(patchwork)
# Notebook plot rendering options (repr.plot.*): wide, short figures suit
# long daily time series.
options(repr.plot.width = 20, repr.plot.height = 7, repr.plot.res = 100)

# Source Helper Functions
# NOTE(review): get_validation_data(), get_dates() and get_train_data() below
# are presumably defined in these helper files -- confirm.
source("../Baseline/baseline_helpers.R")
source("../Data_Inspection/data_cleaning_helpers.R")

# Clean validation data
validation <- get_validation_data()

# NOTE(review): `dates` is not used in the visible part of this notebook.
dates <- get_dates()

# Training data; later cells use it as a tsibble with columns `product`,
# `day`, `sales` and `sell_price`.
train <- get_train_data()


In [None]:
# Plot the STL decomposition of one product's daily sales.
#
# The decomposition uses a 365-day trend window plus two seasonal components
# (period 7 and period 365) and robust fitting, exactly as specified in the
# STL() call below.
#
# Args:
#   product_id: value of the `product` column to decompose.
#   data: tsibble holding `product` and `sales`; defaults to `train`.
#
# Returns:
#   The plot produced by autoplot() on the decomposition components.
plot_stl_components <- function(product_id, data = train) {
  data |>
    # .env$ guards against a column that happens to be named `product_id`.
    filter(product == .env$product_id) |>
    model(
      STL(
        sales ~ trend(window = 365) +
          season(period = 7) +
          season(period = 365),
        robust = TRUE
      )
    ) |>
    components() |>
    autoplot()
}

# One top-level call per product so the notebook displays each plot.
plot_stl_components("FOODS_3_001")
plot_stl_components("FOODS_3_002")
plot_stl_components("FOODS_3_003")
plot_stl_components("FOODS_3_008")


In [None]:
# Compute STL-based features of `sales` for every product.
# The result is joined to the intermittency features by `product` further
# down, and its `seasonal_strength_week` column is used in the scatterplot
# matrix.
# NOTE(review): `.model` is forwarded to feat_stl() through `...`; it is not
# clear that feat_stl() honours a model specification passed this way, so the
# STL settings below may be silently ignored -- confirm against the feasts
# documentation.
# NOTE(review): if `product` is already the tsibble key, group_by(product) is
# presumably redundant -- confirm.
stl_feats <- train |>
  group_by(product) |>
  features(
    sales,
    feat_stl,
    .model = STL(
      sales ~ trend(window = 365) +
        season(period = 7) +
        season(period = 365),
      robust = TRUE
    )
  )


# Summarise how intermittent a demand series is.
#
# Missing observations are counted as zero demand before anything is computed.
#
# Args:
#   y: demand values; coerced with as.numeric().
#
# Returns:
#   A one-row tibble with
#     p_zero  - share of observations equal to zero,
#     adi     - mean gap (in positions) between consecutive positive
#               observations; Inf when there is at most one positive value,
#     mean_nz - mean of the positive observations (0 when there are none),
#     cv2     - variance of the positive observations divided by mean_nz^2;
#               Inf when mean_nz is not positive.
intermittency_features <- function(y) {
  demand <- as.numeric(y)
  demand[is.na(demand)] <- 0

  is_positive <- demand > 0
  positive_pos <- which(is_positive)
  positive_vals <- demand[is_positive]
  n_positive <- length(positive_vals)

  # Gaps between demand occurrences need at least two occurrences.
  adi <- Inf
  if (n_positive > 1) {
    adi <- mean(diff(positive_pos))
  }

  mean_nz <- 0
  if (n_positive > 0) {
    mean_nz <- mean(positive_vals)
  }

  # var() of a single value is NA, so a lone positive value gets variance 0.
  var_nz <- 0
  if (n_positive > 1) {
    var_nz <- var(positive_vals)
  }

  cv2 <- Inf
  if (mean_nz > 0) {
    cv2 <- var_nz / (mean_nz^2)
  }

  tibble(
    p_zero = mean(demand == 0),
    adi = adi,
    mean_nz = mean_nz,
    cv2 = cv2
  )
}

# One row of intermittency features per product. The tsibble is converted to
# a plain tibble first so summarise() collapses each product to a single row.
feat_int <- summarise(
  group_by(as_tibble(train), product),
  intermittency_features(sales),
  .groups = "drop"
)
head(feat_int)

# Mean over products of each product's share of zero-sales days.
average_sparsity <- mean(feat_int[["p_zero"]])
cat(
  "Average Sparsity (Proportion of zero daily sales across all products): ",
  round(average_sparsity, 4),
  "\n"
)

# Attach the STL features to the intermittency features, matching on product.
features_all <- left_join(feat_int, stl_feats, by = "product")
head(features_all)


In [None]:
# Pairwise scatterplots of the four intermittency features together with the
# weekly seasonal strength taken from the STL features.
features_all |>
  ggpairs(
    columns = c("p_zero", "adi", "mean_nz", "cv2", "seasonal_strength_week"),
    title = "Scatterplot Matrix of Intermittency Features"
  )


In [None]:
# Label each product by regime: "sparse" when more than 60% of its
# observations are zero, "dense" otherwise.
feat <- mutate(
  features_all,
  regime = if_else(p_zero > 0.6, "sparse", "dense")
)


### Setting up some helper functions
We first train the sparsity model (also called the "hurdle model" or "occurrence model"), which predicts whether any sale occurs, and later a "size model", which predicts how many units are sold given that at least one sale occurs.

### Fit hurdle models on each product

In [None]:
# Settings
h <- 28 # forecast horizon / test window (days)

# Products labelled "sparse" in `feat`.
sparse_products <- pull(filter(feat, regime == "sparse"), product)

# Keep only the sparse products and flag, per product, the rows that fall in
# that product's last h days as the test window.
# NOTE(review): days() is presumably lubridate::days(), attached by one of the
# sourced helper files -- lubridate is not loaded in this notebook; confirm.
data_sparse <- train |>
  filter(product %in% sparse_products) |>
  group_by(product) |>
  arrange(day, .by_group = TRUE) |>
  mutate(is_test = day > (max(day) - days(h))) |>
  ungroup()

train_sparse <- filter(data_sparse, !is_test)
test_sparse <- filter(data_sparse, is_test)

# One row per product with its training rows in the list-column `data`, so a
# separate model can be fitted to each product.
nested <- nest(group_by(train_sparse, product))

# Fit models per product
# NOTE(review): fit_hurdle_one() is defined outside this section; it receives
# one product's nested training data.
models <- nested |>
  mutate(model = map(data, \(product_data) fit_hurdle_one(product_data))) |>
  select(product, model)


### Predict and compute RMSE and MAE on the test set

First, forecast the future sell prices:

In [None]:
path <- "models/prices/naive.rds"

# Load the cached NAIVE price fit if it exists; otherwise fit it once and
# cache it. The model is no longer refitted after this branch, so the cache
# actually saves work.
# NOTE: the cached fit is not invalidated when `train` changes; delete the
# .rds file to force a refit.
if (file.exists(path)) {
  price_fit_naive <- readRDS(path)
} else {
  price_fit_naive <- train |>
    model(NAIVE(sell_price))

  # saveRDS() does not create missing directories.
  dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
  saveRDS(price_fit_naive, path)
}

# Point forecasts of sell_price for the next h days, reshaped to the columns
# product / day / sell_price.
sell_price_future <- price_fit_naive |>
  forecast(h = h) |>
  as_tsibble() |>
  select(product, day, .mean) |>
  rename(sell_price = .mean)


In [None]:
# Calendar indicators for the h days after the last training day.
# NOTE(review): create_future_indicators() is defined outside this section;
# its output is expected to carry a `day` column for the join below.
last_train_day <- max(train[["day"]])
calendar_events_future <- create_future_indicators(last_train_day, h)

# Keep only the days present in both the price forecasts and the calendar.
future_data <- inner_join(
  sell_price_future,
  calendar_events_future,
  by = "day"
)

head(future_data)


In [None]:
# Preview the held-out rows (is_test == TRUE) of the sparse products.
head(test_sparse)


In [None]:
# Predict the test window of every sparse product with that product's fitted
# hurdle model.
# Each `|>` must end the line it continues: a line that starts with `|>`
# after a complete expression is a parse error in R.
# NOTE(review): predict_hurdle_one() is defined outside this section; the
# evaluation code that follows expects its output to contain `sales` and
# `y_hat`.
preds <- test_sparse |>
  as_tibble() |>
  group_by(product) |>
  tidyr::nest() |>
  left_join(models, by = "product") |>
  mutate(
    pred = purrr::map2(
      model,
      data,
      \(fit, new_data) predict_hurdle_one(fit, new_data)
    )
  ) |>
  select(product, pred) |>
  tidyr::unnest(pred)


# Evaluate RMSE per product and overall
# NOTE(review): rmse() is not defined in this section; presumably it comes
# from one of the sourced helper files.
rmse_by_product <- summarise(
  group_by(preds, product),
  rmse = rmse(sales, y_hat),
  n_test = n(),
  .groups = "drop"
)

# Pooled error over all test rows of all sparse products.
overall_rmse <- rmse(preds[["sales"]], preds[["y_hat"]])
overall_mae <- with(preds, mean(abs(sales - y_hat), na.rm = TRUE))

cat("RMSE on sparse products", overall_rmse, "\n")
cat("MAE on sparse products", overall_mae, "\n")

# The 20 products with the largest test RMSE.
rmse_by_product |>
  arrange(desc(rmse)) |>
  head(20) |>
  print()
