# Statistical Baseline Prediction

## Notebook Configuration && Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from storesales.constants import (
    SUBMISSIONS_PATH,
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_SAMPLE_SUBMISSION_PATH,
    EXTERNAL_TEST_PATH,
)

### Train Data

In [3]:
original_train_df = pd.read_csv(
    EXTERNAL_TRAIN_PATH, parse_dates=["date"], index_col="id"
)

original_train_df.sort_values(by=["date", "store_nbr", "family"], inplace=True)
original_train_df["day_of_week"] = original_train_df["date"].dt.dayofweek
original_train_df["is_weekend"] = original_train_df["day_of_week"] >= 5

### Test Data

In [4]:
test_df = pd.read_csv(EXTERNAL_TEST_PATH, parse_dates=["date"])

### Sample Submission

In [5]:
sample_submission_df = pd.read_csv(EXTERNAL_SAMPLE_SUBMISSION_PATH, index_col="id")

# Rolling Window Mean Prediction

In [6]:
test_period_length = test_df["date"].nunique()

In [7]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

## 1) Group by store_nbr and family

**Kaggle Public Score: 0.45751**

### Predictions

In [8]:
train_df = original_train_df.copy()
cols = ["store_nbr", "family"]
window = 14

In [9]:
grouped_window_sales = train_df.groupby(cols)["sales"].rolling(window=window)
train_df["mean_window_sales"] = grouped_window_sales.mean().reset_index(
    level=cols, drop=True
)

train_df["window_prediction"] = train_df.groupby(cols)["mean_window_sales"].shift(1)
train_df.dropna(inplace=True)

### Evaluation

In [10]:
def evaluate_algo_1(df: pd.DataFrame, eval_start: pd.Timestamp, eval_length: int):
    predictions = []
    actual_values = []

    for (_store, _family), group in tqdm(df.groupby(cols)):
        group = group[group["date"] >= eval_start]

        for i in range(0, len(group) - eval_length):
            actual_sales = group["sales"].iloc[i : i + eval_length]

            prediction = group["window_prediction"].iloc[i]
            predicted_sales = np.repeat(prediction, eval_length)

            predictions.extend(predicted_sales)
            actual_values.extend(actual_sales.values)

    return np.array(predictions), np.array(actual_values)

In [11]:
last_period = pd.DateOffset(days=50)
evaluation_start_date = train_df["date"].max() - last_period

In [12]:
sales_pred, sales_true = evaluate_algo_1(
    train_df, evaluation_start_date, test_period_length
)

100%|██████████| 1782/1782 [00:04<00:00, 419.79it/s]


In [13]:
error = rmsle(sales_true, sales_pred)

print(f"RMSLE Error: {error}")

RMSLE Error: 0.481519852937688


### Submission

In [14]:
store_to_family_submissions_file = "rolling_mean_store_family.csv"
store_to_family_submissions_df = sample_submission_df.copy()

test_prediction = train_df[train_df["date"] == train_df["date"].max()]

sub_values = test_df.merge(  # use not shifted predictions 'mean_window_sales'
    test_prediction[["store_nbr", "family", "mean_window_sales"]],
    on=["store_nbr", "family"],
    how="left",
)
sub_values.set_index("id", inplace=True)

In [15]:
store_to_family_submissions_df["sales"] = sub_values["mean_window_sales"]

In [16]:
file_path = os.path.join(SUBMISSIONS_PATH, store_to_family_submissions_file)

store_to_family_submissions_df.to_csv(file_path)

## 2) Group by store_nbr, family and is_weekend

**Idea**: Calculate separate rolling mean sales values for weekends and weekdays to capture 'sales' to 'is weekend' correlation.

In [17]:
train_df = original_train_df.copy()

cols = ["store_nbr", "family", "is_weekend"]
n_window_weekdays = 10  # 10 weekdays ~ 2 weeks period
n_window_weekends = 6  # 6 weekends ~ 3 weeks period

In [18]:
def get_rolling_mean(group, is_weekend):
    if is_weekend:
        return group.rolling(window=n_window_weekdays).mean()
    return group.rolling(window=n_window_weekends).mean()

In [19]:
# Group data by store, family, and is_weekend (weekday/weekend separation)
store_to_family_groups = train_df.groupby(cols)["sales"]

# Apply rolling mean within each group depending on is_weekend
train_df["mean_sales"] = store_to_family_groups.apply(
    lambda group: get_rolling_mean(group, group.name[2])  # group.name[2] -> is_weekend
).reset_index(level=cols, drop=True)

# Shift the rolling mean prediction within the same groups
train_df["mean_sales_shift"] = train_df.groupby(cols)["mean_sales"].shift(1)
train_df.dropna(inplace=True)

### Evaluation

In [20]:
def evaluate_algo_2(df: pd.DataFrame, eval_start_date: pd.Timestamp, eval_length: int):
    predictions = []
    actual_values = []

    eval_length_offset = pd.DateOffset(days=eval_length)
    df = df.set_index("date")
    max_date = df.index.max()

    for (_store, _family, _is_weekend), group in tqdm(df.groupby(cols)):
        group = group[group.index >= eval_start_date]

        indices = group.index[group.index < max_date - eval_length_offset]
        for eval_start in indices:
            actual_sales = group["sales"].loc[
                eval_start : eval_start + eval_length_offset
            ]

            prediction = group["mean_sales_shift"].loc[eval_start]
            predicted_sales = np.repeat(prediction, actual_sales.shape[0])

            predictions.extend(predicted_sales)
            actual_values.extend(actual_sales.values)

            eval_start += pd.DateOffset(days=1)

    return np.array(predictions), np.array(actual_values)

In [21]:
last_period = pd.DateOffset(days=50)
evaluation_start_date = train_df["date"].max() - last_period

In [22]:
sales_pred, sales_true = evaluate_algo_2(
    train_df, evaluation_start_date, test_period_length - 1
)

100%|██████████| 3564/3564 [00:15<00:00, 229.31it/s]


In [23]:
error = rmsle(sales_true, sales_pred)

print(f"RMSLE Error: {error}")

RMSLE Error: 0.46772057606550316


### Submission

In [24]:
store_to_family_weekend_submissions_file = "rolling_mean_store_family_weekend.csv"
store_to_family_weekend_submissions_df = sample_submission_df.copy()

test_df["is_weekend"] = test_df["date"].dt.dayofweek >= 5

In [25]:
max_dates = train_df.groupby(["store_nbr", "family", "is_weekend"])["date"].idxmax()
submission_prediction = train_df.loc[max_dates]

In [26]:
sub_values = test_df.merge(
    submission_prediction[["store_nbr", "family", "is_weekend", "mean_sales"]],
    on=["store_nbr", "family", "is_weekend"],
    how="left",
)
sub_values.set_index("id", inplace=True)

In [27]:
store_to_family_weekend_submissions_df["sales"] = sub_values["mean_sales"]

In [28]:
file_path = os.path.join(SUBMISSIONS_PATH, store_to_family_weekend_submissions_file)

store_to_family_weekend_submissions_df.to_csv(file_path)