# Statistical Baseline Prediction

## Notebook Configuration and Imports

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import numpy as np

from storesales.constants import (
    SUBMISSIONS_PATH,
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_SAMPLE_SUBMISSION_PATH,
    EXTERNAL_OIL_PATH,
    EXTERNAL_TEST_PATH,
)

## Load and Prepare Data

In [4]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of a prediction.

    Both inputs go through ``np.log1p`` before they are compared, so the
    metric measures the gap between ``log(1 + value)`` terms.

    Args:
        y_true: Observed values (scalar or array-like).
        y_pred: Predicted values, compatible in shape with ``y_true``.

    Returns:
        Square root of the mean squared difference of the ``log1p`` values.
    """
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    mean_squared_log_error = np.mean(np.square(log_gap))
    return np.sqrt(mean_squared_log_error)

In [6]:
def process_oil_df(df: pd.DataFrame) -> pd.DataFrame:
    """Put the oil frame on a daily grid and forward-fill ``dcoilwtico``.

    Args:
        df: Frame indexed by date that contains a ``dcoilwtico`` column.

    Returns:
        A new frame with one row per calendar day. Missing ``dcoilwtico``
        values take the most recent earlier value; rows that still contain
        a missing value afterwards (for example leading rows with nothing
        to fill from) are dropped.
    """
    daily = df.asfreq("D")
    filled_price = daily["dcoilwtico"].ffill()
    return daily.assign(dcoilwtico=filled_price).dropna()

In [258]:
# Training data: parse the dates, key the rows by ``id`` and order them by
# date, then store number, then family.
train_df = pd.read_csv(
    EXTERNAL_TRAIN_PATH, index_col="id", parse_dates=["date"]
).sort_values(by=["date", "store_nbr", "family"])

# Calendar features derived from the date column.
train_df["day_of_week"] = train_df["date"].dt.dayofweek
train_df["is_weekend"] = train_df["day_of_week"] >= 5  # 5, 6 = Saturday, Sunday

# Rows to predict, keyed by ``id``.
test_df = pd.read_csv(EXTERNAL_TEST_PATH, index_col="id", parse_dates=["date"])

# Oil data, indexed by its parsed ``date`` column.
oil_df = pd.read_csv(EXTERNAL_OIL_PATH, index_col="date", parse_dates=["date"])

# Submission template, keyed by ``id``.
sample_submission_df = pd.read_csv(EXTERNAL_SAMPLE_SUBMISSION_PATH, index_col="id")

In [None]:
# Replace the raw oil frame with its daily, forward-filled version.
oil_df = process_oil_df(oil_df)

# Mean Prediction

In [220]:
# Name of the column in which the prediction helpers store their output.
prediction_col_name = "window_sales_shift"

In [250]:
def predict(
    df: pd.DataFrame, cols: list, window: int, prediction_col=None
) -> pd.DataFrame:
    """Predict each row's sales as the mean of the preceding ``window`` rows.

    The rolling mean is computed within each group defined by ``cols`` and is
    shifted by one row inside that same group. A prediction therefore uses
    only earlier rows of its own group and never the row's own ``sales``.
    Rows are consumed in their existing order, so ``df`` should already be
    sorted chronologically.

    Args:
        df: Frame with a ``sales`` column and the grouping columns. It is
            modified in place.
        cols: Column names that identify one series.
        window: Number of preceding rows averaged for each prediction.
        prediction_col: Name of the output column. Defaults to the
            module-level ``prediction_col_name``.

    Returns:
        ``df`` itself, with the prediction column added and every row that
        contains a missing value removed. This includes the first ``window``
        rows of each group, which have no complete history.
    """
    target_col = prediction_col_name if prediction_col is None else prediction_col

    def shifted_rolling_mean(sales: pd.Series) -> pd.Series:
        # Shifting per group keeps the last mean of one group from leaking
        # into the first row of the next group.
        return sales.rolling(window=window).mean().shift(1)

    df[target_col] = df.groupby(cols)["sales"].transform(shifted_rolling_mean)
    # NOTE: this drops rows with a missing value in any column, not only in
    # the prediction column.
    df.dropna(inplace=True)
    return df


def weekend_group_predict(
    df: pd.DataFrame,
    cols: list,
    window_weekday: int,
    window_weekend: int,
    prediction_col=None,
) -> pd.DataFrame:
    """Predict sales with a rolling mean whose window depends on the day type.

    For every group defined by ``cols`` the prediction of a row is the mean
    of the preceding rows of that group, shifted by one row so the row's own
    ``sales`` is excluded. The window size is picked from the ``is_weekend``
    flag of the group's first row, so ``cols`` should include ``is_weekend``
    for the weekday/weekend split to take effect.

    Args:
        df: Frame with ``sales`` and ``is_weekend`` columns plus the grouping
            columns. It is modified in place.
        cols: Column names that identify one series.
        window_weekday: Window size for groups flagged as weekday.
        window_weekend: Window size for groups flagged as weekend.
        prediction_col: Name of the output column. Defaults to the
            module-level ``prediction_col_name``.

    Returns:
        ``df`` itself, with the prediction column added and every row that
        contains a missing value removed.
    """
    target_col = prediction_col_name if prediction_col is None else prediction_col

    def get_rolling_mean(sales: pd.Series, is_weekend: bool) -> pd.Series:
        # TODO: because of the one-row shift, the most recent sale value is
        # lost for the test set (it never enters the last prediction).
        window = window_weekend if is_weekend else window_weekday
        return sales.rolling(window=window).mean().shift(1)

    # Collect the per-group results explicitly instead of using
    # ``groupby.apply``: when all group results share one index (e.g. a
    # single group) ``apply`` returns a wide DataFrame rather than a Series,
    # which cannot be assigned to one column.
    per_group = [
        get_rolling_mean(group["sales"], group["is_weekend"].iloc[0])
        for _, group in df.groupby(cols)
    ]
    if per_group:
        # Assignment aligns on the original row index.
        df[target_col] = pd.concat(per_group)
    else:
        df[target_col] = pd.Series(index=df.index, dtype="float64")

    df.dropna(inplace=True)
    return df


def eval_prediction(df: pd.DataFrame) -> float:
    """Score the predictions in ``df`` on its most recent rows with RMSLE.

    Only rows dated on or after the cut-off are scored. The cut-off lies
    17 days before the earliest date of the module-level ``test_df``.

    Args:
        df: Frame with ``date``, ``sales`` and the prediction column named by
            the module-level ``prediction_col_name``.

    Returns:
        RMSLE between ``sales`` and the prediction column on those rows.
    """
    holdout_start = test_df["date"].min() - pd.Timedelta(days=17)
    holdout = df.loc[df["date"] >= holdout_start]
    return rmsle(holdout["sales"], holdout[prediction_col_name])

In [251]:
# One series per store/family, split further into weekday and weekend rows.
group_columns = ["store_nbr", "family", "is_weekend"]
# Rolling window sizes, counted in rows of the respective group.
n_window_weekdays = 10
n_window_weekends = 6

# Pass a copy: the helper adds a column to its input and drops rows from it.
predict_df = weekend_group_predict(
    df=train_df.copy(),
    cols=group_columns,
    window_weekday=n_window_weekdays,
    window_weekend=n_window_weekends,
)
eval_prediction(df=predict_df)

np.float64(0.4009640685973311)

In [268]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0_level_0,date,store_nbr,family,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0
3000889,2017-08-16,1,BABY CARE,0
3000890,2017-08-16,1,BEAUTY,2
3000891,2017-08-16,1,BEVERAGES,20
3000892,2017-08-16,1,BOOKS,0


In [274]:
# File name and template for the weekday/weekend rolling-mean submission.
grouped_by_is_weekend_sub_file = "grouped_by_is_weekend_submission.csv"
grouped_by_is_weekend_sub = sample_submission_df.copy()

# Predictions of the most recent date in ``predict_df``; the merge below
# reuses each store/family value for every matching test row.
test_prediction = predict_df[predict_df["date"] == predict_df["date"].max()]

# Expose ``id`` as a regular column so it survives the merge. The guard keeps
# the cell idempotent: resetting the index again on a re-run would add a
# stray ``index`` column to ``test_df``.
if "id" not in test_df.columns:
    test_df.reset_index(inplace=True)
# NOTE(review): the left merge yields NaN for store/family pairs without a
# prediction on that date - confirm none exist before submitting.
sub_values = test_df.merge(
    test_prediction[["store_nbr", "family", prediction_col_name]],
    on=["store_nbr", "family"],
    how="left",
)

In [275]:
# Preview the test rows with their merged predictions.
sub_values.head()

Unnamed: 0,index,id,date,store_nbr,family,onpromotion,window_sales_shift
0,0,3000888,2017-08-16,1,AUTOMOTIVE,0,4.8
1,1,3000889,2017-08-16,1,BABY CARE,0,0.0
2,2,3000890,2017-08-16,1,BEAUTY,2,4.6
3,3,3000891,2017-08-16,1,BEVERAGES,20,2150.2
4,4,3000892,2017-08-16,1,BOOKS,0,0.0


In [276]:
# Index the predictions by ``id`` so the assignment below aligns with the
# submission template. The guard lets the cell be re-run: once ``id`` is the
# index it is no longer a column, and setting it again would raise KeyError.
if sub_values.index.name != "id":
    sub_values.set_index("id", inplace=True)

grouped_by_is_weekend_sub["sales"] = sub_values[prediction_col_name]

In [277]:
# Write the filled submission as CSV under SUBMISSIONS_PATH.
grouped_by_is_weekend_sub.to_csv(
    os.path.join(SUBMISSIONS_PATH, grouped_by_is_weekend_sub_file)
)

In [237]:
# Rolling mean without the weekday/weekend split: one series per store/family.
group_columns = ["store_nbr", "family"]
n_values_for_mean = 14  # rolling window size, in rows

# Pass a copy: the helper adds a column to its input and drops rows from it.
predict_df = predict(
    df=train_df.copy(), cols=group_columns, window=n_values_for_mean
)
eval_prediction(df=predict_df)

np.float64(0.4557929809136305)

In [192]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [238]:
# Two time-series splitters with 40-sample test folds (5 and 12 splits);
# presumably meant as the outer and inner loops of a nested CV - TODO confirm.
outer_cv = TimeSeriesSplit(n_splits=5, test_size=40)
inner_cv = TimeSeriesSplit(n_splits=12, test_size=40)


# Daily calendar from the first training date up to and including the first
# test date.
search_range = pd.date_range(train_df["date"].min(), test_df["date"].min(), freq="D")

# TODO: unfinished draft of the outer CV loop, kept for reference.
# NOTE(review): the draft builds its masks from ``train_df`` but applies them
# to ``predict_df`` as well - confirm both frames share an index before
# reviving it.
# for train_idx, test_idx in outer_cv.split(search_range):
#     train_con = train_df["date"].isin(search_range[train_idx])
#     test_con = train_df["date"].isin(search_range[test_idx])
#
#     X_train = train_df[train_con].copy()
#     X_test = predict_df[test_con].copy()
#
#
#
#     y_train = predict_df[train_con]["sales"]
#     y_test = predict_df[test_con]["sales"]

## 1) Mean Prediction

In [33]:
# Output file name for the all-period mean submission.
submission_file_name = "all_period_mean_submission.csv"

In [34]:
all_period_mean_submission_df = sample_submission_df.copy()
columns = ["store_nbr", "family"]

# Mean of all training sales for every store/family pair.
mean_sales_per_store = train_df.groupby(columns)["sales"].mean()

# A column-based merge discards the left index, so ``id`` has to be a column
# to survive it. Derive that here instead of relying on another cell having
# reset the index of ``test_df`` beforehand.
test_with_id = test_df if "id" in test_df.columns else test_df.reset_index()
prediction = test_with_id.merge(mean_sales_per_store, on=columns, how="left")
prediction.set_index("id", inplace=True)

# The assignment aligns on the ``id`` index of the submission template.
all_period_mean_submission_df["sales"] = prediction["sales"]

In [35]:
# Write the all-period mean submission as CSV under SUBMISSIONS_PATH.
submission_path = os.path.join(SUBMISSIONS_PATH, submission_file_name)
all_period_mean_submission_df.to_csv(submission_path)

## 2) Mean Sales Per Last Period

In [36]:
# Lengths of the trailing training windows to average over.
period_length = [7, 14, 30, 50, 70]  # days
# File name stem for the last-period mean submissions.
submission_file_name = "last_period_mean_submission"

In [38]:
last_period_mean_submission_df = sample_submission_df.copy()
columns = ["store_nbr", "family"]

# Loop invariants, computed once: the last training date and a test frame
# that exposes ``id`` as a column. A column-based merge discards the left
# index, so do not rely on another cell having reset the index of
# ``test_df`` beforehand.
last_train_date = train_df["date"].max()
test_with_id = test_df if "id" in test_df.columns else test_df.reset_index()

for period in period_length:
    # Keep only the final ``period`` days of the training data.
    date_condition = train_df["date"] > last_train_date - pd.Timedelta(days=period)
    mean_sales_per_store = train_df[date_condition].groupby(columns)["sales"].mean()
    prediction = test_with_id.merge(mean_sales_per_store, on=columns, how="left")
    prediction.set_index("id", inplace=True)

    # The same frame is overwritten and saved again for every period.
    last_period_mean_submission_df["sales"] = prediction["sales"]

    submission_path = os.path.join(
        SUBMISSIONS_PATH, f"{submission_file_name}_{period}_days.csv"
    )
    last_period_mean_submission_df.to_csv(submission_path)

## 3) Mean Sales For Each Day Of The Week

In [1]:
# File name stem for the day-of-week mean submission.
submission_file_name = "day_of_week_mean_submission"