In [None]:
import pathlib
import sys

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Project root, expressed relative to the notebook's working directory (two levels up).
# The data and source paths in the following cell are all derived from this value.
root_dir = "../.."

In [None]:
# Build every path used by this notebook from `root_dir`.
# These are pure path computations; nothing is read from or written to disk here.
root_dir_path = pathlib.Path(root_dir)
data_dir_path = root_dir_path.joinpath("data")

# Inputs: the three raw CSV files.
raw_dir_path = data_dir_path.joinpath("raw")
calendar_path = raw_dir_path.joinpath("calendar.csv")
sales_train_validation_path = raw_dir_path.joinpath("sales_train_validation.csv")
sell_prices_path = raw_dir_path.joinpath("sell_prices.csv")

# Outputs: the two parquet files written by this notebook.
interim_dir_path = data_dir_path.joinpath("interim")
interim_path = interim_dir_path.joinpath("interim.parquet")
processed_dir_path = data_dir_path.joinpath("processed")
train_path = processed_dir_path.joinpath("train.parquet")

# Location appended to sys.path in a later cell.
src_dir_path = root_dir_path.joinpath("src")

In [None]:
# Put the project's `src` directory on the import path so the `package` imports in the
# following cell can be looked up there.
# The membership check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
if str(src_dir_path) not in sys.path:
    sys.path.append(str(src_dir_path))

In [None]:
from package.constants import *
from package.feature_extraction import *
from package.preprocessing import *
from package.utils import *

In [None]:
# Read the raw sales CSV, passing the project-defined `dtype` specification to the parser.
sales_train_validation = pd.read_csv(
    filepath_or_buffer=sales_train_validation_path,
    dtype=dtype,
)

In [None]:
# Project helper (star-imported; its definition is not visible in this notebook).
# The return value is not captured, so any effect must happen on the frame itself;
# presumably it downcasts column dtypes in place — TODO confirm in package.utils.
reduce_memory_usage(sales_train_validation)

In [None]:
# Append one all-NaN column per day following the training period, for a span of
# 2 * test_days days. After the melt in the next cell these become rows whose target
# value is missing.
first_future_day = train_days + 1
for day_number in range(first_future_day, first_future_day + 2 * test_days):
    sales_train_validation[f"d_{day_number}"] = np.nan

In [None]:
# Reshape from wide to long: the identifier columns are kept as-is, every remaining
# column label is moved into the "d" column, and the corresponding cell value is
# stored in the column named by `target`.
series_id_columns = ["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"]
interim = sales_train_validation.melt(
    value_name=target,
    var_name="d",
    id_vars=series_id_columns,
)

In [None]:
# Remove the notebook's reference to the wide frame; the cells that follow only use
# the melted `interim` frame.
del sales_train_validation

In [None]:
# Read the calendar CSV with the project-defined `dtype` specification; the columns
# listed in `parse_dates` are handed to the parser for date parsing.
calendar = pd.read_csv(
    filepath_or_buffer=calendar_path,
    parse_dates=parse_dates,
    dtype=dtype,
)

In [None]:
# Project helpers (star-imported; definitions not visible in this notebook).
# Their return values are discarded, so they presumably add columns to `calendar`
# in place — TODO confirm.
# NOTE(review): the raw event_name_*/event_type_* columns are dropped in the next cell,
# so the two event helpers presumably derive their replacements from them first — verify.
create_calendar_features(calendar, parse_dates)
create_event_name(calendar)
create_event_type(calendar)

In [None]:
# Remove the listed columns from `calendar` in place, before the frame is merged
# into `interim`.
calendar.drop(
    labels=[
        "event_name_1",
        "event_name_2",
        "event_type_1",
        "event_type_2",
        "month",
        "wday",
        "weekday",
        "year",
    ],
    axis="columns",
    inplace=True,
)

In [None]:
# Project helper called for its effect on `calendar` (return value discarded);
# presumably downcasts column dtypes in place — TODO confirm in package.utils.
reduce_memory_usage(calendar)

In [None]:
# Left join on the day label "d": every row of `interim` is kept and gains the
# calendar columns that match its day.
interim = pd.merge(
    left=interim,
    right=calendar,
    how="left",
    on="d",
    copy=False,
)

In [None]:
# Remove the notebook's reference to `calendar`; its columns now live in `interim`
# and the frame is not used again in the cells that follow.
del calendar

In [None]:
# Read the sell-price CSV with the project-defined `dtype` specification.
sell_prices = pd.read_csv(
    filepath_or_buffer=sell_prices_path,
    dtype=dtype,
)

In [None]:
# Derive coarser identifier columns by regex: each pattern captures the part of the
# source value that precedes a trailing "_<digits>" suffix.
# Order matters: "cat_id" is extracted from the "dept_id" column created just before it.
for derived_col, source_col, pattern in (
    ("dept_id", "item_id", r"(\w+_\d+)_\d+"),
    ("cat_id", "dept_id", r"(\w+)_\d+"),
    ("state_id", "store_id", r"(\w+)_\d+"),
):
    sell_prices[derived_col] = sell_prices[source_col].str.extract(pattern)

In [None]:
# Price feature helpers from the project package (definitions not visible here).
# All four are called for their effect on `sell_prices`; return values are discarded.
# `level_id_cols`, `raw_numerical_features` and `periods` come from the star imports.
# NOTE(review): the first call receives the slice level_id_cols[1:11], the second the
# tail level_id_cols[11:], and the last two the single entry level_id_cols[11];
# what each entry groups by must be checked in package.constants.
# NOTE(review): the dept_id/cat_id/state_id columns derived in the previous cell are
# dropped right after this one, so they presumably serve as grouping keys here — verify.
create_aggregate_features(sell_prices, level_id_cols[1:11], raw_numerical_features)
create_expanding_features(sell_prices, level_id_cols[11:], raw_numerical_features)
create_pct_change_features(sell_prices, level_id_cols[11], raw_numerical_features, periods)
create_scaled_features(sell_prices, level_id_cols[11], raw_numerical_features)

In [None]:
# Remove the three derived identifier columns again, in place, before merging:
# `interim` already carries columns with these names from the melt.
sell_prices.drop(
    labels=["dept_id", "cat_id", "state_id"],
    axis="columns",
    inplace=True,
)

In [None]:
# Project helper called for its effect on `sell_prices` (return value discarded);
# presumably downcasts column dtypes in place — TODO confirm in package.utils.
reduce_memory_usage(sell_prices)

In [None]:
# Left join on (store, item, week key): every row of `interim` is kept; rows without
# a matching price record get missing values in the price columns.
interim = pd.merge(
    left=interim,
    right=sell_prices,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    copy=False,
)

In [None]:
# Remove the notebook's reference to `sell_prices`; its columns now live in `interim`
# and the frame is not used again in the cells that follow.
del sell_prices

In [None]:
# Replace the index with a fresh default integer index, in place; drop=True discards
# the old index instead of inserting it as a column.
interim.reset_index(drop=True, inplace=True)

In [None]:
# Feature helpers from the project package (definitions not visible in this notebook).
# Each is called for its effect on `interim`; return values are discarded.
# NOTE(review): `shift_features_batch` is presumably the list of columns produced by the
# shift call on the line before it — confirm against package.constants.
create_shift_features(interim, level_id_cols[11], [target], periods_batch)
create_rolling_features(interim, level_id_cols[11:], shift_features_batch, windows)
create_days_since_release(interim)
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/144842
create_is_working_day(interim)
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/150955
create_snap(interim)

In [None]:
# Remove the week key and the three per-state SNAP columns from `interim`, in place.
interim.drop(
    labels=["wm_yr_wk", "snap_CA", "snap_TX", "snap_WI"],
    axis="columns",
    inplace=True,
)

In [None]:
# Project helper applied to the columns listed in `categorical_features` (both names
# come from the star imports). The return value is discarded, so the encoding is
# presumably written back into `interim` in place — TODO confirm.
label_encode(interim, categorical_features)

In [None]:
# Project helper called for its effect on `interim` (return value discarded), run
# just before the frame is inspected and written to disk; presumably downcasts column
# dtypes in place — TODO confirm in package.utils.
reduce_memory_usage(interim)

In [None]:
# Print a summary of the frame (columns, dtypes, memory usage) as a check before saving.
interim.info()

In [None]:
# Persist the full long-format frame. At this point it still contains the rows whose
# target is missing (the placeholder days appended before the melt); those rows are
# only removed in the next cell.
# The output directory is created first so that a missing data/interim folder does not
# abort the run after all of the feature computation above has already been done.
interim_dir_path.mkdir(parents=True, exist_ok=True)
interim.to_parquet(interim_path)

In [None]:
# From here on `interim` is narrowed to training rows: drop, in place, every row whose
# target is missing. This includes the placeholder days that were appended as all-NaN
# columns before the melt.
interim.dropna(inplace=True, subset=[target])

In [None]:
# Second shift/rolling pass, this time with the `periods_online` /
# `shift_features_online` parameter sets, run after the rows without a target were removed.
# Helpers and parameter sets are star-imported; their definitions are not visible here.
# NOTE(review): `shift_features_online` is presumably the list of columns produced by
# the shift call on the line before it — confirm against package.constants.
create_shift_features(interim, level_id_cols[11], [target], periods_online)
create_rolling_features(interim, level_id_cols[11:], shift_features_online, windows)

In [None]:
# Drop, in place, every row that has no sell_price, then renumber the index so it is
# a contiguous default integer index again (the old index is discarded).
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/138268
interim.dropna(inplace=True, subset=["sell_price"])
interim.reset_index(drop=True, inplace=True)

In [None]:
# Store the element-wise product of the target column and "sell_price" in the column
# named by `transformed_target`.
interim[transformed_target] = interim[target].mul(interim["sell_price"])

In [None]:
# Remove the row identifier, the day label and the untransformed target from the
# training frame, in place.
interim.drop(
    labels=["id", "d", target],
    axis="columns",
    inplace=True,
)

In [None]:
# Project helper called for its effect on `interim` (return value discarded), run
# again because new columns were added since the previous call; presumably downcasts
# column dtypes in place — TODO confirm in package.utils.
reduce_memory_usage(interim)

In [None]:
# Print a summary of the final training frame (columns, dtypes, memory usage) before saving.
interim.info()

In [None]:
# Persist the training frame. The output directory is created first so that a missing
# data/processed folder does not make the final write fail after the whole notebook
# has already been computed.
processed_dir_path.mkdir(parents=True, exist_ok=True)
interim.to_parquet(train_path)