In [None]:
import pathlib
import sys

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path to the repository root; every other path in this notebook is
# derived from it.
# NOTE(review): "../.." assumes the kernel's working directory sits two levels
# below the root — confirm.
root_dir = "../.."

In [None]:
# Resolve every location used by this notebook from the repository root.
root_dir_path = pathlib.Path(root_dir)
src_dir_path = root_dir_path.joinpath("src")
data_dir_path = root_dir_path.joinpath("data")

# Raw input CSV files.
raw_dir_path = data_dir_path.joinpath("raw")
calendar_path = raw_dir_path.joinpath("calendar.csv")
sales_train_validation_path = raw_dir_path.joinpath("sales_train_validation.csv")
sell_prices_path = raw_dir_path.joinpath("sell_prices.csv")

# Parquet file written at the end of this notebook.
interim_dir_path = data_dir_path.joinpath("interim")
interim_path = interim_dir_path.joinpath("interim.parquet")

In [None]:
# Make the project's `src` directory importable so the `package.*` imports in
# the next cell resolve. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if str(src_dir_path) not in sys.path:
    sys.path.append(str(src_dir_path))

In [None]:
from package.constants import *
from package.feature_extraction import *
from package.preprocessing import *
from package.utils import *

## calendar

In [None]:
# Load the calendar table. `dtype` and `parse_dates` are not defined in this
# notebook; presumably they come from the `package.*` star imports — confirm.
calendar = pd.read_csv(calendar_path, dtype=dtype, parse_dates=parse_dates)

In [None]:
# Remove the `wday` column from the calendar table in place.
del calendar["wday"]

In [None]:
# Derive calendar features with project helpers. Their return values are not
# used, so they presumably add or modify columns of `calendar` in place —
# confirm against the helper implementations.
create_calendar_features(calendar, "date")
create_event_name(calendar)
create_event_type(calendar)

In [None]:
# Return value is unused; presumably shrinks the dtypes of `calendar` in
# place — confirm against the helper implementation.
reduce_memory_usage(calendar)

## sales_train_validation

In [None]:
# Load the sales table, using the same `dtype` mapping that is passed to the
# other `read_csv` calls in this notebook.
sales_train_validation = pd.read_csv(sales_train_validation_path, dtype=dtype)

In [None]:
# Append empty (NaN) day columns `d_<n>` for the 2 * test_days days that follow
# the last training day, so those days also get rows after the melt below.
first_future_day = train_days + 1
last_future_day = train_days + 2 * test_days
for i in range(first_future_day, last_future_day + 1):
    sales_train_validation[f"d_{i}"] = np.nan

In [None]:
# Return value is unused; presumably shrinks the dtypes of
# `sales_train_validation` in place — confirm against the helper implementation.
reduce_memory_usage(sales_train_validation)

## sell_prices

In [None]:
# Load the price table, using the same `dtype` mapping that is passed to the
# other `read_csv` calls in this notebook.
sell_prices = pd.read_csv(sell_prices_path, dtype=dtype)

In [None]:
# Build features from the `sell_price` column with project helpers. Return
# values are unused, so the helpers presumably add columns to `sell_prices` in
# place — confirm. `periods` is not defined in this notebook; presumably it
# comes from the `package.*` star imports.
create_aggregated_features(sell_prices, ["sell_price"])
create_pct_change_features(sell_prices, ["sell_price"], periods)

In [None]:
# Return value is unused; presumably shrinks the dtypes of `sell_prices` in
# place — confirm against the helper implementation.
reduce_memory_usage(sell_prices)

## interim

In [None]:
# Reshape the wide sales table to long format: one row per identifier
# combination and day label, with the day label in `d` and the former cell
# value in the column named by `target`.
series_id_columns = ["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"]
interim = pd.melt(
    sales_train_validation,
    id_vars=series_id_columns,
    var_name="d",
    value_name=target,
)

In [None]:
# Release the reference to the wide frame; nothing below this cell uses it
# once it has been melted into `interim`.
del sales_train_validation

In [None]:
# Attach the calendar columns to every row via the day label `d`.
# `validate="many_to_one"` makes pandas raise MergeError if `d` is not unique in
# `calendar`, instead of letting the left join silently duplicate rows.
interim = interim.merge(
    calendar, copy=False, how="left", on="d", validate="many_to_one"
)

In [None]:
# Release the reference to the calendar frame; nothing below this cell uses it
# after the merge into `interim`.
del calendar

In [None]:
# Attach the price columns, matched on store, item and week (`wm_yr_wk`).
# `validate="many_to_one"` makes pandas raise MergeError if that key is not
# unique in `sell_prices`, instead of letting the left join silently duplicate
# rows.
interim = interim.merge(
    sell_prices,
    copy=False,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    validate="many_to_one",
)

In [None]:
# Release the reference to the price frame; nothing below this cell uses it
# after the merge into `interim`.
del sell_prices

In [None]:
# Replace the index with a fresh default integer index, in place; the previous
# index is discarded rather than kept as a column (drop=True).
interim.reset_index(drop=True, inplace=True)

In [None]:
# `wm_yr_wk` was the join key for the price merge; remove the column in place.
del interim["wm_yr_wk"]

In [None]:
# Both helpers are called for their side effects only (return values unused);
# presumably they add columns to `interim` in place — confirm.
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/144842
create_is_holiday(interim)
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/150955
create_snap(interim)

In [None]:
# Replace each categorical identifier column with its label-encoded form.
# The per-row `id` column is not part of this list and is left untouched.
columns_to_encode = ("store_id", "item_id", "dept_id", "cat_id", "state_id")
for column in columns_to_encode:
    interim[column] = label_encode(interim[column])

In [None]:
# Return value is unused; presumably shrinks the dtypes of `interim` in place
# before it is written out — confirm against the helper implementation.
reduce_memory_usage(interim)

In [None]:
# Print a summary of the final frame (columns, dtypes, memory usage) as a
# sanity check before saving.
interim.info()

In [None]:
# Persist the merged frame. Create the target directory first: on a fresh
# checkout data/interim may not exist yet, and the write fails if its parent
# directory is missing.
interim_path.parent.mkdir(parents=True, exist_ok=True)
interim.to_parquet(interim_path)