In [None]:
import pathlib
import sys

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path to the project root (resolved against the current working
# directory); the data and src paths in the next cell are all derived from it.
root_dir = "../.."

In [None]:
# Project layout, all derived from `root_dir`.
root_dir_path = pathlib.Path(root_dir)
src_dir_path = root_dir_path.joinpath("src")
data_dir_path = root_dir_path.joinpath("data")

# Raw input CSV files.
raw_dir_path = data_dir_path.joinpath("raw")
calendar_path = raw_dir_path.joinpath("calendar.csv")
sales_train_validation_path = raw_dir_path.joinpath("sales_train_validation.csv")
sell_prices_path = raw_dir_path.joinpath("sell_prices.csv")

# Location of the merged table written at the end of the notebook.
interim_dir_path = data_dir_path.joinpath("interim")
interim_path = interim_dir_path.joinpath("interim.parquet")

In [None]:
# Make the project's `src` directory importable.
# The path is resolved to an absolute one so the entry stays valid even if the
# working directory changes later, and it is only added when missing so that
# re-running this cell does not accumulate duplicate `sys.path` entries.
src_dir = str(src_dir_path.resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
from package.feature_extraction import *
from package.utils import *

## calendar

In [None]:
# Load the calendar table; the "date" column is parsed into datetimes.
calendar = pd.read_csv(
    calendar_path,
    parse_dates=["date"],
)

In [None]:
# Project helper (not defined in this notebook; it comes from the wildcard imports).
# Its return value is not captured here, so it presumably adds the calendar
# features to `calendar` in place — TODO confirm against the helper's definition.
create_calendar_features(calendar)

In [None]:
# Remove the "wday" column; `calendar` is modified in place.
calendar.drop(labels="wday", axis="columns", inplace=True)

In [None]:
# Replace each event column with integer category codes.
# With sort=True the categories are numbered in sorted order; pandas assigns
# the code -1 to missing values.
event_columns = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
for col in event_columns:
    calendar[col] = pd.factorize(calendar[col], sort=True)[0]

In [None]:
# Project helper (not defined in this notebook). The result is not assigned, so
# this relies on the helper changing `calendar` in place — TODO confirm.
reduce_memory_usage(calendar)

## sales_train_validation

In [None]:
# Load the sales table from the raw CSV (one column per day label, d_1, d_2, ...
# is assumed by the later cells that append further d_<n> columns).
sales_train_validation = pd.read_csv(filepath_or_buffer=sales_train_validation_path)

In [None]:
# Replace the identifier columns with integer category codes (numbered in sorted
# order of the original values). The "id" column is not in the list and is kept as is.
sales_id_columns = ["store_id", "item_id", "dept_id", "cat_id", "state_id"]
for col in sales_id_columns:
    sales_train_validation[col] = pd.factorize(
        sales_train_validation[col], sort=True
    )[0]

In [None]:
# Day counts used by the next cell to append placeholder columns
# d_{train_days + 1} ... d_{train_days + validation_days + evaluation_days}.
train_days, validation_days, evaluation_days = 1913, 28, 28

In [None]:
# Append one all-NaN column per day that follows the training range, i.e.
# d_{train_days + 1} up to d_{train_days + validation_days + evaluation_days}.
n_future_days = validation_days + evaluation_days
for offset in range(1, n_future_days + 1):
    sales_train_validation[f"d_{train_days + offset}"] = np.nan

In [None]:
# Project helper (not defined in this notebook). The result is not assigned, so
# this relies on the helper changing `sales_train_validation` in place — TODO confirm.
reduce_memory_usage(sales_train_validation)

## sell_prices

In [None]:
# Load the price table from the raw CSV.
sell_prices = pd.read_csv(filepath_or_buffer=sell_prices_path)

In [None]:
# Project helper (not defined in this notebook; it comes from the wildcard imports).
# Called before the id columns are integer-encoded. Its return value is not
# captured, so it presumably adds its features to `sell_prices` in place — TODO confirm.
create_aggregated_features(sell_prices)

In [None]:
# Integer-encode the two identifier columns of the price table.
# NOTE(review): these codes are computed independently of the ones assigned to
# `sales_train_validation`, yet both tables are later merged on store_id/item_id.
# The codes only line up if both tables contain exactly the same set of ids — confirm.
price_key_columns = ["store_id", "item_id"]
for col in price_key_columns:
    sell_prices[col] = pd.factorize(sell_prices[col], sort=True)[0]

In [None]:
# Project helper (not defined in this notebook). The result is not assigned, so
# this relies on the helper changing `sell_prices` in place — TODO confirm.
reduce_memory_usage(sell_prices)

## interim

In [None]:
# Reshape the sales table from wide to long: the identifier columns are kept,
# every other column becomes one row whose column label goes into "d" and whose
# value goes into "demand".
id_columns = ["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"]
interim = pd.melt(
    sales_train_validation,
    id_vars=id_columns,
    var_name="d",
    value_name="demand",
)

In [None]:
# Attach the calendar columns (joined on the day label "d") and then the
# `sell_prices` columns (joined on store, item and "wm_yr_wk"; the latter is not
# produced by the melt, so it is expected to come from the calendar merge).
# Both are left joins, so every row of `interim` is kept even without a match.
# validate="many_to_one" makes pandas raise a MergeError if a join key is
# duplicated on the right-hand side, instead of silently multiplying rows.
interim = interim.merge(
    calendar, copy=False, how="left", on="d", validate="many_to_one"
)
interim = interim.merge(
    sell_prices,
    copy=False,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    validate="many_to_one",
)

In [None]:
# "wm_yr_wk" was only needed as a join key for the price merge; remove it in place.
interim.drop(labels="wm_yr_wk", axis="columns", inplace=True)

In [None]:
# Give `interim` a fresh 0..n-1 index in place, discarding the current one
# (drop=True means the old index is not kept as a column).
interim.reset_index(inplace=True, drop=True)

In [None]:
# Project helper (not defined in this notebook). The result is not assigned, so
# this relies on the helper changing `interim` in place — TODO confirm.
reduce_memory_usage(interim)

In [None]:
# Print pandas' summary of the final frame (columns, dtypes, memory usage)
# as a last sanity check before it is written to disk.
interim.info()

In [None]:
# Persist the merged table as Parquet.
# The parent directory is created first (no error if it already exists) because
# writing to a path inside a missing directory fails, e.g. on a fresh checkout
# where the interim data folder has not been created yet.
interim_path.parent.mkdir(parents=True, exist_ok=True)
interim.to_parquet(interim_path)