In [1]:
import pathlib
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Directory layout, expressed relative to the notebook's working directory.
root_dir_path = pathlib.Path("..")
data_dir_path = root_dir_path.joinpath("data")
src_dir_path = root_dir_path.joinpath("src")

# Raw CSV inputs read by the cells below.
raw_dir_path = data_dir_path.joinpath("raw")
calendar_path = raw_dir_path.joinpath("calendar.csv")
sales_train_validation_path = raw_dir_path.joinpath("sales_train_validation.csv")
sell_prices_path = raw_dir_path.joinpath("sell_prices.csv")

# Destination of the merged table written at the end of the notebook.
interim_dir_path = data_dir_path.joinpath("interim")
interim_path = interim_dir_path.joinpath("interim.parquet")

In [4]:
# Put the project's src directory on the import path so the `package` import
# in the next cell can be found (presumably that is where it lives).
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if str(src_dir_path) not in sys.path:
    sys.path.append(str(src_dir_path))

In [5]:
from package.utils import *

## calendar

In [6]:
# Read the calendar table, converting the "date" column to datetimes on load.
calendar = pd.read_csv(
    filepath_or_buffer=calendar_path,
    parse_dates=["date"],
)

In [7]:
# Remove the columns that are not carried into the merged table; the frame is
# modified in place.
calendar.drop(["weekday", "wday", "month", "year"], axis="columns", inplace=True)

In [8]:
# Encode categorical features.
# Each listed column is replaced by integer codes numbered in the sorted order
# of its distinct values; pandas.factorize assigns -1 to missing values.
# Only the codes (element 0 of the returned pair) are kept, so no throwaway
# `_` is bound at notebook level, where IPython uses `_` for the last output.
for col in ("event_name_1", "event_type_1", "event_name_2", "event_type_2"):
    calendar[col] = pd.factorize(calendar[col], sort=True)[0]

In [9]:
# Shrink the frame's memory footprint with a project helper (presumably the
# one exposed by the star import from package.utils).
# NOTE(review): the return value is not assigned, so the helper presumably
# modifies `calendar` in place — confirm against its implementation.
reduce_memory_usage(calendar)

## sales_train_validation

In [10]:
# Read the wide sales table (later cells treat its d_* columns as one column
# per day).
sales_train_validation = pd.read_csv(
    filepath_or_buffer=sales_train_validation_path,
)

In [11]:
# Encode categorical features.
# Each listed column is replaced by integer codes numbered in the sorted order
# of its distinct values; pandas.factorize assigns -1 to missing values.
# Only the codes (element 0 of the returned pair) are kept, so no throwaway
# `_` is bound at notebook level, where IPython uses `_` for the last output.
for col in ("store_id", "item_id", "dept_id", "cat_id", "state_id"):
    sales_train_validation[col] = pd.factorize(
        sales_train_validation[col], sort=True
    )[0]

In [12]:
# Day counts used by the next cell: train_days fixes where the appended d_*
# columns start, the other two determine how many columns are appended.
train_days, validation_days, evaluation_days = 1913, 28, 28

In [13]:
# Append one all-NaN column per day after the training period, i.e.
# d_{train_days + 1} through d_{train_days + validation_days + evaluation_days},
# so that the later melt produces rows for those days as well.
for offset in range(1, validation_days + evaluation_days + 1):
    sales_train_validation[f"d_{train_days + offset}"] = np.nan

In [14]:
# Shrink the frame's memory footprint with a project helper (presumably the
# one exposed by the star import from package.utils).
# NOTE(review): the return value is not assigned, so the helper presumably
# modifies `sales_train_validation` in place — confirm against its
# implementation.
reduce_memory_usage(sales_train_validation)

## sell_prices

In [15]:
# Read the price table; it is later joined on store_id, item_id and wm_yr_wk.
sell_prices = pd.read_csv(
    filepath_or_buffer=sell_prices_path,
)

In [16]:
# Encode categorical features.
# Each listed column is replaced by integer codes numbered in the sorted order
# of its distinct values; pandas.factorize assigns -1 to missing values.
# Only the codes (element 0 of the returned pair) are kept, so no throwaway
# `_` is bound at notebook level, where IPython uses `_` for the last output.
# NOTE(review): these codes are computed independently of the ones assigned to
# the same columns of sales_train_validation, yet the two tables are later
# merged on store_id/item_id. The codes line up only if both tables contain
# exactly the same set of distinct values in these columns — confirm.
for col in ("store_id", "item_id"):
    sell_prices[col] = pd.factorize(sell_prices[col], sort=True)[0]

In [17]:
# Shrink the frame's memory footprint with a project helper (presumably the
# one exposed by the star import from package.utils).
# NOTE(review): the return value is not assigned, so the helper presumably
# modifies `sell_prices` in place — confirm against its implementation.
reduce_memory_usage(sell_prices)

## interim

In [18]:
# Reshape from wide to long: every column that is not listed as an identifier
# becomes its own row, with the former column name stored in "d" and the cell
# value stored in "demand".
interim = pd.melt(
    sales_train_validation,
    id_vars=["id", "item_id", "store_id", "dept_id", "cat_id", "state_id"],
    var_name="d",
    value_name="demand",
)

In [19]:
# Attach the calendar columns for each "d" label, then the price columns for
# each (store_id, item_id, wm_yr_wk) combination. Both joins are left joins, so
# every melted row is kept even when the right-hand table has no match.
# copy=False asks pandas to avoid copying data where it can.
# validate="many_to_one" makes pandas raise MergeError when the join keys are
# not unique in the right-hand table, instead of silently duplicating rows.
# The merges stay as two statements so the previous frame can be released as
# soon as `interim` is rebound.
interim = interim.merge(
    calendar, copy=False, how="left", on="d", validate="many_to_one"
)
interim = interim.merge(
    sell_prices,
    copy=False,
    how="left",
    on=["store_id", "item_id", "wm_yr_wk"],
    validate="many_to_one",
)

In [20]:
# The week key was only needed to join the prices; remove it in place.
interim.drop("wm_yr_wk", axis="columns", inplace=True)

In [21]:
# Replace the index with a fresh 0..n-1 range in place, discarding the old one.
# NOTE(review): the preceding merges join on columns, so the index is
# presumably already a default range and this call looks redundant — confirm
# before removing.
interim.reset_index(drop=True, inplace=True)

In [22]:
# Shrink the merged frame's memory footprint with a project helper (presumably
# the one exposed by the star import from package.utils).
# NOTE(review): the return value is not assigned, so the helper presumably
# modifies `interim` in place — confirm against its implementation.
reduce_memory_usage(interim)

In [23]:
# Print a summary of the merged frame: index range, column dtypes and memory
# usage.
interim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60034810 entries, 0 to 60034809
Data columns (total 17 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   store_id      int8          
 3   dept_id       int8          
 4   cat_id        int8          
 5   state_id      int8          
 6   d             object        
 7   demand        float32       
 8   date          datetime64[ns]
 9   event_name_1  int8          
 10  event_type_1  int8          
 11  event_name_2  int8          
 12  event_type_2  int8          
 13  snap_CA       int8          
 14  snap_TX       int8          
 15  snap_WI       int8          
 16  sell_price    float32       
dtypes: datetime64[ns](1), float32(2), int16(1), int8(11), object(2)
memory usage: 2.5+ GB


In [24]:
# Persist the merged table as Parquet.
# The output directory is created first (including missing parents) so the
# write does not fail on a fresh checkout where data/interim does not exist
# yet; exist_ok=True makes this a no-op when it is already there.
interim_dir_path.mkdir(parents=True, exist_ok=True)
interim.to_parquet(interim_path)