In [1]:
import gc
import pathlib
import sys

In [2]:
import numpy as np
import pandas as pd
from workalendar.usa import California
from workalendar.usa import Texas
from workalendar.usa import Wisconsin

In [3]:
# Repository layout, expressed relative to the notebook's working directory.
root_dir_path = pathlib.Path("..")
src_dir_path = root_dir_path.joinpath("src")
data_dir_path = root_dir_path.joinpath("data")
processed_dir_path = data_dir_path.joinpath("processed")

# Processed splits that this notebook reads.
train_path = processed_dir_path.joinpath("train.parquet")
test_path = processed_dir_path.joinpath("test.parquet")

# Destinations for the feature-engineered splits written at the end.
train_engineered_path = processed_dir_path.joinpath("train_engineered.parquet")
test_engineered_path = processed_dir_path.joinpath("test_engineered.parquet")

In [4]:
# Make the project's ``src`` directory importable for the local-module imports
# in the next cell. The membership check keeps this cell idempotent: re-running
# it does not append a duplicate entry to ``sys.path``.
if str(src_dir_path) not in sys.path:
    sys.path.append(str(src_dir_path))

In [5]:
from constants import *
from utils import *

In [6]:
# Load the processed train and test splits (in that order) from parquet.
train, test = (pd.read_parquet(path) for path in (train_path, test_path))

In [7]:
# Record how many rows belong to the training split so the combined frame can
# later be cut back into train/test purely by position.
train_size = train.shape[0]

# Stack the test rows beneath the training rows; from here on ``train`` holds
# both splits.
train = pd.concat((train, test))

In [8]:
# Shrink the combined frame through the project helper (one of the star
# imports above). The return value is not captured — presumably the helper
# downcasts column dtypes in place; TODO confirm against its definition.
reduce_memory_usage(train)

In [9]:
# The test rows were concatenated into ``train`` above, so the standalone
# reference is no longer needed.
del test

In [10]:
# Run a full garbage-collection pass after dropping the reference above; the
# displayed integer is the number of unreachable objects the collector found.
gc.collect()

40

In [11]:
# Create calendar features
# Every name in ``calendar_features`` is read off the ``.dt`` accessor of the
# ``date`` column and stored under a column of the same name.
date_accessor = train["date"].dt
for col in calendar_features:
    try:
        values = getattr(date_accessor, col)
    except AttributeError:
        if col not in ("week", "weekofyear"):
            raise
        # pandas 2.0 removed ``.dt.week`` / ``.dt.weekofyear``; the documented
        # replacement is ``isocalendar().week``. That returns a nullable
        # UInt32 column, so cast back to the int64 the old attribute produced.
        # (Assumes ``date`` has no missing values — the cast raises on NaT.)
        values = date_accessor.isocalendar().week.astype("int64")
    train[col] = values

# Calendars for California, Texas and Wisconsin.
cals = [
    California(),
    Texas(),
    Wisconsin(),
]

# Evaluate the holiday lookups once per distinct date rather than once per
# row, then join the flags back onto the full frame.
intermediate = pd.DataFrame(train["date"].unique(), columns=["date"])

for cal in cals:
    # Column name is derived from the calendar class, e.g. "is_texas_holiday".
    holiday_col = f"is_{cal.__class__.__name__.lower()}_holiday"
    intermediate[holiday_col] = intermediate["date"].apply(cal.is_holiday)

# Left join keeps every row of ``train`` and attaches the per-date flags.
train = train.merge(intermediate, how="left", on="date")

In [12]:
# Re-apply the project's memory-reduction helper now that the calendar and
# holiday columns have been added (the merge above produced a new frame).
# Presumably operates in place, since the result is not captured — TODO confirm.
reduce_memory_usage(train)

In [13]:
# Group rows per (store_id, item_id) pair; reused by the lag and rolling
# feature cells below.
# NOTE(review): later cells select columns such as "demand_shift_1" that are
# added to ``train`` after this object is created. That appears to rely on the
# groupby referencing the same frame object, which is then mutated in place —
# avoid rebinding ``train`` while ``grouped`` is still in use.
grouped = train.groupby(["store_id", "item_id"])

In [14]:
# Lag features: demand and sell price shifted by 1 and by 28 rows within each
# (store_id, item_id) group.
# NOTE(review): the shift is positional, so this presumably expects rows to be
# in date order inside every group — confirm against the upstream processing.
for lag in (1, 28):
    demand_by_series = grouped["demand"]
    price_by_series = grouped["sell_price"]
    train[f"demand_shift_{lag}"] = demand_by_series.shift(lag)
    train[f"sell_price_shift_{lag}"] = price_by_series.shift(lag)

In [15]:
# Ratio of each row's sell price to its 1-row-lagged sell price; the result is
# NaN wherever either operand is missing.
train["sell_price_day_over_day"] = train["sell_price"].div(train["sell_price_shift_1"])

In [16]:
def _trailing_mean(values, window):
    """Return the mean over the trailing ``window`` rows of ``values``.

    ``min_periods=1`` means shorter, partially filled windows still produce a
    value instead of NaN.
    """
    return values.rolling(window, min_periods=1).mean()


# Aggregated features: trailing mean of the 1-row-lagged demand, computed
# separately for every (store_id, item_id) group.
for window in (28,):
    lagged_demand = grouped["demand_shift_1"]
    train[f"demand_shift_1_rolling_{window}_mean"] = lagged_demand.transform(_trailing_mean, window=window)

In [17]:
# Apply the project's memory-reduction helper again after adding the lag,
# ratio and rolling columns. Presumably operates in place (result is not
# captured) — TODO confirm.
reduce_memory_usage(train)

In [18]:
# All group-based features have been computed; drop the groupby reference.
del grouped

In [19]:
# Run a full garbage-collection pass after releasing the groupby object; the
# displayed integer is the number of unreachable objects the collector found.
gc.collect()

60

In [20]:
# Missing-value indicator: True on rows where ``sell_price`` is absent.
train["sell_price_isnull"] = train["sell_price"].isna()

In [21]:
# Undo the earlier concatenation by position: the first ``train_size`` rows
# are the training split and everything after them is the test split. The
# tail is taken first because ``train`` is rebound on the following line.
test = train.iloc[train_size:]
train = train.iloc[:train_size]

In [22]:
# Apply the memory-reduction helper to the training split only.
# NOTE(review): at this point ``train`` is an ``iloc`` slice of the combined
# frame, so if the helper assigns columns in place it may trigger pandas'
# SettingWithCopyWarning — confirm against the helper's implementation.
reduce_memory_usage(train)

In [23]:
# Remove the target column from the test split. ``test`` is an ``iloc`` slice
# of the combined frame, so rebind to the frame returned by ``drop`` rather
# than mutating the slice with ``inplace=True``, which can emit
# SettingWithCopyWarning.
test = test.drop(columns="demand")

In [24]:
# Print a schema summary of the engineered training split (index range, column
# names and dtypes) as a sanity check before saving.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 38 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   id                              object        
 1   item_id                         int16         
 2   store_id                        int8          
 3   dept_id                         int8          
 4   cat_id                          int8          
 5   state_id                        int8          
 6   d                               object        
 7   demand                          int16         
 8   date                            datetime64[ns]
 9   event_name_1                    int8          
 10  event_type_1                    int8          
 11  event_name_2                    int8          
 12  event_type_2                    int8          
 13  snap_CA                         int8          
 14  snap_TX                         int8          
 

In [25]:
# Per-column count of missing values in the engineered training split.
train.isna().sum()

id                                       0
item_id                                  0
store_id                                 0
dept_id                                  0
cat_id                                   0
state_id                                 0
d                                        0
demand                                   0
date                                     0
event_name_1                             0
event_type_1                             0
event_name_2                             0
event_type_2                             0
snap_CA                                  0
snap_TX                                  0
snap_WI                                  0
sell_price                        12299413
year                                     0
dayofyear                                0
weekofyear                               0
quarter                                  0
month                                    0
day                                      0
weekday    

In [26]:
# Print a schema summary of the engineered test split (index range, column
# names and dtypes); it should match the training split minus ``demand``.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1707440 entries, 58327370 to 60034809
Data columns (total 37 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   id                              object        
 1   item_id                         int16         
 2   store_id                        int8          
 3   dept_id                         int8          
 4   cat_id                          int8          
 5   state_id                        int8          
 6   d                               object        
 7   date                            datetime64[ns]
 8   event_name_1                    int8          
 9   event_type_1                    int8          
 10  event_name_2                    int8          
 11  event_type_2                    int8          
 12  snap_CA                         int8          
 13  snap_TX                         int8          
 14  snap_WI                         int8      

In [27]:
# Per-column count of missing values in the engineered test split.
test.isna().sum()

id                                      0
item_id                                 0
store_id                                0
dept_id                                 0
cat_id                                  0
state_id                                0
d                                       0
date                                    0
event_name_1                            0
event_type_1                            0
event_name_2                            0
event_type_2                            0
snap_CA                                 0
snap_TX                                 0
snap_WI                                 0
sell_price                              0
year                                    0
dayofyear                               0
weekofyear                              0
quarter                                 0
month                                   0
day                                     0
weekday                                 0
is_year_start                     

In [28]:
# Write both engineered splits to parquet in the processed-data directory,
# train first and then test.
for frame, destination in (
    (train, train_engineered_path),
    (test, test_engineered_path),
):
    frame.to_parquet(destination)