In [1]:
import gc
import pathlib

In [2]:
import numpy as np
import pandas as pd

In [3]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (``int16``/``int32``/``int64``) are cast to the smallest
    of ``int8``/``int16``/``int32`` whose range contains every value in the
    column. ``float64`` columns are cast to ``float32`` when all values lie
    within the finite ``float32`` range (this trades precision for memory).
    A column is never widened: ``float16``/``float32`` columns and integer
    columns that already use the smallest fitting dtype are left untouched.

    Columns whose min/max is NaN (empty or all-missing) or infinite are
    skipped, because the range comparisons evaluate to False for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the frame's memory usage before and after.

    Returns
    -------
    None
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024 ** 2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in ("int16", "int32", "int64"):
            c_min = df[col].min()
            c_max = df[col].max()

            for candidate in (np.int8, np.int16, np.int32):
                # Candidates are ordered by width; once one is as wide as the
                # current dtype there is nothing left to gain.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                bounds = np.iinfo(candidate)
                # Inclusive bounds: the limits themselves are representable.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif col_type == "float64":
            # Only float64 can shrink to float32; casting float16/float32
            # here would be a no-op at best and an up-cast at worst.
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)

            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024 ** 2
        # Guard against a zero-byte frame so the percentage never divides by 0.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print(
            f"Memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB "
            f"({reduction:.1f}% reduction)"
        )

In [4]:
# Directory layout, built relative to the notebook's working directory.
root_dir_path = pathlib.Path("..")
data_dir_path = root_dir_path.joinpath("data")
processed_dir_path = data_dir_path.joinpath("processed")

# Input frames read by this notebook.
train_path = processed_dir_path.joinpath("train.parquet")
test_path = processed_dir_path.joinpath("test.parquet")

# Output frames written at the end of this notebook.
train_engineered_path = processed_dir_path.joinpath("train_engineered.parquet")
test_engineered_path = processed_dir_path.joinpath("test_engineered.parquet")

In [5]:
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook; confirm they are still needed before relying on them.
n_lag_features = 3  # presumably the number of lagged-demand features to build
test_days = 28  # presumably the length of the test horizon in days

In [6]:
# Load both processed splits, training frame first.
train, test = map(pd.read_parquet, (train_path, test_path))

In [7]:
# Remember how many rows belong to the training split so the stacked frame
# can be cut back apart after feature engineering. `len()` is used instead of
# unpacking `train.shape` into `_`, because binding `_` at notebook top level
# overrides IPython's last-output variable.
train_size = len(train)

# Set the target aside before stacking so the combined frame carries only
# the columns being engineered (presumably `test` has no `demand` column).
demand = train.pop("demand")

# Stack test rows underneath the training rows; original index labels are kept.
train = pd.concat([train, test])

In [8]:
# Drop the standalone test frame; its rows now live inside the stacked `train`.
del test

In [9]:
# Run a garbage-collection pass right after the `test` name was deleted.
gc.collect()

0

In [10]:
# Create calendar features: one new column per datetime attribute of `date`.
calendar_attrs = [
    "year",
    "dayofyear",
    "weekofyear",
    "quarter",
    "month",
    "day",
    "weekday",
    "is_year_start",
    "is_year_end",
    "is_month_start",
    "is_month_end",
]
date_accessor = train["date"].dt

for attr in calendar_attrs:
    if attr == "weekofyear" and hasattr(date_accessor, "isocalendar"):
        # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in
        # 2.0; `isocalendar().week` is the replacement. It returns a nullable
        # UInt32 column, so cast back to a plain numpy dtype (float if any
        # date is missing, mirroring the legacy attribute) so that the later
        # `reduce_memory_usage` call can still downcast it.
        iso_week = date_accessor.isocalendar().week
        train[attr] = iso_week.astype(
            "float64" if iso_week.isna().any() else "int64"
        )
    else:
        train[attr] = getattr(date_accessor, attr)

In [11]:
# Downcast the numeric columns of the stacked frame in place.
reduce_memory_usage(train)

In [12]:
# Cut the stacked frame back into its two splits. The first `train_size` rows
# are the training rows because the test rows were concatenated after them.
# `test` is sliced first, while `train` still refers to the stacked frame.
test = train.iloc[train_size:]
train = train.iloc[:train_size]

# Re-attach the target that was popped before stacking.
train["demand"] = demand

In [13]:
# Print the column/dtype summary of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 28 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   item_id         int16         
 2   store_id        int8          
 3   dept_id         int8          
 4   cat_id          int8          
 5   state_id        int8          
 6   d               object        
 7   date            datetime64[ns]
 8   event_name_1    int8          
 9   event_type_1    int8          
 10  event_name_2    int8          
 11  event_type_2    int8          
 12  snap_CA         int8          
 13  snap_TX         int8          
 14  snap_WI         int8          
 15  sell_price      float32       
 16  year            int16         
 17  dayofyear       int16         
 18  weekofyear      int8          
 19  quarter         int8          
 20  month           int8          
 21  day             int8          
 22  weekday         

In [14]:
# Print the column/dtype summary of the test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1707440 entries, 0 to 1707439
Data columns (total 27 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   item_id         int16         
 2   store_id        int8          
 3   dept_id         int8          
 4   cat_id          int8          
 5   state_id        int8          
 6   d               object        
 7   date            datetime64[ns]
 8   event_name_1    int8          
 9   event_type_1    int8          
 10  event_name_2    int8          
 11  event_type_2    int8          
 12  snap_CA         int8          
 13  snap_TX         int8          
 14  snap_WI         int8          
 15  sell_price      float32       
 16  year            int16         
 17  dayofyear       int16         
 18  weekofyear      int8          
 19  quarter         int8          
 20  month           int8          
 21  day             int8          
 22  weekday         in

In [15]:
# Persist both engineered splits, training frame first.
for frame, out_path in (
    (train, train_engineered_path),
    (test, test_engineered_path),
):
    frame.to_parquet(out_path)