In [1]:
import gc
import pathlib

In [2]:
import numpy as np
import pandas as pd

In [3]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller numeric dtypes.

    Each column is processed independently:

    * datetime and timedelta columns (any unit, tz-naive or tz-aware) are
      skipped, because ``pd.to_numeric`` would otherwise replace them with
      their raw integer representation;
    * columns that ``pd.to_numeric`` cannot convert (it raises ``ValueError``
      or ``TypeError``) are left untouched;
    * every other column is replaced by
      ``pd.to_numeric(column, downcast="integer")``;
    * a column that is still ``float16``/``float32``/``float64`` afterwards is
      cast to ``float32`` when its minimum and maximum lie strictly inside
      the ``float32`` range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reassigned in place.

    Returns
    -------
    None
        The frame is modified in place; nothing is returned.
    """
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Comparing against the literal "datetime64[ns]" misses tz-aware,
        # non-nanosecond and timedelta dtypes, so test the dtype family.
        if pd.api.types.is_datetime64_any_dtype(
            col_type
        ) or pd.api.types.is_timedelta64_dtype(col_type):
            continue

        try:
            df[col] = pd.to_numeric(df[col], downcast="integer")
        except (ValueError, TypeError):
            # Not convertible to numbers (e.g. free-text or non-scalar
            # objects): keep the column exactly as it is.
            continue

        col_type = df[col].dtype

        if col_type in ["float16", "float32", "float64"]:
            c_min = df[col].min()
            c_max = df[col].max()

            # NaN bounds (all-missing column) fail both comparisons, so such
            # a column keeps its current dtype.
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype("float32")

In [4]:
# Directory layout, expressed relative to the notebook's working directory.
root_dir_path = pathlib.Path("..")
data_dir_path = root_dir_path.joinpath("data")
processed_dir_path = data_dir_path.joinpath("processed")

# Parquet files read by this notebook.
train_path = processed_dir_path.joinpath("train.parquet")
test_path = processed_dir_path.joinpath("test.parquet")

# Parquet files written by this notebook.
train_engineered_path = processed_dir_path.joinpath("train_engineered.parquet")
test_engineered_path = processed_dir_path.joinpath("test_engineered.parquet")

In [5]:
# Names read off the ``.dt`` accessor of the "date" column when the calendar
# features are created.
calendar_features = (
    ["year", "dayofyear", "weekofyear", "quarter", "month", "day", "weekday"]
    + ["is_year_start", "is_year_end", "is_month_start", "is_month_end"]
)

# One "demand_shift_<n>" column per lag in the range.
lag_features = ["demand_shift_{}".format(i) for i in range(28, 29)]

categorical_features = [
    # identifiers
    "store_id",
    "item_id",
    "dept_id",
    "cat_id",
    "state_id",
    # events
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
]

numerical_features = [
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "sell_price",
    *calendar_features,
    *lag_features,
]

# Full feature list: categorical columns first, then numerical ones.
features = [*categorical_features, *numerical_features]

In [6]:
# Load the train and test splits from their parquet files.
train, test = (pd.read_parquet(path) for path in (train_path, test_path))

In [7]:
# Remember how many rows belong to the training split so the combined frame
# can be cut apart again later.
train_size = train.shape[0]

# Stack the test rows below the training rows. The original row labels are
# kept (no ignore_index), so labels repeat across the two parts.
train = pd.concat([train, test], axis=0)

In [8]:
# Downcast the combined train+test frame in place to smaller numeric dtypes.
reduce_memory_usage(train)

In [9]:
# The test rows were concatenated into `train`, so drop the separate name.
del test

In [10]:
# Run a garbage-collection pass after unbinding `test`.
gc.collect()

0

In [11]:
# Create calendar features
# Every name in `calendar_features` is read off the `.dt` accessor of "date".
for col in calendar_features:
    if col == "weekofyear" and hasattr(train["date"].dt, "isocalendar"):
        # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
        # `isocalendar().week` is the replacement; it returns a nullable
        # UInt32 column, so cast it to a plain NumPy dtype (float64 only when
        # missing dates would produce NA, int64 otherwise).
        train[col] = (
            train["date"]
            .dt.isocalendar()
            .week.astype("float64" if train["date"].hasnans else "int64")
        )
    else:
        train[col] = getattr(train["date"].dt, col)

In [12]:
# Downcast again so the calendar columns just added get compact dtypes too.
reduce_memory_usage(train)

In [13]:
# Create lag features
# Rows are grouped per (store, item) pair so each shift stays inside one group.
# NOTE(review): the shift is row-based, so this presumably relies on rows being
# in date order within each group - confirm against the upstream preprocessing.
grouped = train.groupby(by=["store_id", "item_id"])

for lag in range(28, 29):
    lag_column = f"demand_shift_{lag}"
    train[lag_column] = grouped["demand"].shift(periods=lag)

In [14]:
# The groupby object is only needed for the lag columns; drop the name.
del grouped

In [15]:
# Run a garbage-collection pass after unbinding `grouped`.
gc.collect()

0

In [16]:
# Undo the earlier concatenation: the first `train_size` rows are the training
# split and everything after them is the test split. `test` is sliced first
# because `train` is rebound on the following line.
test = train.iloc[train_size:]
train = train.iloc[:train_size]

In [17]:
# Downcast once more now that `train` holds only the training rows.
# NOTE(review): `train` is an `.iloc` slice at this point, so the in-place
# column assignments may emit SettingWithCopyWarning on pandas versions
# without copy-on-write - confirm.
reduce_memory_usage(train)

In [18]:
# Remove the "demand" column from the test split. `test` is an `.iloc` slice
# of the combined frame, and mutating a slice in place can trigger
# SettingWithCopyWarning, so rebind the name to the new frame instead.
test = test.drop(columns="demand")

In [19]:
# Print the column/dtype summary of the engineered training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 29 columns):
 #   Column           Dtype         
---  ------           -----         
 0   id               object        
 1   item_id          int16         
 2   store_id         int8          
 3   dept_id          int8          
 4   cat_id           int8          
 5   state_id         int8          
 6   d                object        
 7   demand           int16         
 8   date             datetime64[ns]
 9   event_name_1     int8          
 10  event_type_1     int8          
 11  event_name_2     int8          
 12  event_type_2     int8          
 13  snap_CA          int8          
 14  snap_TX          int8          
 15  snap_WI          int8          
 16  sell_price       float32       
 17  year             int16         
 18  dayofyear        int16         
 19  weekofyear       int8          
 20  quarter          int8          
 21  month            int8        

In [20]:
# Print the column/dtype summary of the engineered test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1707440 entries, 0 to 1707439
Data columns (total 28 columns):
 #   Column           Dtype         
---  ------           -----         
 0   id               object        
 1   item_id          int16         
 2   store_id         int8          
 3   dept_id          int8          
 4   cat_id           int8          
 5   state_id         int8          
 6   d                object        
 7   date             datetime64[ns]
 8   event_name_1     int8          
 9   event_type_1     int8          
 10  event_name_2     int8          
 11  event_type_2     int8          
 12  snap_CA          int8          
 13  snap_TX          int8          
 14  snap_WI          int8          
 15  sell_price       float32       
 16  year             int16         
 17  dayofyear        int16         
 18  weekofyear       int8          
 19  quarter          int8          
 20  month            int8          
 21  day              int8          

In [21]:
# Persist both engineered splits as parquet, training split first.
for frame, destination in (
    (train, train_engineered_path),
    (test, test_engineered_path),
):
    frame.to_parquet(destination)