In [1]:
import pathlib

In [2]:
import numpy as np
import pandas as pd

In [3]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (int16/int32/int64) are converted to the smallest of
    int8/int16/int32 whose inclusive range contains every value in the
    column. Float64 columns are converted to float32 when every value lies
    within the float32 range. float16 is deliberately never used as a
    target (the original implementation had that cast disabled).

    A column is never widened: if no strictly narrower dtype fits, it is
    left untouched. Columns of any other dtype are ignored.

    Note that float64 -> float32 keeps the magnitude but reduces precision
    to roughly 7 significant decimal digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the frame's memory usage before and after.

    Returns
    -------
    None
    """
    if verbose:
        start_bytes = df.memory_usage().sum()

    float32_info = np.finfo(np.float32)
    float32_size = np.dtype(np.float32).itemsize

    for col in df.columns:
        col_type = df[col].dtype

        if col_type in ("int16", "int32", "int64"):
            c_min = df[col].min()
            c_max = df[col].max()

            # Try targets from narrowest to widest and stop at the first fit.
            for candidate in (np.int8, np.int16, np.int32):
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    # Remaining candidates would not save memory.
                    break
                bounds = np.iinfo(candidate)
                # Inclusive bounds: the extremes of a dtype are representable.
                # For an empty column min/max are NaN, every comparison is
                # False, and the column is left unchanged.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif col_type in ("float16", "float32", "float64"):
            if col_type.itemsize <= float32_size:
                # Already float32 or narrower; casting could only widen it.
                continue

            c_min = df[col].min()
            c_max = df[col].max()

            # min()/max() skip NaN. An all-NaN column yields NaN here and a
            # column containing +/-inf yields inf; both fail the test below
            # and therefore stay float64.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    if verbose:
        end_bytes = df.memory_usage().sum()
        # Guard the percentage against a zero-byte starting size.
        saved_pct = 100 * (start_bytes - end_bytes) / start_bytes if start_bytes else 0.0
        print(
            f"Memory usage: {start_bytes / 1024 ** 2:.2f} MB -> "
            f"{end_bytes / 1024 ** 2:.2f} MB ({saved_pct:.1f}% reduction)"
        )

In [4]:
# Directory layout, relative to the notebook's working directory.
root_dir_path = pathlib.Path("..")
data_dir_path = root_dir_path.joinpath("data")
processed_dir_path = data_dir_path.joinpath("processed")

# Inputs read by this notebook.
train_path = processed_dir_path.joinpath("train.parquet")
test_path = processed_dir_path.joinpath("test.parquet")

# Outputs written by this notebook.
train_engineered_path = processed_dir_path.joinpath("train_engineered.parquet")
test_engineered_path = processed_dir_path.joinpath("test_engineered.parquet")

In [5]:
# Load both processed splits, train first and then test.
train, test = (pd.read_parquet(path) for path in (train_path, test_path))

In [6]:
# Add calendar features read from the `.dt` accessor of the "date" column
# (assumes "date" is a datetime-like column in both frames; confirm upstream).
# Other candidates, currently disabled: year, dayofyear, weekofyear, quarter, day.
for attr in ("month", "weekday"):
    for frame in (train, test):
        frame[attr] = getattr(frame["date"].dt, attr)

In [7]:
# Shrink numeric dtypes of both splits in place before they are written out.
for frame in (train, test):
    reduce_memory_usage(frame)

In [8]:
# Persist the engineered splits next to the processed inputs.
for frame, path in (
    (train, train_engineered_path),
    (test, test_engineered_path),
):
    frame.to_parquet(path)