In [1]:
import sys
from pathlib import Path

# add project root
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))


In [2]:
import pandas as pd
from src.features.build_features import make_features


In [3]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv("../data/raw/test.csv")
test['sales'] = None

concat for proper time series features

In [4]:
full = pd.concat([train, test], ignore_index=True)

feature engineering

In [5]:
df_fe = make_features(full)

split back

In [6]:
train_fe = df_fe[df_fe['sales'].notnull()]
test_fe  = df_fe[df_fe['sales'].isnull()]

CEK MISSING VALUE

In [7]:
TARGET = "sales"

BASE_FEATURES = [
    'store','item','year','month','week','day',
    'dayofweek','is_weekend'
]

LAG_FEATURES = [
    'lag_7','lag_14','lag_28'
]

ROLLING_FEATURES = [
    'rolling_mean_7','rolling_mean_14','rolling_mean_28'
]

FEATURES = BASE_FEATURES + LAG_FEATURES + ROLLING_FEATURES


In [8]:
train_fe[FEATURES + [TARGET]].isna().sum()
test_fe[FEATURES + [TARGET]].isna().sum()

store                  0
item                   0
year                   0
month                  0
week                   0
day                    0
dayofweek              0
is_weekend             0
lag_7              41500
lag_14             38000
lag_28             31000
rolling_mean_7     44500
rolling_mean_14    44500
rolling_mean_28    44500
sales              45000
dtype: int64

Duplicates

In [9]:
train_fe.duplicated(
    subset=['date','store','item']
).sum()

test_fe.duplicated(
    subset=['date','store','item']
).sum()

np.int64(0)

Business sanity

In [10]:
train_fe[TARGET].describe()
test_fe[TARGET].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: sales, dtype: object

Lag & Rolling

In [11]:
train_fe[['lag_7','lag_14','lag_28']].describe()
test_fe[['lag_7','lag_14','lag_28']].describe()


Unnamed: 0,lag_7,lag_14,lag_28
count,3500,7000,14000
unique,117,119,123
top,21,22,22
freq,75,148,277


In [13]:
train_fe_clean = train_fe.dropna(
    subset=LAG_FEATURES + ROLLING_FEATURES
)
train_fe_clean[FEATURES + [TARGET]].isna().sum()


store              0
item               0
year               0
month              0
week               0
day                0
dayofweek          0
is_weekend         0
lag_7              0
lag_14             0
lag_28             0
rolling_mean_7     0
rolling_mean_14    0
rolling_mean_28    0
sales              0
dtype: int64

In [15]:
test_fe_clean = test_fe.dropna(
    subset=LAG_FEATURES + ROLLING_FEATURES
)
test_fe_clean[FEATURES].isna().sum()


store              0
item               0
year               0
month              0
week               0
day                0
dayofweek          0
is_weekend         0
lag_7              0
lag_14             0
lag_28             0
rolling_mean_7     0
rolling_mean_14    0
rolling_mean_28    0
dtype: int64

Simpan Data Feature Engineering

In [16]:
# create processed folder if not exists
import os
os.makedirs("../data/processed", exist_ok=True)

# save full feature dataset
train_fe_clean.to_csv("../data/processed/train_fe.csv", index=False)
test_fe_clean.to_csv("../data/processed/test_fe.csv", index=False)

print("✅ Feature-engineered data saved to data/processed/")
print("Shape:", train_fe_clean.shape)
print("Shape:", test_fe_clean.shape)


✅ Feature-engineered data saved to data/processed/
Shape: (899000, 17)
Shape: (500, 17)


## Feature Engineering Summary

- Calendar features were created to capture weekly and yearly seasonality.
- Lag features (7, 14, 28 days) were generated to model temporal dependencies.
- Rolling statistics were used to smooth short-term fluctuations.
- All features were created using past data only to prevent data leakage.