In [2]:
import pandas as pd

In [5]:
# Load the raw training data; the path is relative to the notebook's location.
train_df = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

In [8]:
# Convert the 'date' column to datetimes so the `.dt` accessor can be used on it.
train_df = train_df.assign(date=pd.to_datetime(train_df['date']))

STEP 3.1 — SORT DATA

In [6]:
# Order rows by store, then item, then date, and renumber the index 0..n-1.
# The lag and rolling features further down shift by row position, so each
# (store, item) series must be in date order before they are computed.
train_df = train_df.sort_values(
    by=['store', 'item', 'date'],
    ignore_index=True,
)

STEP 3.2 — CALENDAR FEATURES

In [9]:
# Derive calendar features from the 'date' column in a single pass.
# NOTE: 'week' is the ISO week number, so dates near a year boundary can carry
# a week number that belongs to the adjacent ISO year (e.g. week 1 in late
# December) while 'year' stays the calendar year.
train_df = train_df.assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    week=lambda frame: frame['date'].dt.isocalendar().week.astype(int),
    day=lambda frame: frame['date'].dt.day,
    dayofweek=lambda frame: frame['date'].dt.dayofweek,
)

STEP 3.3 — LAG FEATURES

In [10]:
LAGS = [2, 14, 28]

# Group the sales column by (store, item) once; every lag reuses this grouping.
sales_per_series = train_df.groupby(['store', 'item'])['sales']

# lag_k holds the sales value found k rows earlier within the same
# (store, item) series; the first k rows of each series have no predecessor
# and become NaN.
# NOTE(review): the shift is by row position, so "k days" presumably relies on
# one row per day per series with no gaps — confirm against the raw data.
for lag in LAGS:
    train_df[f'lag_{lag}'] = sales_per_series.shift(lag)

STEP 3.4 — ROLLING STATISTICS (SMOOTHING)

In [11]:
WINDOWS = [7, 14, 28]

# Rolling mean of the previous `window` sales values, computed separately for
# each (store, item) series. The shift(1) excludes the current row's sales, so
# every window ends at the prior row.
# Both the shift and the rolling window run inside the per-group transform.
# Rolling over the already-shifted, ungrouped Series would keep the series
# apart only because of the leading NaN produced by the shift combined with
# rolling's default min_periods, which would break silently if min_periods
# were ever lowered.
# Assumes rows are already ordered by date within each series.
for window in WINDOWS:
    train_df[f'rolling_mean_{window}'] = (
        train_df
        .groupby(['store', 'item'])['sales']
        .transform(lambda series: series.shift(1).rolling(window).mean())
    )

STEP 3.5 — HANDLE NaN

In [12]:
# Count missing values in each column before deciding how to handle them.
train_df.isna().sum(axis=0)

date                   0
store                  0
item                   0
sales                  0
year                   0
month                  0
week                   0
day                    0
dayofweek              0
lag_2               1000
lag_14              7000
lag_28             14000
rolling_mean_7      3500
rolling_mean_14     7000
rolling_mean_28    14000
dtype: int64

In [13]:
# Keep only rows with no missing value in any column, then renumber the index.
train_df_fe = (
    train_df
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

STEP 3.6 — QUICK SANITY CHECK

In [17]:
# (rows, columns) of the frame left after rows containing NaN were dropped.
train_df_fe.shape

(899000, 15)

In [14]:
# Preview the first five rows of the feature-engineered frame.
train_df_fe.head(n=5)

Unnamed: 0,date,store,item,sales,year,month,week,day,dayofweek,lag_2,lag_14,lag_28,rolling_mean_7,rolling_mean_14,rolling_mean_28
0,2013-01-29,1,1,6,2013,1,5,29,1,12.0,5.0,13.0,10.428571,10.642857,10.714286
1,2013-01-30,1,1,9,2013,1,5,30,2,11.0,7.0,11.0,10.285714,10.714286,10.464286
2,2013-01-31,1,1,13,2013,1,5,31,3,6.0,16.0,14.0,10.285714,10.857143,10.392857
3,2013-02-01,1,1,11,2013,2,5,1,4,9.0,7.0,13.0,11.0,10.642857,10.357143
4,2013-02-02,1,1,21,2013,2,5,2,5,13.0,18.0,10.0,10.571429,10.928571,10.285714


## Feature Engineering Summary

- Calendar features were created to capture weekly and yearly seasonality.
- Lag features (2, 14, 28 days) were generated to model temporal dependencies.
- Rolling statistics were used to smooth short-term fluctuations.
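- All sales-based features (lags and rolling means) were built only from earlier rows of the same store-item series, and calendar features depend only on the date itself, which prevents data leakage.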