# Part III: Feature Engineering

## Basic settings

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import warnings
from datetime import datetime, timedelta

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

In [3]:
src_path = os.path.abspath(os.path.join("../..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
from utils.utils import save_data, add_date_features

In [5]:
src_path = os.path.abspath(os.path.join("../..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)
from data_generator.data_generator import check_missing_values
from utils.utils import add_date_features, compute_optimal_lags, create_rolling_features, create_ewma_features, remove_multicollinear_features

In [6]:
DATA_DIR = "../../data/"
FIGURES_DIR = "../figures"

## Load preprocessed data

In [7]:
df_sales = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/sales_data_preprocessed.csv"), parse_dates=["date"]
)
df_weather = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_preprocessed.csv"), parse_dates=["date"]
)

df_weather_key_store_merged = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_key_store_merged.csv"), parse_dates=["date"]
)


## Feature Engineering

In [8]:
# Start by creating a copy of the logunits dataframe
df_weather_key_store_processed = df_weather_key_store_merged.copy()

### Add date and holidays related features

In [9]:
df_weather_key_store_processed = add_date_features(df_weather_key_store_processed)

✓ Encoded season into 3 dummy columns
Fetching US holidays...
✓ Added date features with US seasons


In [10]:
df_weather_key_store_processed

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,station_nbr,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday
0,2012-01-01,1,9,29.0,3.401197,1,52.0,31.0,42.0,1.452913,36.0,40.0,23.0,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,4.6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,52,1,1,0,0,0,1,1,0
1,2012-01-02,1,9,60.0,4.110874,1,50.0,31.0,41.0,11.333333,26.0,35.0,24.0,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,10.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,2,1,1,0,0,0,0,1,1,0
2,2012-01-03,1,9,15.0,2.772589,1,32.0,11.0,22.0,-4.666667,4.0,18.0,43.0,0.0,585.011125,1823.365065,0.038126,0.00,29.67,29.87,10.8,31.0,11.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,3,1,1,0,0,0,0,1,0,0
3,2012-01-04,1,9,20.0,3.044522,1,28.0,9.0,19.0,-8.333333,-1.0,14.0,46.0,0.0,585.011125,1823.365065,0.038126,0.00,29.86,30.03,6.3,27.0,8.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,4,1,1,0,0,0,0,1,0,0
4,2012-01-05,1,9,16.0,2.833213,1,38.0,25.0,32.0,4.000000,13.0,25.0,33.0,0.0,585.011125,1823.365065,0.038126,0.00,29.67,29.84,6.9,25.0,7.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,5,1,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236033,2014-10-27,45,50,0.0,0.000000,16,62.0,38.0,50.0,6.666667,37.0,44.0,15.0,0.0,585.011125,1823.365065,0.038126,0.00,29.71,29.86,8.1,29.0,8.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,27,0,300,44,4,0,3,0,0,0,0,0
236034,2014-10-28,45,50,1.0,0.693147,16,61.0,36.0,49.0,7.666667,44.0,47.0,16.0,0.0,585.011125,1823.365065,0.038126,0.00,29.89,29.99,2.3,12.0,3.7,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,28,1,301,44,4,0,3,0,0,0,0,0
236035,2014-10-29,45,50,0.0,0.000000,16,74.0,49.0,62.0,5.666667,51.0,54.0,3.0,0.0,585.011125,1823.365065,0.038126,0.04,29.72,29.82,3.7,27.0,4.7,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,29,2,302,44,4,0,3,0,0,0,0,0
236036,2014-10-30,45,50,0.0,0.000000,16,57.0,34.0,46.0,1.666667,39.0,43.0,19.0,0.0,585.011125,1823.365065,0.038126,0.00,29.79,29.91,4.0,30.0,4.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,30,3,303,44,4,0,3,0,0,0,0,0


### Remove multicollinearity

In [11]:
df_weather_key_store_processed, dropped = remove_multicollinear_features(
    df_weather_key_store_processed, 
    threshold=0.90,
    exclude_cols=['units', 'logunits'] 
)

Features có correlation > 0.9:
  - tmin: correlation cao với ['tmax']
  - tavg: correlation cao với ['tmax', 'tmin']
  - dewpoint: correlation cao với ['tmin']
  - wetbulb: correlation cao với ['tmax', 'tmin', 'tavg', 'dewpoint']
  - heat: correlation cao với ['tmax', 'tavg']
  - day_of_year: correlation cao với ['month']
  - week_of_year: correlation cao với ['month', 'day_of_year']
  - quarter: correlation cao với ['month', 'day_of_year', 'week_of_year']

✓ Đã bỏ 8 features: ['tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'day_of_year', 'week_of_year', 'quarter']
✓ Còn lại 57 features
✓ Protected columns: ['units', 'logunits']


### Last n days logunits

In [12]:
optimal_lags = compute_optimal_lags(df_weather_key_store_processed, target_col='units', max_lag=30)
print(optimal_lags)

✓ Analyzed 255/255 groups
✓ Average sample size: 926 observations
✓ Average CI threshold: ±0.064 (95% confidence)
✓ Found 14 lags significant in ≥30% of groups
✓ Top 10 lags: [1, 2, 7, 6, 5, 3, 4, 14, 8, 21]
{'significant_lags': [1, 2, 7, 6, 5, 3, 4, 14, 8, 21, 13, 28, 20, 9], 'lag_frequencies': {1: 234, 6: 156, 7: 159, 8: 115, 10: 69, 13: 99, 14: 118, 16: 38, 17: 44, 20: 81, 28: 96, 2: 165, 3: 133, 4: 130, 5: 143, 18: 40, 19: 39, 12: 57, 11: 53, 21: 108, 22: 46, 24: 24, 25: 32, 26: 36, 9: 79, 15: 63, 29: 37, 30: 29, 23: 29, 27: 68}, 'avg_ci_threshold': 0.06442096382702076, 'stats': {'total_groups': 255, 'analyzed_groups': 255, 'avg_sample_size': 925.6392156862745}}


In [13]:
lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]

for n in lags:
    df_weather_key_store_processed[f"logunits_lag_{n}"] = (
        df_weather_key_store_processed
        .groupby(["store_nbr", "item_nbr"])["logunits"]
        .transform(lambda x: x.shift(n))
    )

### Rolling average features

In [14]:
df_weather_key_store_processed = create_rolling_features(
    df_weather_key_store_processed,
    target_col='logunits',
    windows=[7, 14, 28],
    statistics=['mean', 'min', 'max', 'std'],
    group_cols=['store_nbr', 'item_nbr']
)

✓ Created 12 rolling features for 'logunits'


In [15]:
df_weather_key_store_processed

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d
0,2012-01-01,1,9,29.0,3.401197,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,4.6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,
1,2012-01-02,1,9,60.0,4.110874,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,10.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,
2,2012-01-03,1,9,15.0,2.772589,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.00,29.67,29.87,10.8,31.0,11.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817
3,2012-01-04,1,9,20.0,3.044522,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.00,29.86,30.03,6.3,27.0,8.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.428220,2.772589,4.110874,0.669552,3.428220,2.772589,4.110874,0.669552,3.428220,2.772589,4.110874,0.669552
4,2012-01-05,1,9,16.0,2.833213,1,38.0,4.000000,0.0,585.011125,1823.365065,0.038126,0.00,29.67,29.84,6.9,25.0,7.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236033,2014-10-27,45,50,0.0,0.000000,16,62.0,6.666667,0.0,585.011125,1823.365065,0.038126,0.00,29.71,29.86,8.1,29.0,8.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,27,0,0,3,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,1.098612,0.000000,0.0,0.099021,0.000000,0.693147,0.261985,0.127983,0.000000,1.098612,0.334904,0.088747,0.000000,1.098612,0.268558
236034,2014-10-28,45,50,1.0,0.693147,16,61.0,7.666667,0.0,585.011125,1823.365065,0.038126,0.00,29.89,29.99,2.3,12.0,3.7,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,28,1,0,3,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.693147,0.0,0.099021,0.000000,0.693147,0.261985,0.049511,0.000000,0.693147,0.185251,0.088747,0.000000,1.098612,0.268558
236035,2014-10-29,45,50,0.0,0.000000,16,74.0,5.666667,0.0,585.011125,1823.365065,0.038126,0.04,29.72,29.82,3.7,27.0,4.7,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,29,2,0,3,0,0,0,0,0,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.0,0.198042,0.000000,0.693147,0.338221,0.099021,0.000000,0.693147,0.251707,0.113502,0.000000,1.098612,0.291077
236036,2014-10-30,45,50,0.0,0.000000,16,57.0,1.666667,0.0,585.011125,1823.365065,0.038126,0.00,29.79,29.91,4.0,30.0,4.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,30,3,0,3,0,0,0,0,0,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.099021,0.000000,0.693147,0.261985,0.099021,0.000000,0.693147,0.251707,0.113502,0.000000,1.098612,0.291077


In [16]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d
0,2012-01-01,1,9,29.0,3.401197,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,4.6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,
1,2012-01-02,1,9,60.0,4.110874,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,10.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,
2,2012-01-03,1,9,15.0,2.772589,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,11.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817
3,2012-01-04,1,9,20.0,3.044522,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,8.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552
4,2012-01-05,1,9,16.0,2.833213,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,7.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372


### Exponentially weighted moving average

In [17]:
df_weather_key_store_processed = create_ewma_features(
    df_weather_key_store_processed,
    target_col="logunits",
    alphas=(0.5, 0.75),
    windows=(7, 14, 28),
    group_cols=("store_nbr", "item_nbr"),
)

✓ Created 6 EWMA features for 'logunits'


In [18]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d,logunits_ewma_7d_a05,logunits_ewma_14d_a05,logunits_ewma_28d_a05,logunits_ewma_7d_a075,logunits_ewma_14d_a075,logunits_ewma_28d_a075
0,2012-01-01,1,9,29.0,3.401197,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,4.6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2012-01-02,1,9,60.0,4.110874,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,10.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,3.401197,3.401197,3.401197
2,2012-01-03,1,9,15.0,2.772589,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,11.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.756036,3.756036,3.933455,3.933455,3.933455
3,2012-01-04,1,9,20.0,3.044522,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,8.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.264312,3.264312,3.264312,3.062805,3.062805,3.062805
4,2012-01-05,1,9,16.0,2.833213,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,7.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.154417,3.154417,3.154417,3.049093,3.049093,3.049093


### Store/Item-level features

In [19]:
# Tổng / trung bình 7 ngày gần nhất của TOÀN STORE (context theo store)
df_weather_key_store_processed = df_weather_key_store_processed.sort_values(["store_nbr", "date"])

df_weather_key_store_processed["store_sum_7d"] = (
    df_weather_key_store_processed.groupby("store_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())
)

df_weather_key_store_processed["store_mean_7d"] = (
    df_weather_key_store_processed.groupby("store_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
)

# Tương tự cho item (trên tất cả stores)
df_weather_key_store_processed = df_weather_key_store_processed.sort_values(["item_nbr", "date"])

df_weather_key_store_processed["item_sum_7d"] = (
    df_weather_key_store_processed.groupby("item_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())
)

df_weather_key_store_processed["item_mean_7d"] = (
    df_weather_key_store_processed.groupby("item_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
)


In [20]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d,logunits_ewma_7d_a05,logunits_ewma_14d_a05,logunits_ewma_28d_a05,logunits_ewma_7d_a075,logunits_ewma_14d_a075,logunits_ewma_28d_a075,store_sum_7d,store_mean_7d,item_sum_7d,item_mean_7d
0,2012-01-01,1,9,29.0,3.401197,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,4.6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2012-01-02,1,9,60.0,4.110874,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,10.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,3.401197,3.401197,3.401197,1.791759,0.255966,25.130209,3.59003
2,2012-01-03,1,9,15.0,2.772589,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,11.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.756036,3.756036,3.933455,3.933455,3.933455,2.484907,0.354987,29.649597,4.235657
3,2012-01-04,1,9,20.0,3.044522,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,8.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.264312,3.264312,3.264312,3.062805,3.062805,3.062805,2.302585,0.328941,25.803097,3.686157
4,2012-01-05,1,9,16.0,2.833213,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,7.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.154417,3.154417,3.154417,3.049093,3.049093,3.049093,1.386294,0.198042,24.761612,3.537373


In [21]:
# Drop rows with NaN (these will be the first n days without lag features)
df_weather_key_store_processed = df_weather_key_store_processed.dropna()

## Save feature engineered data

In [22]:
df_weather_key_store_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228898 entries, 184415 to 204501
Data columns (total 89 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   date                    228898 non-null  datetime64[ns]
 1   store_nbr               228898 non-null  int64         
 2   item_nbr                228898 non-null  int64         
 3   units                   228898 non-null  float64       
 4   logunits                228898 non-null  float64       
 5   station_nbr             228898 non-null  int64         
 6   tmax                    228898 non-null  float64       
 7   depart                  228898 non-null  float64       
 8   cool                    228898 non-null  float64       
 9   sunrise                 228898 non-null  float64       
 10  sunset                  228898 non-null  float64       
 11  snowfall                228898 non-null  float64       
 12  preciptotal             228898

In [23]:
# PATH
num_features = df_weather_key_store_processed.shape[1]
save_path = os.path.join(DATA_DIR, f"feature_engineered_data_{num_features}_features.feather")
save_path

'../../data/feature_engineered_data_89_features.feather'

In [24]:
# Save data
save_data(df_weather_key_store_processed, save_path, file_format='feather')

DataFrame saved to ../../data/feature_engineered_data_89_features.feather in Feather format.
