# Part III: Feature Engineering

## Basic settings

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import warnings
from datetime import datetime, timedelta

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

In [3]:
src_path = os.path.abspath(os.path.join("../..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
from utils.utils import save_data, add_date_features

In [5]:
src_path = os.path.abspath(os.path.join("../..", "src"))
if src_path not in sys.path:
    sys.path.append(src_path)
from data_generator.data_generator import check_missing_values
from utils.utils import add_date_features, compute_optimal_lags, create_rolling_features, create_ewma_features, remove_multicollinear_features

In [6]:
DATA_DIR = "../../data/"
FIGURES_DIR = "../figures"

## Load preprocessed data

In [7]:
df_sales = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/sales_data_preprocessed.csv"), parse_dates=["date"]
)
df_weather = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_preprocessed.csv"), parse_dates=["date"]
)

df_weather_key_store_merged = pd.read_csv(
    os.path.join(DATA_DIR, "data_processed/weather_key_store_merged.csv"), parse_dates=["date"]
)


In [8]:
df_weather_key_store_merged

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS
0,2013-06-04,1,1,,,True,1,70.0,53.0,62.0,-5.000000,42.0,52.0,3.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.77,29.96,7.3,31.0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2013-06-05,1,1,,,True,1,75.0,50.0,63.0,-2.666667,46.0,54.0,2.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.98,30.15,1.9,25.0,4.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2013-06-06,1,1,,,True,1,74.0,54.0,64.0,-3.666667,50.0,56.0,1.0,0.0,585.011125,1823.365065,0.038126,0.010000,30.00,30.16,3.2,12.0,4.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,2013-06-07,1,1,,,True,1,58.0,54.0,56.0,-9.666667,52.0,54.0,9.0,0.0,585.011125,1823.365065,0.038126,1.290000,29.89,30.02,8.4,6.0,8.7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,2013-06-08,1,1,,,True,1,79.0,54.0,67.0,-3.333333,56.0,59.0,0.0,2.0,585.011125,1823.365065,0.038126,0.970000,29.64,29.82,4.8,32.0,8.6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762950,2014-10-22,45,111,,,True,16,56.0,50.0,53.0,4.000000,49.0,50.0,12.0,0.0,585.011125,1823.365065,0.038126,1.310000,29.83,29.92,18.1,3.0,18.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
762951,2014-10-23,45,111,,,True,16,53.0,49.0,51.0,4.666667,48.0,50.0,14.0,0.0,585.011125,1823.365065,0.038126,5.040000,29.57,29.65,18.1,2.0,18.6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
762952,2014-10-24,45,111,,,True,16,53.0,45.0,49.0,4.666667,45.0,47.0,16.0,0.0,585.011125,1823.365065,0.038126,0.040000,29.56,29.68,9.1,33.0,9.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
762953,2014-10-25,45,111,,,True,16,65.0,43.0,54.0,7.333333,41.0,47.0,11.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.55,29.62,6.4,26.0,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Feature Engineering

In [9]:
# Start by creating a copy of the logunits dataframe
df_weather_key_store_merged = df_weather_key_store_merged.sort_values(['store_nbr', 'item_nbr', 'date'])
df_weather_key_store_processed = df_weather_key_store_merged.copy()

### Add date and holidays related features

In [10]:
df_weather_key_store_processed = add_date_features(df_weather_key_store_processed)

✓ Encoded season into 3 dummy columns
Fetching US holidays...


✓ Added date features with US seasons


In [11]:
df_weather_key_store_processed

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,day_of_year,week_of_year,quarter,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday
0,2013-06-04,1,1,,,True,1,70.0,53.0,62.0,-5.000000,42.0,52.0,3.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.77,29.96,7.3,31.0,8.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2013,6,4,1,155,23,2,0,2,0,1,0,0,0
1,2013-06-05,1,1,,,True,1,75.0,50.0,63.0,-2.666667,46.0,54.0,2.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.98,30.15,1.9,25.0,4.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2013,6,5,2,156,23,2,0,2,0,1,0,0,0
2,2013-06-06,1,1,,,True,1,74.0,54.0,64.0,-3.666667,50.0,56.0,1.0,0.0,585.011125,1823.365065,0.038126,0.010000,30.00,30.16,3.2,12.0,4.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,6,3,157,23,2,0,2,0,1,0,0,0
3,2013-06-07,1,1,,,True,1,58.0,54.0,56.0,-9.666667,52.0,54.0,9.0,0.0,585.011125,1823.365065,0.038126,1.290000,29.89,30.02,8.4,6.0,8.7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,7,4,158,23,2,0,2,0,1,0,0,0
4,2013-06-08,1,1,,,True,1,79.0,54.0,67.0,-3.333333,56.0,59.0,0.0,2.0,585.011125,1823.365065,0.038126,0.970000,29.64,29.82,4.8,32.0,8.6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,8,5,159,23,2,1,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762950,2014-10-22,45,111,,,True,16,56.0,50.0,53.0,4.000000,49.0,50.0,12.0,0.0,585.011125,1823.365065,0.038126,1.310000,29.83,29.92,18.1,3.0,18.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,22,2,295,43,4,0,3,0,0,0,0,0
762951,2014-10-23,45,111,,,True,16,53.0,49.0,51.0,4.666667,48.0,50.0,14.0,0.0,585.011125,1823.365065,0.038126,5.040000,29.57,29.65,18.1,2.0,18.6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2014,10,23,3,296,43,4,0,3,0,0,0,0,0
762952,2014-10-24,45,111,,,True,16,53.0,45.0,49.0,4.666667,45.0,47.0,16.0,0.0,585.011125,1823.365065,0.038126,0.040000,29.56,29.68,9.1,33.0,9.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,24,4,297,43,4,0,3,0,0,0,0,0
762953,2014-10-25,45,111,,,True,16,65.0,43.0,54.0,7.333333,41.0,47.0,11.0,0.0,585.011125,1823.365065,0.038126,0.000000,29.55,29.62,6.4,26.0,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,25,5,298,43,4,1,3,0,0,0,0,0


### Remove multicollinearity

In [12]:
df_weather_key_store_processed, dropped = remove_multicollinear_features(
    df_weather_key_store_processed, 
    threshold=0.90,
    exclude_cols=['units', 'logunits'] 
)

Features có correlation > 0.9:
  - tmin: correlation cao với ['tmax']
  - tavg: correlation cao với ['tmax', 'tmin']
  - dewpoint: correlation cao với ['tmin', 'tavg']
  - wetbulb: correlation cao với ['tmax', 'tmin', 'tavg', 'dewpoint']
  - heat: correlation cao với ['tmax', 'tavg', 'wetbulb']
  - avgspeed: correlation cao với ['resultspeed']
  - day_of_year: correlation cao với ['month']
  - week_of_year: correlation cao với ['month', 'day_of_year']
  - quarter: correlation cao với ['month', 'day_of_year', 'week_of_year']

✓ Đã bỏ 9 features: ['tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'avgspeed', 'day_of_year', 'week_of_year', 'quarter']
✓ Còn lại 57 features
✓ Protected columns: ['units', 'logunits']


### Last n days logunits

In [13]:
optimal_lags = compute_optimal_lags(df_weather_key_store_processed, target_col='units', max_lag=30)
print(optimal_lags)

✓ Analyzed 255/4892 groups
✓ Average sample size: 926 observations
✓ Average CI threshold: ±0.064 (95% confidence)
✓ Found 0 lags significant in ≥30% of groups
✓ Top 10 lags: []
{'significant_lags': [], 'lag_frequencies': {1: 234, 6: 156, 7: 159, 8: 115, 10: 69, 13: 99, 14: 118, 16: 38, 17: 44, 20: 81, 28: 96, 2: 165, 3: 133, 4: 130, 5: 143, 18: 40, 19: 39, 12: 57, 11: 53, 21: 108, 22: 46, 24: 24, 25: 32, 26: 36, 9: 79, 15: 63, 29: 37, 30: 29, 23: 29, 27: 68}, 'avg_ci_threshold': 0.06442096382702076, 'stats': {'total_groups': 4892, 'analyzed_groups': 255, 'avg_sample_size': 925.6392156862745}}


In [14]:
lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]

for n in lags:
    df_weather_key_store_processed[f"logunits_lag_{n}"] = (
        df_weather_key_store_processed
        .groupby(["store_nbr", "item_nbr"])["logunits"]
        .transform(lambda x: x.shift(n))
    )

### Rolling average features

In [15]:
df_weather_key_store_processed = create_rolling_features(
    df_weather_key_store_processed,
    target_col='logunits',
    windows=[7, 14, 28],
    statistics=['mean', 'min', 'max', 'std'],
    group_cols=['store_nbr', 'item_nbr']
)

✓ Created 12 rolling features for 'logunits'


In [16]:
df_weather_key_store_processed

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d
0,2013-06-04,1,1,,,True,1,70.0,-5.000000,0.0,585.011125,1823.365065,0.038126,0.000000,29.77,29.96,7.3,31.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2013,6,4,1,0,2,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,
1,2013-06-05,1,1,,,True,1,75.0,-2.666667,0.0,585.011125,1823.365065,0.038126,0.000000,29.98,30.15,1.9,25.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2013,6,5,2,0,2,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,
2,2013-06-06,1,1,,,True,1,74.0,-3.666667,0.0,585.011125,1823.365065,0.038126,0.010000,30.00,30.16,3.2,12.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,6,3,0,2,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,
3,2013-06-07,1,1,,,True,1,58.0,-9.666667,0.0,585.011125,1823.365065,0.038126,1.290000,29.89,30.02,8.4,6.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,7,4,0,2,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,
4,2013-06-08,1,1,,,True,1,79.0,-3.333333,2.0,585.011125,1823.365065,0.038126,0.970000,29.64,29.82,4.8,32.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2013,6,8,5,1,2,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762950,2014-10-22,45,111,,,True,16,56.0,4.000000,0.0,585.011125,1823.365065,0.038126,1.310000,29.83,29.92,18.1,3.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,22,2,0,3,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,
762951,2014-10-23,45,111,,,True,16,53.0,4.666667,0.0,585.011125,1823.365065,0.038126,5.040000,29.57,29.65,18.1,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2014,10,23,3,0,3,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,
762952,2014-10-24,45,111,,,True,16,53.0,4.666667,0.0,585.011125,1823.365065,0.038126,0.040000,29.56,29.68,9.1,33.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2014,10,24,4,0,3,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,
762953,2014-10-25,45,111,,,True,16,65.0,7.333333,0.0,585.011125,1823.365065,0.038126,0.000000,29.55,29.62,6.4,26.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014,10,25,5,1,3,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,


In [17]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d
840,2012-01-01,1,9,29.0,3.401197,False,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,
841,2012-01-02,1,9,60.0,4.110874,False,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,
842,2012-01-03,1,9,15.0,2.772589,False,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817
843,2012-01-04,1,9,20.0,3.044522,False,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552
844,2012-01-05,1,9,16.0,2.833213,False,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372


### Exponentially weighted moving average

In [18]:
df_weather_key_store_processed = create_ewma_features(
    df_weather_key_store_processed,
    target_col="logunits",
    alphas=(0.5, 0.75),
    windows=(7, 14, 28),
    group_cols=("store_nbr", "item_nbr"),
)

✓ Created 6 EWMA features for 'logunits'


In [19]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d,logunits_ewma_7d_a05,logunits_ewma_14d_a05,logunits_ewma_28d_a05,logunits_ewma_7d_a075,logunits_ewma_14d_a075,logunits_ewma_28d_a075
840,2012-01-01,1,9,29.0,3.401197,False,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
841,2012-01-02,1,9,60.0,4.110874,False,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,3.401197,3.401197,3.401197
842,2012-01-03,1,9,15.0,2.772589,False,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.756036,3.756036,3.933455,3.933455,3.933455
843,2012-01-04,1,9,20.0,3.044522,False,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.264312,3.264312,3.264312,3.062805,3.062805,3.062805
844,2012-01-05,1,9,16.0,2.833213,False,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.154417,3.154417,3.154417,3.049093,3.049093,3.049093


### Store/Item-level features

In [20]:
# Tổng / trung bình 7 ngày gần nhất của TOÀN STORE (context theo store)
df_weather_key_store_processed = df_weather_key_store_processed.sort_values(["store_nbr", "date"])

df_weather_key_store_processed["store_sum_7d"] = (
    df_weather_key_store_processed.groupby("store_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())
)

df_weather_key_store_processed["store_mean_7d"] = (
    df_weather_key_store_processed.groupby("store_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
)

# Tương tự cho item (trên tất cả stores)
df_weather_key_store_processed = df_weather_key_store_processed.sort_values(["item_nbr", "date"])

df_weather_key_store_processed["item_sum_7d"] = (
    df_weather_key_store_processed.groupby("item_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).sum())
)

df_weather_key_store_processed["item_mean_7d"] = (
    df_weather_key_store_processed.groupby("item_nbr")["logunits"]
    .transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
)


In [21]:
store_id = 1
item_id = 9

df_weather_key_store_processed.query("store_nbr == @store_id and item_nbr == @item_id")\
           .sort_values('date')\
           .head(5)

Unnamed: 0,date,store_nbr,item_nbr,units,logunits,is_kaggle_test,station_nbr,tmax,depart,cool,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,BCFG,BLDU,BLSN,BR,DU,DZ,FG,FG+,FU,FZDZ,FZFG,FZRA,GR,GS,HZ,MIFG,PL,PRFG,RA,SG,SN,SQ,TS,TSRA,TSSN,UP,VCFG,VCTS,year,month,day,day_of_week,is_weekend,season,season_Spring,season_Summer,season_Winter,is_holiday,is_blackfriday,logunits_lag_1,logunits_lag_2,logunits_lag_3,logunits_lag_4,logunits_lag_5,logunits_lag_6,logunits_lag_7,logunits_lag_14,logunits_lag_21,logunits_lag_28,logunits_mean_7d,logunits_min_7d,logunits_max_7d,logunits_std_7d,logunits_mean_14d,logunits_min_14d,logunits_max_14d,logunits_std_14d,logunits_mean_28d,logunits_min_28d,logunits_max_28d,logunits_std_28d,logunits_ewma_7d_a05,logunits_ewma_14d_a05,logunits_ewma_28d_a05,logunits_ewma_7d_a075,logunits_ewma_14d_a075,logunits_ewma_28d_a075,store_sum_7d,store_mean_7d,item_sum_7d,item_mean_7d
840,2012-01-01,1,9,29.0,3.401197,False,1,52.0,1.452913,0.0,585.011125,1823.365065,0.038126,0.05,29.78,29.92,3.6,20.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012,1,1,6,1,0,0,0,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
841,2012-01-02,1,9,60.0,4.110874,False,1,50.0,11.333333,0.0,585.011125,1823.365065,0.038126,0.01,29.44,29.62,9.8,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,2,0,0,0,0,0,1,1,0,3.401197,,,,,,,,,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,,3.401197,3.401197,3.401197,3.401197,3.401197,3.401197,1.791759,0.255966,25.130209,3.59003
842,2012-01-03,1,9,15.0,2.772589,False,1,32.0,-4.666667,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.87,10.8,31.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,3,1,0,0,0,0,1,0,0,4.110874,3.401197,,,,,,,,,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.401197,4.110874,0.501817,3.756036,3.756036,3.756036,3.933455,3.933455,3.933455,2.484907,0.354987,29.649597,4.235657
843,2012-01-04,1,9,20.0,3.044522,False,1,28.0,-8.333333,0.0,585.011125,1823.365065,0.038126,0.0,29.86,30.03,6.3,27.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,4,2,0,0,0,0,1,0,0,2.772589,4.110874,3.401197,,,,,,,,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.42822,2.772589,4.110874,0.669552,3.264312,3.264312,3.264312,3.062805,3.062805,3.062805,2.302585,0.328941,25.803097,3.686157
844,2012-01-05,1,9,16.0,2.833213,False,1,38.0,4.0,0.0,585.011125,1823.365065,0.038126,0.0,29.67,29.84,6.9,25.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2012,1,5,3,0,0,0,0,1,0,0,3.044522,2.772589,4.110874,3.401197,,,,,,,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.332296,2.772589,4.110874,0.579372,3.154417,3.154417,3.154417,3.049093,3.049093,3.049093,1.386294,0.198042,24.761612,3.537373


In [22]:
print("DEBUG BEFORE DROPNA:")
print("Total rows:", len(df_weather_key_store_processed))
print("Test rows (is_kaggle_test=1):", (df_weather_key_store_processed['is_kaggle_test'] == 1).sum())
print("Date range of Test:", df_weather_key_store_processed[df_weather_key_store_processed['is_kaggle_test']==1]['date'].min(), "to", df_weather_key_store_processed[df_weather_key_store_processed['is_kaggle_test']==1]['date'].max())

DEBUG BEFORE DROPNA:
Total rows: 762955
Test rows (is_kaggle_test=1): 526917
Date range of Test: 2013-04-01 00:00:00 to 2014-10-26 00:00:00


In [23]:
# Drop rows with NaN (these will be the first n days without lag features)
is_kaggle = df_weather_key_store_processed['is_kaggle_test'] == 1

# Tách train/test
train_part = df_weather_key_store_processed[~is_kaggle].copy()
test_part  = df_weather_key_store_processed[is_kaggle].copy()

# Drop NaN CHỈ trong train
train_part = train_part.dropna()

# Ghép lại
df_weather_key_store_processed = pd.concat([train_part, test_part], ignore_index=True)

print("After dropna on train only:")
print("  Train rows:", (~df_weather_key_store_processed['is_kaggle_test']).sum())
print("  Test rows:", (df_weather_key_store_processed['is_kaggle_test']).sum())

After dropna on train only:
  Train rows: 159270
  Test rows: 526917


## Save feature engineered data

In [24]:
df_weather_key_store_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686187 entries, 0 to 686186
Data columns (total 89 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   date                    686187 non-null  datetime64[ns]
 1   store_nbr               686187 non-null  int64         
 2   item_nbr                686187 non-null  int64         
 3   units                   159270 non-null  float64       
 4   logunits                159270 non-null  float64       
 5   is_kaggle_test          686187 non-null  bool          
 6   station_nbr             686187 non-null  int64         
 7   tmax                    686187 non-null  float64       
 8   depart                  686187 non-null  float64       
 9   cool                    686187 non-null  float64       
 10  sunrise                 686187 non-null  float64       
 11  sunset                  686187 non-null  float64       
 12  snowfall                686187

In [25]:
# PATH
num_features = df_weather_key_store_processed.shape[1]
save_path = os.path.join(DATA_DIR, f"feature_engineered_data_{num_features}_features.feather")
save_path

'../../data/feature_engineered_data_89_features.feather'

In [26]:
# Save data
save_data(df_weather_key_store_processed, save_path, file_format='feather')

DataFrame saved to ../../data/feature_engineered_data_89_features.feather in Feather format.
