In [5]:


# ## 1. Import Libraries
import pandas as pd
import numpy as np
import os

# ## 2. Load Preprocessed Data
# Read the cleaned CSV and have pandas parse the "Date" column as datetimes
# while loading, so the .dt accessor is available on it afterwards.
processed_path = "../data/processed/cleaned_walmart.csv"
date_columns = ["Date"]
df = pd.read_csv(processed_path, parse_dates=date_columns)

print(" Data Loaded Successfully")
print("Shape:", df.shape)
df.head()


 Data Loaded Successfully
Shape: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-02-05,1.074309,0,-0.995136,-1.7138,1.004175,0.056964
1,1,2010-02-12,1.071198,1,-1.20117,-1.766089,1.00788,0.056964
2,1,2010-02-19,1.017382,0,-1.124178,-1.840166,1.009074,0.056964
3,1,2010-02-26,0.654458,0,-0.760907,-1.737766,1.009849,0.056964
4,1,2010-03-05,0.914805,0,-0.767955,-1.598328,1.010624,0.056964


In [6]:
# ## 3. Create Time-based Features
# Every calendar field below is derived from the parsed "Date" column.
date_parts = df["Date"].dt
iso_calendar = date_parts.isocalendar()

df["year"] = date_parts.year
df["month"] = date_parts.month
df["week"] = iso_calendar.week  # ISO week number of the year
df["day_of_week"] = date_parts.dayofweek  # Monday=0 ... Sunday=6

# 1 when the date falls on Saturday (5) or Sunday (6), otherwise 0.
# NOTE(review): the sample rows are all Fridays (day_of_week == 4), so this
# flag may be constant 0 for this weekly dataset -- confirm it carries signal.
weekend_days = [5, 6]
df["is_weekend"] = df["day_of_week"].isin(weekend_days).astype(int)

df.head()


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,is_weekend
0,1,2010-02-05,1.074309,0,-0.995136,-1.7138,1.004175,0.056964,2010,2,5,4,0
1,1,2010-02-12,1.071198,1,-1.20117,-1.766089,1.00788,0.056964,2010,2,6,4,0
2,1,2010-02-19,1.017382,0,-1.124178,-1.840166,1.009074,0.056964,2010,2,7,4,0
3,1,2010-02-26,0.654458,0,-0.760907,-1.737766,1.009849,0.056964,2010,2,8,4,0
4,1,2010-03-05,0.914805,0,-0.767955,-1.598328,1.010624,0.056964,2010,3,9,4,0


In [7]:
# ## 4. Create Lag Features (Weekly Sales history per store)
# Order rows by store and then by date so that shifting by k rows inside a
# store looks back k records of that same store.
df = df.sort_values(["Store", "Date"])

# Build the grouping once and reuse it for every lag. shift() is applied per
# store, so the first k rows of each store have no history and become NaN.
weekly_sales_by_store = df.groupby("Store")["Weekly_Sales"]
for weeks_back in (1, 2, 4):
    df[f"sales_lag_{weeks_back}"] = weekly_sales_by_store.shift(weeks_back)

df.head(10)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,is_weekend,sales_lag_1,sales_lag_2,sales_lag_4
0,1,2010-02-05,1.074309,0,-0.995136,-1.7138,1.004175,0.056964,2010,2,5,4,0,,,
1,1,2010-02-12,1.071198,1,-1.20117,-1.766089,1.00788,0.056964,2010,2,6,4,0,1.074309,,
2,1,2010-02-19,1.017382,0,-1.124178,-1.840166,1.009074,0.056964,2010,2,7,4,0,1.071198,1.074309,
3,1,2010-02-26,0.654458,0,-0.760907,-1.737766,1.009849,0.056964,2010,2,8,4,0,1.017382,1.071198,
4,1,2010-03-05,0.914805,0,-0.767955,-1.598328,1.010624,0.056964,2010,3,9,4,0,0.654458,1.017382,1.074309
5,1,2010-03-12,0.707959,0,-0.155815,-1.506821,1.011399,0.056964,2010,3,10,4,0,0.914805,0.654458,1.071198
6,1,2010-03-19,0.767132,0,-0.329861,-1.391349,1.007206,0.056964,2010,3,11,4,0,0.707959,0.914805,1.017382
7,1,2010-03-26,0.644951,0,-0.499568,-1.365204,1.002185,0.056964,2010,3,12,4,0,0.767132,0.707959,0.654458
8,1,2010-04-02,0.986875,0,0.087089,-1.393527,0.997164,-0.101907,2010,4,13,4,0,0.644951,0.767132,0.914805
9,1,2010-04-09,0.897957,0,0.281737,-1.282412,0.992143,-0.101907,2010,4,14,4,0,0.986875,0.644951,0.707959


In [8]:
# ## 5. Rolling Window Features (mean/std of the previous 4 weeks, per store)
# Both the shift and the rolling window are evaluated inside each store's own
# series, so a window can never mix sales from two different stores.
#
# Calling .rolling() on the flat output of groupby("Store").shift(1) is not
# grouped: it only stays inside a store because the leading NaN produced by
# the shift invalidates every boundary-crossing window (window=4 with the
# default min_periods). That breaks silently if the window, lag or
# min_periods changes, or if rows are not contiguous per store.
#
# Relies on rows being in chronological order within each store (the lag
# feature cell sorts by ["Store", "Date"]).
ROLLING_WEEKS = 4


def _prior_weeks_window(store_sales):
    """Return a rolling window over the weeks strictly before each row.

    shift(1) moves every value down one row, so the current week's sales are
    excluded from its own window and the feature only uses past information.
    """
    return store_sales.shift(1).rolling(ROLLING_WEEKS)


store_sales_groups = df.groupby("Store")["Weekly_Sales"]
df["rolling_mean_4w"] = store_sales_groups.transform(
    lambda store_sales: _prior_weeks_window(store_sales).mean()
)
df["rolling_std_4w"] = store_sales_groups.transform(
    lambda store_sales: _prior_weeks_window(store_sales).std()
)

df.head(10)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,is_weekend,sales_lag_1,sales_lag_2,sales_lag_4,rolling_mean_4w,rolling_std_4w
0,1,2010-02-05,1.074309,0,-0.995136,-1.7138,1.004175,0.056964,2010,2,5,4,0,,,,,
1,1,2010-02-12,1.071198,1,-1.20117,-1.766089,1.00788,0.056964,2010,2,6,4,0,1.074309,,,,
2,1,2010-02-19,1.017382,0,-1.124178,-1.840166,1.009074,0.056964,2010,2,7,4,0,1.071198,1.074309,,,
3,1,2010-02-26,0.654458,0,-0.760907,-1.737766,1.009849,0.056964,2010,2,8,4,0,1.017382,1.071198,,,
4,1,2010-03-05,0.914805,0,-0.767955,-1.598328,1.010624,0.056964,2010,3,9,4,0,0.654458,1.017382,1.074309,0.954337,0.20162
5,1,2010-03-12,0.707959,0,-0.155815,-1.506821,1.011399,0.056964,2010,3,10,4,0,0.914805,0.654458,1.071198,0.914461,0.185078
6,1,2010-03-19,0.767132,0,-0.329861,-1.391349,1.007206,0.056964,2010,3,11,4,0,0.707959,0.914805,1.017382,0.823651,0.171125
7,1,2010-03-26,0.644951,0,-0.499568,-1.365204,1.002185,0.056964,2010,3,12,4,0,0.767132,0.707959,0.654458,0.761088,0.112336
8,1,2010-04-02,0.986875,0,0.087089,-1.393527,0.997164,-0.101907,2010,4,13,4,0,0.644951,0.767132,0.914805,0.758712,0.115403
9,1,2010-04-09,0.897957,0,0.281737,-1.282412,0.992143,-0.101907,2010,4,14,4,0,0.986875,0.644951,0.707959,0.776729,0.148715


In [9]:
# ## 6. Handle Missing Values from Lag/Rolling Features
# Discard every row that has a NaN in any column (the lag and rolling features
# are NaN at the start of each store's history), then renumber the index
# from 0 so it is contiguous again.
complete_rows = df.dropna(axis=0, how="any")
df = complete_rows.reset_index(drop=True)
print(" Shape after dropping NA:", df.shape)


 Shape after dropping NA: (6255, 18)


In [10]:
# ## 7. Save Feature-Engineered Dataset
# Write the engineered frame to CSV. The index is left out so the file holds
# only the data columns.
features_path = "../data/processed/walmart_with_features.csv"
df.to_csv(path_or_buf=features_path, index=False)

print(" Feature Engineered Data Saved to:", features_path)


 Feature Engineered Data Saved to: ../data/processed/walmart_with_features.csv
