## Load + Basic setup

In [1]:
import pandas as pd
import numpy as np

# Location of the raw KPI series, relative to the notebook's directory.
DATA_PATH = "../data/synthetic_kpi.csv"

# Read the CSV with "date" parsed as datetimes, order rows chronologically,
# and renumber the index from 0 so position matches time order.
df = (
    pd.read_csv(DATA_PATH, parse_dates=["date"])
    .sort_values("date")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,kpi_value,promo_index,baseline_trend,seasonality,is_anomaly
0,2016-01-01,46.780252,0.0,50.0,-5.206605,0
1,2016-01-02,51.350021,13.572203,50.03001,-11.699135,0
2,2016-01-03,43.268796,0.0,50.06002,-9.381978,0
3,2016-01-04,73.778452,17.596302,50.09003,0.0,0
4,2016-01-05,58.565404,0.0,50.12004,9.381978,0


## Feature functions (simple + reusable)

In [2]:
def add_calendar_features(d: pd.DataFrame, date_col: str = "date") -> pd.DataFrame:
    """Return a copy of ``d`` with calendar columns derived from a datetime column.

    Parameters
    ----------
    d : pd.DataFrame
        Input frame; it is copied, never modified in place.
    date_col : str, default "date"
        Name of the datetime-typed column the calendar fields are read from.

    Returns
    -------
    pd.DataFrame
        Copy of ``d`` with four added columns: ``dow`` (day of week, Monday=0),
        ``month``, ``day`` (day of month) and ``is_weekend`` (1 when ``dow`` is
        5 or 6, else 0).

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``d``.
    AttributeError
        If ``date_col`` is not datetime-like (the ``.dt`` accessor is unavailable).
    """
    d = d.copy()
    stamps = d[date_col].dt
    d["dow"] = stamps.dayofweek
    d["month"] = stamps.month
    d["day"] = stamps.day
    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday.
    d["is_weekend"] = (d["dow"] >= 5).astype(int)
    return d

def add_lags(d: pd.DataFrame, col: str, lags=(1,7,14)) -> pd.DataFrame:
    """Return a copy of ``d`` with row-shifted versions of ``col`` appended.

    One column named ``{col}_lag_{lag}`` is added per entry in ``lags``; each
    holds ``col`` shifted down by that many rows, so the leading rows are NaN.
    Shifting is positional (by row), so the caller is responsible for row order.
    The input frame is not modified.
    """
    out = d.copy()
    source = out[col]
    for n_back in lags:
        lag_name = f"{col}_lag_{n_back}"
        out[lag_name] = source.shift(n_back)
    return out

def add_rolls(d: pd.DataFrame, col: str, windows=(7,14,28)) -> pd.DataFrame:
    """Return a copy of ``d`` with trailing rolling mean/std columns for ``col``.

    For each window ``w`` two columns are appended, in this order:
    ``{col}_roll_mean_{w}`` and ``{col}_roll_std_{w}``. The series is shifted
    by one row before rolling, so each statistic covers the ``w`` rows that
    precede the current row and never includes the current value. Rows without
    ``w`` prior observations are NaN. The input frame is not modified.
    """
    out = d.copy()
    for width in windows:
        # Shift first so the current row's value is excluded from its own window.
        trailing = out[col].shift(1).rolling(width)
        out[f"{col}_roll_mean_{width}"] = trailing.mean()
        out[f"{col}_roll_std_{width}"] = trailing.std()
    return out

## Build feature table

In [3]:
# Run the feature builders in sequence: calendar fields first, then lagged
# and rolling statistics of the KPI itself.
df_feat = (
    df.pipe(add_calendar_features)
    .pipe(add_lags, "kpi_value", lags=(1, 7, 14))
    .pipe(add_rolls, "kpi_value", windows=(7, 14, 28))
)

# Binary promo indicator (1 when promo_index is positive); added only when
# the promo column is present in the data.
if "promo_index" in df_feat:
    df_feat["promo_flag"] = df_feat["promo_index"].gt(0).astype(int)

df_feat.tail(5)

Unnamed: 0,date,kpi_value,promo_index,baseline_trend,seasonality,is_anomaly,dow,month,day,is_weekend,kpi_value_lag_1,kpi_value_lag_7,kpi_value_lag_14,kpi_value_roll_mean_7,kpi_value_roll_std_7,kpi_value_roll_mean_14,kpi_value_roll_std_14,kpi_value_roll_mean_28,kpi_value_roll_std_28,promo_flag
2995,2024-03-14,145.396487,0.0,139.87996,5.206605,0,3,3,14,0,166.850431,140.35568,149.755839,143.808577,11.498869,142.226905,10.122389,142.110299,13.418334,0
2996,2024-03-15,135.734375,0.0,139.90997,-5.206605,0,4,3,15,0,145.396487,144.140002,137.213193,144.528692,11.404041,141.915522,9.938348,142.232856,13.43262,0
2997,2024-03-16,123.273803,0.0,139.93998,-11.699135,0,5,3,16,1,135.734375,134.096351,128.9756,143.327889,11.884222,141.809892,9.999839,142.559774,13.145904,0
2998,2024-03-17,131.924718,0.0,139.96999,-9.381978,0,6,3,17,1,123.273803,132.786415,132.618801,141.78181,13.830045,141.402621,10.657261,142.37381,13.387062,0
2999,2024-03-18,139.378964,0.0,140.0,0.0,0,0,3,18,0,131.924718,140.078077,135.739675,141.658711,13.926952,141.353044,10.702783,142.724739,12.957792,0


## Define target + drop NA rows created by lags/rolls

In [4]:
target_col = "kpi_value"

# Every column is a feature except the timestamp, the target itself and the
# anomaly label; the original column order is preserved.
feature_cols = [
    name
    for name in df_feat.columns
    if name not in {"date", target_col, "is_anomaly"}
]

# dropna() removes any row with a missing value in any column, which includes
# the leading rows where lag/rolling features are undefined.
df_model = df_feat.dropna().reset_index(drop=True)

print("Feature columns:", len(feature_cols))
df_model[[*feature_cols, target_col]].head()

Feature columns: 17


Unnamed: 0,promo_index,baseline_trend,seasonality,dow,month,day,is_weekend,kpi_value_lag_1,kpi_value_lag_7,kpi_value_lag_14,kpi_value_roll_mean_7,kpi_value_roll_std_7,kpi_value_roll_mean_14,kpi_value_roll_std_14,kpi_value_roll_mean_28,kpi_value_roll_std_28,promo_flag,kpi_value
0,0.0,50.84028,-5.206605,4,1,29,0,57.519667,44.5205,38.313864,49.124953,9.834182,48.720285,9.891477,50.839428,9.991112,0,43.23112
1,0.0,50.87029,-11.699135,5,1,30,1,43.23112,39.231198,36.501865,48.940756,9.946236,49.071518,9.575801,50.712673,10.066744,0,38.00438
2,0.0,50.9003,-9.381978,6,1,31,1,38.00438,35.60926,37.046858,48.765496,10.154469,49.17884,9.431419,50.236043,10.347474,0,39.111496
3,0.0,50.93031,0.0,0,2,1,0,39.111496,48.542709,51.767159,49.265816,9.460799,49.326314,9.241352,50.087568,10.480125,0,58.339423
4,0.0,50.96032,9.381978,1,2,2,0,58.339423,60.575918,56.290062,50.665346,10.042717,49.795762,9.53708,49.536175,9.55261,0,60.288309


## Save feature-ready dataset (optional but useful)

In [5]:
# Destination for the model-ready table, relative to the notebook's directory.
OUT_PATH = "../data/feature_table.parquet"

# Write without the positional index; it was reset and carries no information.
df_model.to_parquet(path=OUT_PATH, index=False)
print("Saved:", OUT_PATH)

Saved: ../data/feature_table.parquet
