# Feature Engineering

In [4]:
import pandas as pd
from pathlib import Path

# Project layout: the notebook is expected to run from a folder one level
# below the project root, with all data kept under <root>/data/.
ROOT = Path.cwd().parent
RAW = ROOT.joinpath("data", "raw")          # source files as delivered
CLEAN = ROOT.joinpath("data", "clean")      # cleaned customer / product tables
OUT = ROOT.joinpath("data", "features")     # destination for engineered features

# Transaction log, with `Date` parsed to datetime.
tx = pd.read_csv(
    RAW.joinpath("transactions_log.csv"),
    parse_dates=["Date"],
)
# Customer table, with `Customer_Since` parsed to datetime.
cust = pd.read_csv(
    CLEAN.joinpath("cleaned_customers_v1.csv"),
    parse_dates=["Customer_Since"],
)
# Product table (read with default dtypes).
prod = pd.read_csv(CLEAN.joinpath("cleaned_products_v1.csv"))


In [5]:
# --- RFM (recency / frequency / monetary) features, one row per customer ---

# Reference point for recency: the latest date present in the transaction log.
last_date = tx['Date'].max()

# recency   = whole days between the customer's latest transaction and last_date
# frequency = number of transaction rows recorded for the customer
rfm = tx.groupby('CustomerID').agg(
    recency=('Date', lambda dates: (last_date - dates.max()).days),
    frequency=('Date', 'size')
).reset_index()

# monetary = sum of Unit_Price * Quantity over the customer's transactions.
#
# The product table is collapsed to ONE price per SKU before joining. Merging
# the raw table would repeat every matching transaction once per product row
# whenever a SKU is listed more than once, silently inflating `monetary`.
# Averaging mirrors how `avg_price` is derived per SKU elsewhere in this
# notebook. When every SKU is already unique the values are unchanged.
#
# Both joins are inner joins: transactions whose SKU has no product row add
# nothing to spend, and customers without any priced transaction are not
# carried into `rfm`.
spend = (
    tx.merge(
          prod.groupby('SKU', as_index=False)['Unit_Price'].mean(),
          on='SKU',
      )
      .assign(spend=lambda df: df['Unit_Price'] * df['Quantity'])
      .groupby('CustomerID')['spend']
      .sum()
      .reset_index(name='monetary')
)
rfm = rfm.merge(spend, on='CustomerID')


In [6]:
# Per-customer purchase counts by product class: one row per CustomerID and
# one `class_<Rev_GL_Class>` column per class, holding the number of
# transaction rows the customer has in that class (0 when there are none).
#
# Repeated (SKU, Rev_GL_Class) pairs are dropped from the lookup before the
# join. Otherwise a SKU listed several times in the product table would
# repeat each matching transaction and inflate the counts. When the pairs are
# already unique the result is unchanged.
# NOTE(review): a SKU mapped to two *different* classes is still counted once
# in each class — confirm whether that can occur in the product table.
cat_counts = (
    tx.merge(prod[["SKU","Rev_GL_Class"]].drop_duplicates(), on="SKU")  # inner join on SKU
      .groupby(["CustomerID","Rev_GL_Class"])
      .size()                    # rows per (customer, class)
      .unstack(fill_value=0)     # classes become columns; missing combos -> 0
      .add_prefix("class_")
      .reset_index()
)


In [7]:
# Per-SKU statistics taken from the transaction log:
#   interaction_count - number of transaction rows for the SKU
#   avg_quantity      - mean Quantity over those rows
sku_summary = tx.groupby("SKU").agg(
    interaction_count=pd.NamedAgg(column="SKU", aggfunc="size"),
    avg_quantity=pd.NamedAgg(column="Quantity", aggfunc="mean"),
)
sku_summary = sku_summary.reset_index()

# Mean Unit_Price per SKU from the product table, exposed as `avg_price`.
price_summary = (
    prod.groupby("SKU", as_index=False)["Unit_Price"]
        .mean()
        .rename(columns={"Unit_Price": "avg_price"})
)

# Inner join: only SKUs present in both the log and the product table remain.
sku_summary = pd.merge(sku_summary, price_summary, on="SKU")


In [8]:
# Customer x SKU interactions: total Quantity for each (CustomerID, SKU) pair.
interactions = (
    tx.groupby(["CustomerID","SKU"])
      .agg(quantity=("Quantity", "sum"))
      .reset_index()
)

# Attach customer-level features (RFM, class counts) and SKU-level features.
# Every step is an inner join, so a pair is kept only when its customer is in
# both `rfm` and `cat_counts` and its SKU is in `sku_summary`.
features = pd.merge(interactions, rfm, on="CustomerID")
features = pd.merge(features, cat_counts, on="CustomerID")
features = pd.merge(features, sku_summary, on="SKU")

# Persist the first version of the feature table (row index not written).
OUT.mkdir(parents=True, exist_ok=True)
features.to_csv(OUT.joinpath("features_table.csv"), index=False)


In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths and inputs are set up again here, and the feature table is read back
# from disk, so this cell starts from the saved `features_table.csv` rather
# than from in-memory state.
ROOT = Path.cwd().parent
RAW = ROOT.joinpath("data", "raw")
CLEAN = ROOT.joinpath("data", "clean")
OUT = ROOT.joinpath("data", "features")

tx = pd.read_csv(RAW.joinpath("transactions_log.csv"), parse_dates=["Date"])
cust = pd.read_csv(
    CLEAN.joinpath("cleaned_customers_v1.csv"),
    parse_dates=["Customer_Since"],
)
prod = pd.read_csv(CLEAN.joinpath("cleaned_products_v1.csv"))
features = pd.read_csv(OUT.joinpath("features_table.csv"))

# Latest transaction date; used as the reference point for tenure.
last_date = tx["Date"].max()
# NOTE(review): these two values are not derived anywhere in the notebook;
# c_ui looks like a confidence weight of the form 1 + alpha * r — confirm the
# intended source and tuning of both constants.
alpha = 40
decay_rate = 0.01

# c_ui             = 1 + alpha * quantity
# decayed_quantity = quantity * exp(-decay_rate * recency)
features = features.assign(
    c_ui=lambda df: 1 + alpha * df["quantity"],
    decayed_quantity=lambda df: df["quantity"] * np.exp(-decay_rate * df["recency"]),
)

# Bring in each customer's start date (inner join: rows whose CustomerID is
# missing from the customer table are dropped) and derive tenure in days.
features = pd.merge(
    features,
    cust[["CustomerID","Customer_Since"]],
    on="CustomerID",
)
features = features.assign(
    tenure_days=lambda df: (last_date - df["Customer_Since"]).dt.days
)

# log(1 + x) versions of the listed columns.
for col in ["monetary","interaction_count","avg_price"]:
    features[f"log_{col}"] = features[col].pipe(np.log1p)

# Persist the extended table as version 2 (row index not written).
OUT.mkdir(parents=True, exist_ok=True)
features.to_csv(OUT.joinpath("features_table_v2.csv"), index=False)
