# Business Enrichment & Dimension Modeling

## Objective
Transform developer-centric identifiers (Store IDs, Product IDs) into business-friendly dimensions 
(Country, Region, Product Category) to enable decision-focused analytics and international insights.

This notebook:
- Does NOT modify ML features
- Does NOT train models
- Focuses purely on business enrichment & aggregation

Output datasets from this notebook are consumed by:
- Backend APIs
- UI filters (Country, Category)
- Executive-level analytics views


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)


In [3]:


df = pd.read_csv("../data/processed/feature_engineered_data.csv", parse_dates=["week"])

print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

df.head()


Rows: 9900
Columns: 16


Unnamed: 0,store_id,product_id,week,weekly_units_sold,weekly_units_ordered,avg_inventory_level,avg_price,avg_discount,holiday_promotion,lag_1_units_sold,lag_2_units_sold,lag_4_units_sold,rolling_4wk_avg,rolling_8wk_avg,rolling_4wk_std,week_over_week_change
0,S001,P0001,2022-02-14,994,847,230.86,61.83,7.86,1,708.0,1032.0,1142.0,854.75,770.12,183.63,0.4
1,S001,P0001,2022-02-21,1222,875,275.43,37.28,12.86,1,994.0,708.0,685.0,989.0,896.88,212.23,0.23
2,S001,P0001,2022-02-28,1259,644,348.71,43.0,9.29,1,1222.0,994.0,1032.0,1045.75,966.0,253.83,0.03
3,S001,P0001,2022-03-07,983,990,326.14,67.16,11.43,1,1259.0,1222.0,708.0,1114.5,1003.12,146.34,-0.22
4,S001,P0001,2022-03-14,946,704,197.86,42.1,14.29,1,983.0,1259.0,994.0,1102.5,978.62,160.77,-0.04


## Data Validation

Before enrichment, we validate that:
- Store IDs and Product IDs are intentionally repeated (fact table behavior)
- Each row represents a store–product–week observation


In [4]:
df["store_id"].nunique(), df["product_id"].nunique()


(5, 20)

In [5]:
df["store_id"].unique()
df["product_id"].unique()


array(['P0001', 'P0002', 'P0003', 'P0004', 'P0005', 'P0006', 'P0007',
       'P0008', 'P0009', 'P0010', 'P0011', 'P0012', 'P0013', 'P0014',
       'P0015', 'P0016', 'P0017', 'P0018', 'P0019', 'P0020'], dtype=object)

In [6]:
store_dim = (
    df[["store_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

store_dim


Unnamed: 0,store_id
0,S001
1,S002
2,S003
3,S004
4,S005


In [7]:
store_dim["market"] = [
    "APAC",
    "APAC",
    "North America",
    "Europe",
    "Europe"
]

store_dim["market_type"] = [
    "Emerging",
    "Emerging",
    "Developed",
    "Developed",
    "Developed"
]


In [8]:
df = df.merge(store_dim[["store_id", "market", "market_type"]], on="store_id", how="left")


In [9]:
product_dim = (
    df[["product_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

product_dim


Unnamed: 0,product_id
0,P0001
1,P0002
2,P0003
3,P0004
4,P0005
5,P0006
6,P0007
7,P0008
8,P0009
9,P0010


In [10]:
product_dim["product_category"] = [
    "Consumer Electronics",
    "Consumer Electronics",
    "Home Appliances",
    "Home Appliances",
    "Smart Devices",
    "Smart Devices",
    "Personal Accessories",
    "Personal Accessories",
    "Wearables",
    "Wearables",
    "Office Electronics",
    "Office Electronics",
    "Audio Devices",
    "Audio Devices",
    "Kitchen Appliances",
    "Kitchen Appliances",
    "IoT Devices",
    "IoT Devices",
    "Misc Electronics",
    "Misc Electronics",
][:len(product_dim)]

product_dim


Unnamed: 0,product_id,product_category
0,P0001,Consumer Electronics
1,P0002,Consumer Electronics
2,P0003,Home Appliances
3,P0004,Home Appliances
4,P0005,Smart Devices
5,P0006,Smart Devices
6,P0007,Personal Accessories
7,P0008,Personal Accessories
8,P0009,Wearables
9,P0010,Wearables


In [11]:
df_enriched = (
    df
    .merge(store_dim, on="store_id", how="left")
    .merge(product_dim, on="product_id", how="left")
)

df_enriched.head()


Unnamed: 0,store_id,product_id,week,weekly_units_sold,weekly_units_ordered,avg_inventory_level,avg_price,avg_discount,holiday_promotion,lag_1_units_sold,lag_2_units_sold,lag_4_units_sold,rolling_4wk_avg,rolling_8wk_avg,rolling_4wk_std,week_over_week_change,market_x,market_type_x,market_y,market_type_y,product_category
0,S001,P0001,2022-02-14,994,847,230.86,61.83,7.86,1,708.0,1032.0,1142.0,854.75,770.12,183.63,0.4,APAC,Emerging,APAC,Emerging,Consumer Electronics
1,S001,P0001,2022-02-21,1222,875,275.43,37.28,12.86,1,994.0,708.0,685.0,989.0,896.88,212.23,0.23,APAC,Emerging,APAC,Emerging,Consumer Electronics
2,S001,P0001,2022-02-28,1259,644,348.71,43.0,9.29,1,1222.0,994.0,1032.0,1045.75,966.0,253.83,0.03,APAC,Emerging,APAC,Emerging,Consumer Electronics
3,S001,P0001,2022-03-07,983,990,326.14,67.16,11.43,1,1259.0,1222.0,708.0,1114.5,1003.12,146.34,-0.22,APAC,Emerging,APAC,Emerging,Consumer Electronics
4,S001,P0001,2022-03-14,946,704,197.86,42.1,14.29,1,983.0,1259.0,994.0,1102.5,978.62,160.77,-0.04,APAC,Emerging,APAC,Emerging,Consumer Electronics


In [12]:
df["market"].value_counts()


market
APAC             3960
Europe           3960
North America    1980
Name: count, dtype: int64

In [13]:
df_enriched.isnull().sum()


store_id                 0
product_id               0
week                     0
weekly_units_sold        0
weekly_units_ordered     0
avg_inventory_level      0
avg_price                0
avg_discount             0
holiday_promotion        0
lag_1_units_sold         0
lag_2_units_sold         0
lag_4_units_sold         0
rolling_4wk_avg          0
rolling_8wk_avg          0
rolling_4wk_std          0
week_over_week_change    0
market_x                 0
market_type_x            0
market_y                 0
market_type_y            0
product_category         0
dtype: int64

In [14]:
market_weekly_demand = (
    df
    .groupby(["market", "week"], as_index=False)
    .agg(
        total_units_sold=("weekly_units_sold", "sum"),
        avg_units_sold=("weekly_units_sold", "mean")
    )
)

market_weekly_demand.head()


Unnamed: 0,market,week,total_units_sold,avg_units_sold
0,APAC,2022-02-14,37297,932.42
1,APAC,2022-02-21,40821,1020.52
2,APAC,2022-02-28,39565,989.12
3,APAC,2022-03-07,39537,988.42
4,APAC,2022-03-14,35128,878.2


In [15]:
market_summary = (
    market_weekly_demand
    .groupby("market", as_index=False)
    .agg(
        avg_weekly_demand=("avg_units_sold", "mean"),
        total_demand=("total_units_sold", "sum"),
        demand_volatility=("avg_units_sold", "std")
    )
)

market_summary


Unnamed: 0,market,avg_weekly_demand,total_demand,demand_volatility
0,APAC,940.61,3724801,94.76
1,Europe,946.8,3749332,91.63
2,North America,960.62,1902018,103.72


In [16]:
median_demand = market_summary["avg_weekly_demand"].median()
median_volatility = market_summary["demand_volatility"].median()


In [17]:
def market_inventory_strategy(row):
    if row["avg_weekly_demand"] > median_demand * 1.1:
        return "High demand market — prioritize inventory expansion."
    elif row["demand_volatility"] > median_volatility * 1.1:
        return "Volatile demand — maintain higher safety buffers."
    else:
        return "Stable demand — optimize inventory to reduce holding cost."


In [18]:
market_summary["inventory_strategy"] = market_summary.apply(
    market_inventory_strategy, axis=1
)

market_summary


Unnamed: 0,market,avg_weekly_demand,total_demand,demand_volatility,inventory_strategy
0,APAC,940.61,3724801,94.76,Stable demand — optimize inventory to reduce h...
1,Europe,946.8,3749332,91.63,Stable demand — optimize inventory to reduce h...
2,North America,960.62,1902018,103.72,Stable demand — optimize inventory to reduce h...


In [19]:
market_comparison = market_summary[
    ["market", "avg_weekly_demand", "total_demand", "demand_volatility"]
].sort_values("avg_weekly_demand", ascending=False)

market_comparison


Unnamed: 0,market,avg_weekly_demand,total_demand,demand_volatility
2,North America,960.62,1902018,103.72
1,Europe,946.8,3749332,91.63
0,APAC,940.61,3724801,94.76


In [20]:
market_summary.assign(
    interpretation=lambda x: x["inventory_strategy"]
)[["market", "interpretation"]]


Unnamed: 0,market,interpretation
0,APAC,Stable demand — optimize inventory to reduce h...
1,Europe,Stable demand — optimize inventory to reduce h...
2,North America,Stable demand — optimize inventory to reduce h...


In [21]:
# Normalize components
market_summary["norm_demand"] = (
    market_summary["avg_weekly_demand"] /
    market_summary["avg_weekly_demand"].max()
)

market_summary["norm_volatility"] = (
    market_summary["demand_volatility"] /
    market_summary["demand_volatility"].max()
)

# Market Pressure Index (weighted)
market_summary["market_pressure_index"] = (
    0.7 * market_summary["norm_demand"] +
    0.3 * market_summary["norm_volatility"]
)

market_summary[
    ["market", "market_pressure_index"]
].sort_values("market_pressure_index", ascending=False)


Unnamed: 0,market,market_pressure_index
2,North America,1.0
0,APAC,0.96
1,Europe,0.95


In [22]:
OUTPUT_PATH = "../data/processed/"

market_weekly_demand.to_csv(
    f"{OUTPUT_PATH}market_weekly_demand.csv", index=False
)

market_summary.to_csv(
    f"{OUTPUT_PATH}market_summary.csv", index=False
)

market_comparison.to_csv(
    f"{OUTPUT_PATH}market_comparison.csv", index=False
)

print("Market-level intelligence datasets saved successfully.")


Market-level intelligence datasets saved successfully.
