In [None]:
# pip install sdv pandas numpy

In [None]:
%pip install -U sdv


Collecting sdv
  Downloading sdv-1.32.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.16-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.16-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.1 (from sdv)
  Downloading ctgan-0.11.1-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.18.2 (from sdv)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.24.0-py3-none-any.whl.metadata (9.3 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [None]:
import sdv
# Print the installed SDV version so the environment used for this run is recorded.
print(sdv.__version__)


1.32.0


In [None]:
import pandas as pd
import numpy as np

from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer


Store Master

In [None]:
# Size of the product catalogue.
NUM_SKUS = 50

# SKU identifiers are 1-based: "SKU_1" through "SKU_<NUM_SKUS>".
SKU_LIST = ["SKU_" + str(sku_number) for sku_number in range(1, NUM_SKUS + 1)]

# Product categories that SKUs are assigned to.
CATEGORY_LIST = [
    "Bev", "Snack", "Dairy", "Personal Care", "Home Care",
    "Confectionery", "Health", "Frozen", "Bakery", "Grocery",
]


In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG; every np.random draw in this notebook consumes this stream.
np.random.seed(42)

# Number of stores to generate. Single source of truth for every column length below,
# so the table can be resized by changing one value.
NUM_STORES = 50

# Store master: one row per store with region, segment and visit-planning attributes.
# Columns are drawn in dict order, which fixes how the seeded RNG stream is consumed.
stores = pd.DataFrame({
    "Store_ID": [f"S{i:03d}" for i in range(1, NUM_STORES + 1)],
    "Store_Name": [f"Store_{i}" for i in range(1, NUM_STORES + 1)],
    "Region": np.random.choice(["North", "South", "East", "West"], NUM_STORES),
    "Segment": np.random.choice(["Gold", "Silver", "Bronze"], NUM_STORES, p=[0.3, 0.4, 0.3]),
    "Revenue_L3M": np.random.uniform(4000, 15000, NUM_STORES).round(0),
    "YoY_Growth_%": np.random.uniform(-5, 15, NUM_STORES).round(2),
    "Market_Share_%": np.random.uniform(5, 25, NUM_STORES).round(1),
    "Last_Visit_Days_Ago": np.random.randint(1, 60, NUM_STORES),
    "Visit_Frequency_Target": np.random.choice(
        ["1 (per month)", "2 (per month)", "4 (per month)"], NUM_STORES
    ),
})

# Segment <-> revenue sanity: scale Gold revenue up by 20% and Bronze down by 20%,
# then round again because the scaling reintroduces fractional values.
stores.loc[stores["Segment"] == "Gold", "Revenue_L3M"] *= 1.2
stores.loc[stores["Segment"] == "Bronze", "Revenue_L3M"] *= 0.8
stores["Revenue_L3M"] = stores["Revenue_L3M"].round(0)


In [None]:
# Value pools (and sampling weights, where skewed) for the categorical product attributes.
FOCUS_FLAGS = ["Yes", "No"]
FOCUS_WEIGHTS = [0.2, 0.8]  # about one SKU in five is a focus product
PROMO_TYPES = ["None", "10% Off", "Buy 5 Get 1", "Buy 2 Get 1"]
PROMO_WEIGHTS = [0.6, 0.15, 0.15, 0.10]
MARGIN_STORIES = [
    "High volume driver",
    "Impulse purchase",
    "Premium growth SKU",
    "Seasonal demand",
]

# Product master: one row per SKU with an auto-generated name and random attributes.
# The random columns are drawn in dict order (Category, Focus_Product?, Active_Promo,
# Margin_Story), which fixes how the global NumPy RNG stream is consumed.
product_master = pd.DataFrame({
    "SKU_ID": SKU_LIST,
    "Product_Name": ["Product_" + str(n) for n in range(1, NUM_SKUS + 1)],
    "Category": np.random.choice(CATEGORY_LIST, NUM_SKUS),
    "Focus_Product?": np.random.choice(FOCUS_FLAGS, NUM_SKUS, p=FOCUS_WEIGHTS),
    "Active_Promo": np.random.choice(PROMO_TYPES, NUM_SKUS, p=PROMO_WEIGHTS),
    "Margin_Story": np.random.choice(MARGIN_STORIES, NUM_SKUS),
})


In [None]:
# List prices are multiples of 10, from 20 to 200 inclusive.
price_points = np.arange(20, 201, 10)

# Pricing table: one row per SKU with list price, cost and discount guardrails.
# Random columns are drawn in dict order from the global NumPy RNG.
pricing = pd.DataFrame({
    "SKU_ID": SKU_LIST,
    "List_Price": np.random.choice(price_points, NUM_SKUS),
    "COGS (Cost)": np.random.randint(10, 120, NUM_SKUS),
    "Min_Acceptable_Margin_%": np.random.choice([15, 18, 20, 22, 25], NUM_SKUS),
    "FSR_Max_Disc_%": np.random.choice([5, 6, 7, 8], NUM_SKUS),
    "Mgr_Approval_Disc_%": np.random.choice([10, 12, 15], NUM_SKUS),
})

# Pricing sanity: cost may not exceed 80% of the list price.
# NOTE(review): an 80% cap leaves a 20% margin, which is below the 22 and 25 values
# that Min_Acceptable_Margin_% can take. Confirm whether that is intended.
cogs_ceiling = pricing["List_Price"] * 0.8
pricing["COGS (Cost)"] = np.minimum(pricing["COGS (Cost)"], cogs_ceiling)

# The field-rep discount limit may never exceed the manager-approval limit.
fsr_limit = pricing["FSR_Max_Disc_%"]
manager_limit = pricing["Mgr_Approval_Disc_%"]
pricing["FSR_Max_Disc_%"] = np.minimum(fsr_limit, manager_limit)


In [None]:
# Daily calendar that purchase dates are sampled from.
dates = pd.date_range("2023-07-01", "2024-06-30")

# Number of raw transaction rows to draw.
N_TRANSACTIONS = 12000

# Random store/SKU purchase records, with Category looked up from product_master
# (a left join on SKU_ID keeps every transaction row).
transactions = pd.DataFrame({
    "Store_ID": np.random.choice(stores.Store_ID, N_TRANSACTIONS),
    "SKU_ID": np.random.choice(SKU_LIST, N_TRANSACTIONS),
    "Last_Purchase_Date": np.random.choice(dates, N_TRANSACTIONS),
    "Avg_Order_Qty": np.random.poisson(30, N_TRANSACTIONS),
    "Last_Order_Qty": np.random.poisson(25, N_TRANSACTIONS),
}).merge(
    product_master[["SKU_ID", "Category"]],
    on="SKU_ID",
    how="left",
)

# Raise the average order quantity wherever it would fall below the last order quantity.
avg_qty = transactions["Avg_Order_Qty"]
last_qty = transactions["Last_Order_Qty"]
transactions["Avg_Order_Qty"] = np.maximum(avg_qty, last_qty)


In [None]:
# Number of inventory snapshot rows to draw.
N_INVENTORY_ROWS = 6000

# Weekly snapshot dates over the same period the transactions cover.
snapshot_dates = pd.date_range("2023-07-01", "2024-06-30", freq="W")

# Inventory snapshots: random store/SKU pairs with a stock estimate and reorder level.
# A store/SKU pair can appear in many rows, because keys are sampled with replacement.
inventory = pd.DataFrame({
    "Store_ID": np.random.choice(stores.Store_ID, N_INVENTORY_ROWS),
    "SKU_ID": np.random.choice(SKU_LIST, N_INVENTORY_ROWS),
    "Inventory_Date": np.random.choice(snapshot_dates, N_INVENTORY_ROWS),
    "Est_Current_Stock": np.random.randint(5, 60, N_INVENTORY_ROWS),
    "Reorder_Level": np.random.randint(10, 30, N_INVENTORY_ROWS),
})

# Keep the reorder level strictly below the current stock, and never negative.
stock_minus_one = inventory["Est_Current_Stock"] - 1
inventory["Reorder_Level"] = np.minimum(
    inventory["Reorder_Level"], stock_minus_one
).clip(lower=0)


In [None]:
# Ensure sales happen on or before an inventory snapshot for the same store/SKU.
#
# `inventory` holds many snapshot rows per (Store_ID, SKU_ID), so joining it directly
# is a many-to-many merge that copies each transaction once per matching snapshot.
# Collapsing inventory to the latest snapshot per key first keeps the join many-to-one:
# a transaction survives exactly when some snapshot is dated on or after it, and it
# survives only once.
latest_snapshot = (
    inventory
    .groupby(["Store_ID", "SKU_ID"], as_index=False)["Inventory_Date"]
    .max()
)

# validate="many_to_one" makes pandas raise if the right side ever has duplicate keys,
# so the row explosion cannot come back silently.
transactions_with_snapshot = transactions.merge(
    latest_snapshot,
    on=["Store_ID", "SKU_ID"],
    how="left",
    validate="many_to_one",
)

# Store/SKU pairs with no snapshot get NaT here; comparing against NaT is False,
# so those transactions are dropped along with the ones dated after the last snapshot.
has_later_snapshot = (
    transactions_with_snapshot["Last_Purchase_Date"]
    <= transactions_with_snapshot["Inventory_Date"]
)

# The helper column is dropped again, so re-running this cell gives the same result.
transactions = transactions_with_snapshot[has_later_snapshot].drop(
    columns=["Inventory_Date"]
)


In [None]:
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer

# Source tables keyed by table name; the same mapping is used for
# metadata detection and for fitting.
real_tables = {
    "stores": stores,
    "product_master": product_master,
    "pricing": pricing,
    "transactions": transactions,
    "inventory": inventory,
}

# Detect the multi-table metadata from the dataframes and validate it before modelling.
metadata = MultiTableMetadata()
metadata.detect_from_dataframes(real_tables)
metadata.validate()

# Fit the hierarchical synthesizer on the real tables.
synthesizer = HMASynthesizer(metadata)
synthesizer.fit(real_tables)


Preprocess Tables: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:03<00:00,  1.56it/s]



Learning relationships:


(1/4) Tables 'stores' and 'transactions' ('Store_ID'): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 62.29it/s]
(2/4) Tables 'stores' and 'inventory' ('Store_ID'): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 87.63it/s]
(3/4) Tables 'product_master' and 'transactions' ('SKU_ID'): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 99.48it/s] 
(4/4) Tables 'product_master' and 'inventory' ('SKU_ID'): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 136.76it/s]





Modeling Tables: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:01<00:00,  4.44it/s]


In [None]:
from tqdm import tqdm

# Number of separate sampling passes; each pass samples at the original data scale.
N_CHUNKS = 3

# NOTE(review): each pass is a separate sample() call, so key values may repeat
# across chunks once concatenated. Confirm, or sample once with a larger scale.
chunks = [
    synthesizer.sample(scale=1)
    for _ in tqdm(range(N_CHUNKS), desc="Generating synthetic data")
]

# Stack the passes table by table, renumbering rows from zero.
table_names = list(chunks[0])
synthetic = {
    name: pd.concat([chunk[name] for chunk in chunks], ignore_index=True)
    for name in table_names
}


Generating synthetic data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:56<00:00, 18.84s/it]


In [None]:
# Focus-product split among SKUs whose Active_Promo is the literal string "None".
no_promo_mask = product_master["Active_Promo"] == "None"
product_master.loc[no_promo_mask, "Focus_Product?"].value_counts()

Unnamed: 0_level_0,count
Focus_Product?,Unnamed: 1_level_1
No,26
Yes,2


In [None]:
# Display the product master table generated above.
product_master

Unnamed: 0,SKU_ID,Product_Name,Category,Focus_Product?,Active_Promo,Margin_Story
0,SKU_1,Product_1,Grocery,Yes,Buy 5 Get 1,High volume driver
1,SKU_2,Product_2,Personal Care,No,Buy 2 Get 1,High volume driver
2,SKU_3,Product_3,Snack,No,,Impulse purchase
3,SKU_4,Product_4,Personal Care,No,Buy 2 Get 1,High volume driver
4,SKU_5,Product_5,Grocery,Yes,Buy 5 Get 1,High volume driver
5,SKU_6,Product_6,Frozen,Yes,Buy 2 Get 1,Impulse purchase
6,SKU_7,Product_7,Frozen,No,Buy 5 Get 1,Impulse purchase
7,SKU_8,Product_8,Dairy,No,,Seasonal demand
8,SKU_9,Product_9,Home Care,No,10% Off,Premium growth SKU
9,SKU_10,Product_10,Confectionery,No,,Premium growth SKU


In [None]:
# Persist the store master to CSV without the index column. One write is enough:
# a second identical call only rewrites the same file with the same content.
# NOTE(review): no other table is exported in this notebook. Confirm whether the
# repeated line was meant to write a different table.
stores.to_csv("stores.csv", index=False)


In [None]:
# Preview the first rows of every source table under a plain-text heading.
table_previews = [
    ("STORE MASTER", stores),
    ("PRODUCT MASTER", product_master),
    ("PRICING", pricing),
    ("TRANSACTIONS", transactions),
    ("INVENTORY", inventory),
]

for heading, frame in table_previews:
    print(heading)
    display(frame.head())


STORE MASTER


Unnamed: 0,Store_ID,Store_Name,Region,Segment,Revenue_L3M,YoY_Growth_%,Market_Share_%,Last_Visit_Days_Ago,Visit_Frequency_Target
0,S001,Store_1,East,Bronze,9615.0,3.54,8.5,46,2 (per month)
1,S002,Store_2,West,Gold,14981.0,11.36,18.8,30,4 (per month)
2,S003,Store_3,North,Silver,4814.0,12.21,12.7,38,2 (per month)
3,S004,Store_4,East,Silver,7943.0,-4.86,23.7,38,2 (per month)
4,S005,Store_5,East,Gold,6330.0,5.21,7.8,45,4 (per month)


PRODUCT MASTER


Unnamed: 0,SKU_ID,Product_Name,Category,Focus_Product?,Active_Promo,Margin_Story
0,SKU_1,Product_1,Grocery,Yes,Buy 5 Get 1,High volume driver
1,SKU_2,Product_2,Personal Care,No,Buy 2 Get 1,High volume driver
2,SKU_3,Product_3,Snack,No,,Impulse purchase
3,SKU_4,Product_4,Personal Care,No,Buy 2 Get 1,High volume driver
4,SKU_5,Product_5,Grocery,Yes,Buy 5 Get 1,High volume driver


PRICING


Unnamed: 0,SKU_ID,List_Price,COGS (Cost),Min_Acceptable_Margin_%,FSR_Max_Disc_%,Mgr_Approval_Disc_%
0,SKU_1,50,40.0,18,5,15
1,SKU_2,110,88.0,25,7,12
2,SKU_3,40,32.0,22,5,10
3,SKU_4,20,16.0,20,5,15
4,SKU_5,130,104.0,22,6,15


TRANSACTIONS


Unnamed: 0,Store_ID,SKU_ID,Last_Purchase_Date,Avg_Order_Qty,Last_Order_Qty,Category
2,S022,SKU_3,2024-03-03,28,24,Snack
4,S022,SKU_43,2023-07-04,36,36,Grocery
5,S022,SKU_43,2023-07-04,36,36,Grocery
6,S022,SKU_43,2023-07-04,36,36,Grocery
8,S042,SKU_49,2024-06-13,32,24,Personal Care


INVENTORY


Unnamed: 0,Store_ID,SKU_ID,Inventory_Date,Est_Current_Stock,Reorder_Level
0,S035,SKU_10,2023-12-31,41,11
1,S003,SKU_6,2024-03-03,15,14
2,S030,SKU_13,2024-05-26,29,24
3,S048,SKU_6,2024-05-26,30,16
4,S025,SKU_36,2023-08-27,14,13


In [None]:
# Display the transactions table as it stands after the inventory-date filter.
transactions

Unnamed: 0,Store_ID,SKU_ID,Last_Purchase_Date,Avg_Order_Qty,Last_Order_Qty,Category
2,S022,SKU_3,2024-03-03,28,24,Snack
4,S022,SKU_43,2023-07-04,36,36,Grocery
5,S022,SKU_43,2023-07-04,36,36,Grocery
6,S022,SKU_43,2023-07-04,36,36,Grocery
8,S042,SKU_49,2024-06-13,32,24,Personal Care
...,...,...,...,...,...,...
30039,S026,SKU_43,2023-09-16,27,16,Grocery
30040,S026,SKU_43,2023-09-16,27,16,Grocery
30041,S026,SKU_43,2023-09-16,27,16,Grocery
30044,S030,SKU_9,2023-11-18,25,24,Home Care


In [None]:
# Display the product master table again (same expression as an earlier cell).
product_master

Unnamed: 0,SKU_ID,Product_Name,Category,Focus_Product?,Active_Promo,Margin_Story
0,SKU_1,Product_1,Grocery,Yes,Buy 5 Get 1,High volume driver
1,SKU_2,Product_2,Personal Care,No,Buy 2 Get 1,High volume driver
2,SKU_3,Product_3,Snack,No,,Impulse purchase
3,SKU_4,Product_4,Personal Care,No,Buy 2 Get 1,High volume driver
4,SKU_5,Product_5,Grocery,Yes,Buy 5 Get 1,High volume driver
5,SKU_6,Product_6,Frozen,Yes,Buy 2 Get 1,Impulse purchase
6,SKU_7,Product_7,Frozen,No,Buy 5 Get 1,Impulse purchase
7,SKU_8,Product_8,Dairy,No,,Seasonal demand
8,SKU_9,Product_9,Home Care,No,10% Off,Premium growth SKU
9,SKU_10,Product_10,Confectionery,No,,Premium growth SKU


In [None]:
# Show transactions with exact duplicate rows removed. The result is only displayed;
# it is not assigned back, so `transactions` itself is unchanged.
transactions.drop_duplicates()

Unnamed: 0,Store_ID,SKU_ID,Last_Purchase_Date,Avg_Order_Qty,Last_Order_Qty,Category
2,S022,SKU_3,2024-03-03,28,24,Snack
4,S022,SKU_43,2023-07-04,36,36,Grocery
8,S042,SKU_49,2024-06-13,32,24,Personal Care
17,S043,SKU_42,2023-12-20,32,32,Bakery
20,S037,SKU_13,2024-01-09,29,21,Bev
...,...,...,...,...,...,...
30034,S026,SKU_39,2024-03-01,33,33,Personal Care
30037,S019,SKU_47,2024-01-11,34,26,Grocery
30039,S026,SKU_43,2023-09-16,27,16,Grocery
30044,S030,SKU_9,2023-11-18,25,24,Home Care


In [None]:
# Snapshot history for one store/SKU pair, oldest first.
# SKU ids in this notebook are generated as "SKU_<n>" (SKU_1 ... SKU_50), so the
# filter has to use that form; a lettered id such as "SKU_D" never matches any row.
inventory[
    (inventory["Store_ID"] == "S023") & (inventory["SKU_ID"] == "SKU_4")
].sort_values(by=["Inventory_Date"], ascending=True)

Unnamed: 0,Store_ID,SKU_ID,Inventory_Date,Est_Current_Stock,Reorder_Level
3820,S023,SKU_D,2023-07-02,49,19
2734,S023,SKU_D,2023-07-02,6,5
1176,S023,SKU_D,2023-07-30,39,20
1472,S023,SKU_D,2023-08-27,20,11
1055,S023,SKU_D,2023-09-03,57,17
4779,S023,SKU_D,2023-09-03,22,21
2501,S023,SKU_D,2023-09-03,16,15
3917,S023,SKU_D,2023-09-17,46,26
4597,S023,SKU_D,2023-09-24,29,22
4709,S023,SKU_D,2023-10-01,5,4
