In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import os
import random

# ===============================
# Synthetic Data Generation
# ===============================

# -------------------------------
# 1. Configuration & Data Loading
# -------------------------------

# Anchor all paths on the notebook's working directory (Jupyter sets this).
BASE_DIR = Path.cwd()

# The project root is the parent of the working directory (adjust if needed).
PROJECT_ROOT = BASE_DIR.parent

# Inputs are read from data/processed, results are written to data/generated.
base_path = PROJECT_ROOT.joinpath("data", "processed")
output_dir = PROJECT_ROOT.joinpath("data", "generated")

print("Loading parameters...")

# Per-length-bin rules: drive bin selection and the per-bin sampling ranges.
conditional_rules_df = pd.read_csv(base_path / "conditional_rules.csv")

# Ratio quantiles: used to keep sampled dimensions within sane proportions.
with (base_path / "ratio_stats.json").open("r") as f:
    ratio_stats = json.load(f)

# Summary statistics (row label in the first column): feed the distributions.
summary_stats_df = pd.read_csv(base_path / "summary_stats.csv", index_col=0)

print("Parameters loaded.")

# -------------------------------
# 2. Statistical Prep (Log-Normal)
# -------------------------------
def get_lognorm_params(row_name, stats_df=None):
    """
    Converts the arithmetic Mean and Std of one summary-stats row into the
    Mu and Sigma parameters of a LogNormal distribution.

    With m the arithmetic mean and v the variance (std ** 2):
        mu    = ln(m^2 / sqrt(v + m^2))
        sigma = sqrt(ln((v + m^2) / m^2))

    Args:
        row_name: Index label of the row to read.
        stats_df: Optional DataFrame providing "mean", "std", "min" and "max"
            columns. Defaults to the module-level ``summary_stats_df``.

    Returns:
        Dict with keys "mu", "sigma", "min" and "max" ("min"/"max" are passed
        through unchanged from the row).

    Raises:
        KeyError: If ``row_name`` or a required column is missing.
        ValueError: If the row's mean is not strictly positive (or is NaN);
            the log-space conversion is undefined in that case.
    """
    if stats_df is None:
        stats_df = summary_stats_df

    stats = stats_df.loc[row_name]
    m = stats["mean"]

    # A zero/negative/NaN mean would silently yield NaN or -inf parameters
    # that only blow up later when samples are cast to int; fail here instead.
    if not m > 0:
        raise ValueError(
            f"Cannot derive log-normal parameters for '{row_name}': "
            f"mean must be > 0, got {m}"
        )

    v = stats["std"] ** 2

    # Formula to convert arithmetic stats to log-space stats
    phi = np.sqrt(v + m**2)
    mu = np.log(m**2 / phi)
    sigma = np.sqrt(np.log(phi**2 / m**2))

    return {
        "mu": mu,
        "sigma": sigma,
        "min": stats["min"],
        "max": stats["max"],
    }

# Log-normal parameters for the three skewed count fields, computed once up front
qty_params, box_params, dem_params = (
    get_lognorm_params(field)
    for field in ("QTY_PER_BOX", "BOXES_ON_HAND", "DEMAND")
)

# Selection probability of each rule row: its share of the total ROW_COUNT
bin_weights = (
    conditional_rules_df["ROW_COUNT"]
    / conditional_rules_df["ROW_COUNT"].sum()
)

# -------------------------------
# 3. Generator Functions
# -------------------------------

def _generate_single_row(item_id):
    """
    Build one synthetic item record as a dict.

    Steps, in order (the order fixes how the global NumPy RNG is consumed):
      1. Pick a length bin, weighted by ``bin_weights``.
      2. Draw length, width, depth and weight uniformly from that bin's ranges.
      3. Clip width, depth and weight against the ratio bounds in ``ratio_stats``.
      4. Draw QTY_PER_BOX, BOXES_ON_HAND and DEMAND from log-normal
         distributions, rounded and clipped to their observed min/max.

    ``item_id`` is stringified for ITEM_ID and embedded in ITEM_DESC.
    """
    # ----- Weighted pick of a length bin, then fetch its first matching rule row -----
    picked_bin = np.random.choice(conditional_rules_df["LEN_BIN"], p=bin_weights)
    is_picked = conditional_rules_df["LEN_BIN"] == picked_bin
    rule = conditional_rules_df[is_picked].iloc[0]

    # ----- Raw dimensions: uniform draws inside the bin's ranges -----
    length = np.random.uniform(rule["LEN_MIN"], rule["LEN_MAX"])
    width = np.random.uniform(rule["WID_Q10"], rule["WID_Q90"])
    depth = np.random.uniform(rule["DEP_Q10"], rule["DEP_Q90"])
    weight = np.random.uniform(rule["WT_Q10"], rule["WT_Q90"])

    # ----- Keep proportions plausible -----
    # Width is bounded relative to length, depth relative to the (already
    # clipped) width, and weight relative to the resulting volume.
    width = np.clip(
        width,
        ratio_stats["W_L_RATIO_Q10"] * length,
        ratio_stats["W_L_RATIO_Q90"] * length,
    )
    depth = np.clip(
        depth,
        ratio_stats["D_W_RATIO_Q10"] * width,
        ratio_stats["D_W_RATIO_Q90"] * width,
    )
    volume = length * width * depth
    weight = np.clip(
        weight,
        ratio_stats["WT_VOL_RATIO_Q10"] * volume,
        ratio_stats["WT_VOL_RATIO_Q90"] * volume,
    )

    # ----- Inventory / demand counts: log-normal draw -> round -> clip -> int -----
    count_fields = {}
    for column, params in (
        ("QTY_PER_BOX", qty_params),
        ("BOXES_ON_HAND", box_params),
        ("DEMAND", dem_params),
    ):
        draw = np.random.lognormal(params["mu"], params["sigma"])
        bounded = np.clip(np.round(draw), params["min"], params["max"])
        count_fields[column] = int(bounded)

    # ----- Assemble the record -----
    # Linear dimensions are scaled by 10 for the *_MM columns
    # (presumably cm -> mm; confirm against the source data units).
    record = {
        "ITEM_ID": str(item_id),
        "ITEM_DESC": f"SYNTH_PART_{item_id}",
        "LEN_MM": round(length * 10, 1),
        "WID_MM": round(width * 10, 1),
        "DEP_MM": round(depth * 10, 1),
        "WT_KG": round(weight, 3),
    }
    record.update(count_fields)
    return record

def generate_synthetic_dataset(num_records):
    """
    Main function to generate a dataset of size num_records.
    Guarantees unique numeric 8-digit IDs.

    Args:
        num_records: Number of rows to generate (non-negative integer).

    Returns:
        pandas DataFrame with one row per generated item. The column set is
        fixed, so an empty request still yields a frame with all nine columns.

    Raises:
        ValueError: If num_records is negative, or exceeds the number of
            available 8-digit IDs (raised by random.sample).
    """
    if num_records < 0:
        raise ValueError(f"num_records must be non-negative, got {num_records}")

    # Must stay in sync with the keys returned by _generate_single_row.
    columns = [
        "ITEM_ID",
        "ITEM_DESC",
        "LEN_MM",
        "WID_MM",
        "DEP_MM",
        "WT_KG",
        "QTY_PER_BOX",
        "BOXES_ON_HAND",
        "DEMAND",
    ]

    print(f"Generating {num_records} unique records...")

    # 1. Generate a pool of UNIQUE 8-digit IDs: 10_000_000..99_999_999 inclusive.
    # range() excludes its stop value, so the stop must be 100_000_000 for the
    # largest 8-digit ID to be eligible. random.sample draws without
    # replacement, so no duplicates are possible.
    id_pool = random.sample(range(10_000_000, 100_000_000), num_records)

    # 2. Generate data rows
    data = [_generate_single_row(uid) for uid in id_pool]

    # 3. Convert to DataFrame; explicit columns preserve the schema (and thus
    # the CSV header) even when no rows were requested.
    df = pd.DataFrame(data, columns=columns)

    print(f"Generation complete. Shape: {df.shape}")
    return df

# -------------------------------
# 4. Execution
# -------------------------------
# How many synthetic records to produce
N = 100

# Build the dataset
df_synth = generate_synthetic_dataset(N)

# Write it out as a semicolon-separated CSV, creating the folder on demand
output_dir.mkdir(parents=True, exist_ok=True)
output_path = str(output_dir / "synthetic_parts_generated.csv")

df_synth.to_csv(output_path, sep=";", index=False)

# Report the destination and preview the first rows
print(f"Saved to: {output_path}")
print("-" * 30)
print(df_synth.head(10))


Loading parameters...
Parameters loaded.
Generating 100 unique records...
Generation complete. Shape: (100, 9)
Saved to: C:\Users\arthu\Desktop\DIT\CS IntSys\REPO\IntelligentSystemsCaseStudy\Prototype - Phase 1\Dataset Generation\01_prototype\V1.1\data\generated\synthetic_parts_generated.csv
------------------------------
    ITEM_ID            ITEM_DESC  LEN_MM  WID_MM  DEP_MM  WT_KG  QTY_PER_BOX  \
0  93091298  SYNTH_PART_93091298   726.5   485.4   107.1  8.377            1   
1  23196775  SYNTH_PART_23196775    57.5    57.5    57.3  0.171            1   
2  75845071  SYNTH_PART_75845071   156.7    62.3    62.3  0.763            9   
3  91493741  SYNTH_PART_91493741   444.2   168.6   168.6  1.670            1   
4  38029487  SYNTH_PART_38029487   393.9   242.0   112.6  8.433            1   
5  59592065  SYNTH_PART_59592065    81.2    81.2    73.6  0.662            1   
6  79272403  SYNTH_PART_79272403   941.0   646.8   129.4  5.104            2   
7  49347961  SYNTH_PART_49347961   7