In [None]:
# import pandas as pd
# import numpy as np
# import random

# # Seed for reproducibility
# np.random.seed(42)

# # Row count
# n_rows = 100000

# # Options for categorical variables
# months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# regions = ['Region A', 'Region B', 'Region C', 'Region D']
# product_categories = ['Electronics', 'Furniture', 'Chemicals', 'Food', 'Textiles']
# transfer_flags = ['Plant-to-Plant', 'External Customer']
# pickup_or_delivery_options = ['Pickup', 'Delivery']
# distance_buckets = ['0-5', '5-10', '10-15', '15-20', '20+']

# # Data generation
# data = {
#     'shipment_month': np.random.choice(months, size=n_rows),
#     'region': np.random.choice(regions, size=n_rows),
#     'product_category': np.random.choice(product_categories, size=n_rows),
#     'product_id': [f'P{random.randint(1000, 9999)}' for _ in range(n_rows)],
#     'customer_id': [f'C{random.randint(100000, 999999)}' for _ in range(n_rows)],
#     'transfer_flag': np.random.choice(transfer_flags, size=n_rows),
#     'pickup_or_delivery': np.random.choice(pickup_or_delivery_options, size=n_rows),
#     'Job_distance': np.random.choice(distance_buckets, size=n_rows),
#     'Shipment_qty': np.round(np.random.uniform(1, 50, size=n_rows), 2),
# }

# # Simulate price per ton as a function of category and quantity (plus noise)
# base_price = {
#     'Electronics': 900,
#     'Furniture': 700,
#     'Chemicals': 800,
#     'Food': 500,
#     'Textiles': 600
# }

# noise = np.random.normal(0, 50, size=n_rows)
# data['avg_price_per_ton'] = [
#     round(base_price[cat] * (1 + 0.01 * random.uniform(-5, 5)) + n, 2)
#     for cat, n in zip(data['product_category'], noise)
# ]

# df = pd.DataFrame(data)
# df.to_csv('shipment_data.csv', index=False)


We will generate a 500,000-row synthetic CSV dataset with:

* Imbalances in region and transfer_flag
* Seasonality effects in Shipment_qty and avg_price_per_ton
* Region-based behavior differences

In [None]:
import pandas as pd
import numpy as np
import random

# Seed BOTH random sources used by this script so reruns produce the same
# dataset. NumPy's global generator drives the np.random.* draws, while the
# standard-library `random` module drives the random.randint / random.uniform
# calls (product IDs, customer IDs, per-row price jitter). Seeding only NumPy
# would leave those values different on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of synthetic shipment rows to generate.
n_rows = 500000

# Allowed values for the categorical columns that are sampled without
# explicit weights further down in this script.

# Calendar months, one row per quarter.
months = [
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec',
]

# Product families; each one has a matching entry in the base-price table.
product_categories = [
    'Electronics',
    'Furniture',
    'Chemicals',
    'Food',
    'Textiles',
]

# Fulfilment mode of a shipment.
pickup_or_delivery = [
    'Pickup',
    'Delivery',
]

# Job distance expressed as labelled buckets rather than a raw number.
distance_buckets = [
    '0-5',
    '5-10',
    '10-15',
    '15-20',
    '20+',
]

# Deliberately skewed region mix: Region C is the most common (40%) and
# Region D the rarest (10%). Label -> sampling probability.
_region_share = {
    'Region A': 0.2,
    'Region B': 0.3,
    'Region C': 0.4,
    'Region D': 0.1,
}
region_vals = np.random.choice(
    list(_region_share),
    size=n_rows,
    p=list(_region_share.values()),
)

# Deliberately skewed transfer flag: 80% External Customer, 20% Plant-to-Plant.
_transfer_share = {
    'External Customer': 0.8,
    'Plant-to-Plant': 0.2,
}
transfer_flags = np.random.choice(
    list(_transfer_share),
    size=n_rows,
    p=list(_transfer_share.values()),
)

# Month -> multiplier applied to the price. December is the peak (1.20),
# January/February the trough (0.95).
price_seasonality = {
    'Jan': 0.95,
    'Feb': 0.95,
    'Mar': 1.05,
    'Apr': 1.07,
    'May': 1.08,
    'Jun': 1.12,
    'Jul': 1.15,
    'Aug': 1.12,
    'Sep': 1.00,
    'Oct': 0.98,
    'Nov': 0.97,
    'Dec': 1.20,
}

# Month -> multiplier applied to the drawn shipment quantity. Same overall
# shape as the price curve: lowest in January (0.8), highest in December (1.3).
qty_seasonality = {
    'Jan': 0.8,
    'Feb': 0.85,
    'Mar': 1.0,
    'Apr': 1.05,
    'May': 1.1,
    'Jun': 1.2,
    'Jul': 1.25,
    'Aug': 1.2,
    'Sep': 1.0,
    'Oct': 0.95,
    'Nov': 0.9,
    'Dec': 1.3,
}

# Product category -> base price before any regional/seasonal adjustment.
base_price = {
    'Electronics': 900,
    'Furniture': 700,
    'Chemicals': 800,
    'Food': 500,
    'Textiles': 600,
}

# Region -> multiplier applied to the price.
region_price_adjustment = {
    'Region A': 1.00,
    'Region B': 1.10,
    'Region C': 0.95,
    'Region D': 1.15,
}


def _normal_sampler(mean, std):
    """Return a zero-argument callable drawing one N(mean, std) value."""
    def draw():
        return np.random.normal(mean, std)
    return draw


# Region -> zero-argument sampler for one raw shipment quantity. Each region
# has its own normal distribution (mean, standard deviation).
region_qty_distribution = {
    'Region A': _normal_sampler(25, 10),
    'Region B': _normal_sampler(10, 5),
    'Region C': _normal_sampler(20, 6),
    'Region D': _normal_sampler(12, 4),
}

# Categorical columns sampled without weights, one value per row. The
# generator is consumed left to right, so the draws happen in the order
# month, product category, pickup/delivery, distance bucket.
shipment_months, product_cats, pickup_flags, distances = (
    np.random.choice(_options, size=n_rows)
    for _options in (months, product_categories, pickup_or_delivery, distance_buckets)
)

# Identifier columns: 'P' + 4-digit number and 'C' + 6-digit number, drawn
# independently per row with the standard-library generator (duplicates
# across rows are possible).
_draw_int = random.randint
product_ids = [f'P{_draw_int(1000, 9999)}' for _row in range(n_rows)]
customer_ids = [f'C{_draw_int(100000, 999999)}' for _row in range(n_rows)]

def _draw_quantity(region, month):
    """Draw one shipment quantity for a row.

    Samples the region's quantity distribution, scales it by the month's
    quantity multiplier, rounds to 2 decimals and floors the result at 0.5
    so a low or negative normal draw never yields a non-positive quantity.
    """
    scaled = region_qty_distribution[region]() * qty_seasonality[month]
    return max(np.round(scaled, 2), 0.5)


def _simulate_price(category, region, month, noise_term):
    """Compute one row's price, rounded to 2 decimals.

    base price * regional multiplier * seasonal multiplier, jittered by a
    uniform factor in [0.95, 1.05], plus the additive noise term.
    """
    expected = (
        base_price[category]
        * region_price_adjustment[region]
        * price_seasonality[month]
    )
    jitter = 1 + 0.01 * random.uniform(-5, 5)
    return round(expected * jitter + noise_term, 2)


# Per-row quantities driven by region and shipment month.
shipment_qty = [
    _draw_quantity(region, month)
    for region, month in zip(region_vals, shipment_months)
]

# Additive Gaussian price noise (mean 0, standard deviation 50), one per row.
noise = np.random.normal(0, 50, size=n_rows)

# Per-row prices driven by category, region, shipment month and noise.
avg_price = [
    _simulate_price(category, region, month, noise_term)
    for category, region, month, noise_term in zip(
        product_cats, region_vals, shipment_months, noise
    )
]

# Output column name -> generated values. Insertion order here is the
# column order of the DataFrame and of the exported CSV.
_columns = {}
_columns['shipment_month'] = shipment_months
_columns['region'] = region_vals
_columns['product_category'] = product_cats
_columns['product_id'] = product_ids
_columns['customer_id'] = customer_ids
_columns['transfer_flag'] = transfer_flags
_columns['pickup_or_delivery'] = pickup_flags
_columns['Job_distance'] = distances
_columns['Shipment_qty'] = shipment_qty
_columns['avg_price_per_ton'] = avg_price

df = pd.DataFrame(_columns)

# Write the dataset to the working directory, without the row index column.
df.to_csv('seasonal_imbalanced_shipment_data.csv', index=False)
