In [1]:
import pandas as pd
import numpy as np

In [7]:
# Set seed for reproducibility
np.random.seed(42)

# Define date range
dates = pd.date_range(start="2024-01-01", end="2024-06-30", freq="D")

# Define regions
regions = ['North', 'South', 'East', 'West']

# Define categories and realistic subcategories
categories = {
    'Headwear': ['Cap', 'Hat', 'Headband'],
    'Bodywear': ['T-shirt', 'Jacket', 'Sweater'],
    'Legwear': ['Jeans', 'Shorts', 'Trousers'],
    'Footwear': ['Sneakers', 'Sandals', 'Boots'],
    'Handwear': ['Gloves', 'Wristband']
}

# Define price ranges for each subcategory based on online estimates in USD
price_lookup = {
    'Cap': (10, 20),
    'Hat': (15, 30),
    'Headband': (5, 15),
    'T-shirt': (10, 25),
    'Jacket': (40, 100),
    'Sweater': (30, 70),
    'Jeans': (30, 60),
    'Shorts': (20, 40),
    'Trousers': (25, 50),
    'Sneakers': (50, 120),
    'Sandals': (20, 50),
    'Boots': (60, 150),
    'Gloves': (10, 25),
    'Wristband': (5, 15)
}

customer_types = ['New', 'Returning']

# Discount schedule by month, simulating events (e.g., clearance in April)
discount_by_month = {
    1: (0.00, 0.10),
    2: (0.05, 0.15),
    3: (0.00, 0.10),
    4: (0.10, 0.30),  # Clearance or Spring Sale
    5: (0.00, 0.10),
    6: (0.05, 0.15)
}

# Generate transactions for a single day
def generate_transactions_for_day(date, n):
    records = []
    for _ in range(n):
        region = np.random.choice(regions)
        category = np.random.choice(list(categories.keys()))
        subcategory = np.random.choice(categories[category])
        customer_type = np.random.choice(customer_types, p=[0.3, 0.7])  # More returning customers
        units_sold = np.random.randint(1, 5)
        unit_price = np.random.uniform(*price_lookup[subcategory])
        discount = np.random.uniform(*discount_by_month[date.month])
        sales = round(units_sold * unit_price * (1 - discount), 2)

        records.append({
            'Date': date,
            'Region': region,
            'Category': category,
            'Subcategory': subcategory,
            'Customer_Type': customer_type,
            'Units_Sold': units_sold,
            'Unit_Price': round(unit_price, 2),
            'Discount': round(discount, 2),
            'Sales (USD)': sales
        })
    return records

# Generate all transactions
all_data = []
for date in dates:
    if date.month == 4:
        n_txns = np.random.randint(15, 30)  # Fewer purchases in April
    else:
        n_txns = np.random.randint(30, 50)  # Normal days
    all_data.extend(generate_transactions_for_day(date, n_txns))

# Create the DataFrame
df_clothing_sales = pd.DataFrame(all_data)

# Preview structure
df_clothing_sales.head()


Unnamed: 0,Date,Region,Category,Subcategory,Customer_Type,Units_Sold,Unit_Price,Discount,Sales (USD)
0,2024-01-01,West,Handwear,Gloves,Returning,1,18.95,0.04,18.11
1,2024-01-01,East,Legwear,Trousers,Returning,4,28.57,0.07,106.85
2,2024-01-01,North,Bodywear,Jacket,New,4,99.53,0.06,373.55
3,2024-01-01,South,Handwear,Wristband,New,3,11.12,0.01,32.89
4,2024-01-01,West,Footwear,Boots,Returning,3,115.65,0.04,333.69


In [8]:
# Save to Excel with placeholder path (you will update the path)
output_path = "C:/Users/HP/Desktop/My Data Science Journey/Python/Portfolio Project/Why did sales drop/dataset/clothing_sales_jan_jun_2024.xlsx"

df_clothing_sales.to_excel(output_path, index=False)