In [None]:
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed NumPy's global (legacy) random state for reproducibility.
# The data-generation cell applies the same seed again before sampling.
np.random.seed(42)

## 1. Define Business Parameters

In [None]:
# --- Simulation window: one entry per calendar day, both endpoints included ---
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)
date_range = pd.date_range(start_date, end_date, freq='D')

# --- Dimension values the generator samples from ---
stores = [
    'Store_NYC',
    'Store_LA',
    'Store_Chicago',
    'Store_Houston',
    'Store_Phoenix',
]
categories = [
    'Electronics',
    'Clothing',
    'Home & Garden',
    'Sports',
    'Books',
]
regions = [
    'Northeast',
    'West',
    'Midwest',
    'South',
    'Southwest',
]

# Echo the configuration so the notebook output records what was used
print(
    f"Date range: {start_date} to {end_date}",
    f"Total days: {len(date_range)}",
    f"\nStores: {stores}",
    f"Categories: {categories}",
    f"Regions: {regions}",
    sep="\n",
)

## 2. Generate Synthetic Sales Data

In [None]:
# Build the synthetic transaction table: one dict per transaction, converted
# to a DataFrame once at the end.
#
# Reseed here so this cell produces the same data whenever it is re-run.
np.random.seed(42)

# Multipliers never change during generation, so define them once instead of
# rebuilding the dicts for every transaction.
category_mult = {
    'Electronics': 1.2,
    'Clothing': 0.9,
    'Home & Garden': 1.0,
    'Sports': 0.95,
    'Books': 0.8
}
store_mult = {
    'Store_NYC': 1.3,
    'Store_LA': 1.25,
    'Store_Chicago': 1.1,
    'Store_Houston': 0.9,
    'Store_Phoenix': 0.95
}

# A store belongs to exactly one region. `stores` and `regions` are parallel
# lists (NYC -> Northeast, LA -> West, ...), so pair them up rather than
# sampling a region at random, which scattered every store across all regions.
# NOTE: dropping that random draw changes the generated values compared with
# earlier runs of this notebook.
if len(stores) != len(regions):
    raise ValueError("stores and regions must have the same length to be paired")
store_region = dict(zip(stores, regions))

data = []

for date in date_range:
    # 3-8 transactions per day (randint's upper bound is exclusive)
    num_transactions = np.random.randint(3, 9)

    # Seasonal pattern: November-December holiday uplift, June-August dip.
    # Depends only on the date, so compute it once per day.
    month = date.month
    if month in [11, 12]:
        seasonal_factor = 1.5  # 50% increase in November-December
    elif month in [6, 7, 8]:
        seasonal_factor = 0.85  # Summer dip
    else:
        seasonal_factor = 1.0

    # Weekly pattern: Friday through Sunday (weekday() 4-6) sell 30% more
    if date.weekday() >= 4:
        weekly_factor = 1.3
    else:
        weekly_factor = 1.0

    for _ in range(num_transactions):
        store = np.random.choice(stores)
        category = np.random.choice(categories)
        region = store_region[store]

        # Base sales amount
        base_amount = np.random.uniform(50, 500)

        # Calculate final amount
        sales_amount = (
            base_amount *
            seasonal_factor *
            weekly_factor *
            category_mult[category] *
            store_mult[store] *
            np.random.uniform(0.9, 1.1)  # Random variation of +/-10%
        )

        quantity = np.random.randint(1, 6)  # 1-5 units

        data.append({
            'Date': date,
            'Store': store,
            'Category': category,
            'Region': region,
            'Quantity': quantity,
            'Sales_Amount': round(sales_amount, 2),
            # Derived from the unrounded amount, then rounded on its own
            'Unit_Price': round(sales_amount / quantity, 2)
        })

# Create DataFrame
df_sales = pd.DataFrame(data)
print(f"Generated {len(df_sales)} transactions")
print(f"\nDataFrame shape: {df_sales.shape}")
print(f"\nFirst few records:")
print(df_sales.head(10))

## 3. Add Missing Values (Realistic Data Quality)

In [None]:
# Blank out Unit_Price on 2% of the rows to mimic real-world data gaps.
#
# The rows are picked with a dedicated, seeded generator rather than the
# global NumPy random state. The selection therefore does not depend on how
# many random draws earlier cells consumed, and re-running this cell picks the
# same rows again instead of adding more NaNs on top of the previous ones.
missing_rng = np.random.RandomState(42)
missing_indices = missing_rng.choice(
    df_sales.index,
    size=int(0.02 * len(df_sales)),
    replace=False,  # each row is selected at most once
)
df_sales.loc[missing_indices, 'Unit_Price'] = np.nan

print(f"Added missing values:")
print(df_sales.isnull().sum())
print(f"\nData types:")
print(df_sales.dtypes)

## 4. Save Data to CSV

In [None]:
# Write the generated transactions to CSV and print a short summary.
output_path = '../data/sales_historical.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists; without this the cell fails on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_sales.to_csv(output_path, index=False)

print(f"Data saved to: {output_path}")
print(f"\nDataset Summary:")
print(f"Total Transactions: {len(df_sales):,}")
print(f"Date Range: {df_sales['Date'].min()} to {df_sales['Date'].max()}")
print(f"Total Sales Value: ${df_sales['Sales_Amount'].sum():,.2f}")
print(f"Average Transaction: ${df_sales['Sales_Amount'].mean():.2f}")
print(f"\nStores: {df_sales['Store'].nunique()}")
print(f"Categories: {df_sales['Category'].nunique()}")
print(f"Regions: {df_sales['Region'].nunique()}")

## 5. Preview Aggregated Data

In [None]:
# Collapse the transactions to one row per Date, summing revenue and units,
# to get a time-series view of the data.
daily_sales = (
    df_sales
    .groupby('Date')[['Sales_Amount', 'Quantity']]
    .sum()
    .reset_index()
)

print("Daily Sales Summary:")
print(daily_sales.head(10))
print("\nDaily sales statistics:")
print(daily_sales['Sales_Amount'].describe())

## 6. Sales by Category, Store, and Region

In [None]:
def _sales_totals_by(column):
    """Return total Sales_Amount for each value of `column`, largest first."""
    totals = df_sales.groupby(column)['Sales_Amount'].sum()
    return totals.sort_values(ascending=False)


# Revenue ranking per product category
print("Sales by Category:")
category_sales = _sales_totals_by('Category')
print(category_sales)

# Revenue ranking per store
print("\nSales by Store:")
store_sales = _sales_totals_by('Store')
print(store_sales)

# Revenue ranking per region
print("\nSales by Region:")
region_sales = _sales_totals_by('Region')
print(region_sales)