In [1]:
# ChainEats Analytics - Day 1: Data Generation (OPTIMIZED)
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed both random number generators (NumPy's global state and the stdlib
# `random` module) with the same value so repeated runs draw the same numbers.
RANDOM_SEED = 42
for seed_fn in (np.random.seed, random.seed):
    seed_fn(RANDOM_SEED)

# Banner announcing the start of the run
print("Creating ChainEats Analytics Dataset...")
print("=" * 50)

# Step 1: Create Location Master Data
print("Generating 50 restaurant locations...")

cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami']
location_types = ['Mall', 'Street', 'Food Court', 'Airport', 'Standalone']

# Baseline monthly rent and daily foot traffic per location type; each
# generated location gets a random integer offset around these baselines.
base_rent = {'Mall': 8000, 'Street': 6000, 'Food Court': 4000, 'Airport': 12000, 'Standalone': 5000}
base_traffic = {'Mall': 500, 'Street': 300, 'Food Court': 800, 'Airport': 1200, 'Standalone': 200}
earliest_opening = datetime(2022, 1, 1)

locations_data = []
for seq in range(1, 51):
    # The draws below are made in a fixed order (city, type, rent, traffic,
    # opening offset, size) so a seeded run always yields the same rows.
    city = random.choice(cities)
    location_type = random.choice(location_types)
    rent = base_rent[location_type] + np.random.randint(-1000, 1000)
    foot_traffic = base_traffic[location_type] + np.random.randint(-100, 100)
    opened_on = earliest_opening + timedelta(days=np.random.randint(0, 365))
    floor_area = np.random.randint(800, 2000)

    record = {
        'location_id': f'LOC_{seq:03d}',
        'city': city,
        'location_type': location_type,
        'monthly_rent': rent,
        'avg_daily_foottraffic': foot_traffic,
        'opening_date': opened_on,
        'size_sqft': floor_area,
    }
    locations_data.append(record)

# One row per location, columns in the key order of the records above
locations_df = pd.DataFrame(locations_data)
print(f"Created {len(locations_df)} locations across {len(cities)} cities")

# Step 2: Create Menu Items
print("Creating menu items...")

menu_categories = ['Burgers', 'Pizza', 'Salads', 'Beverages', 'Desserts']
items_per_category = {
    'Burgers': ['Classic Burger', 'Cheese Burger', 'Chicken Burger', 'Veggie Burger'],
    'Pizza': ['Margherita', 'Pepperoni', 'Supreme', 'Veggie Pizza'],
    'Salads': ['Caesar Salad', 'Greek Salad', 'Chicken Salad', 'Garden Salad'],
    'Beverages': ['Coca Cola', 'Orange Juice', 'Water', 'Coffee'],
    'Desserts': ['Ice Cream', 'Chocolate Cake', 'Apple Pie', 'Cookies']
}

# Per-category baselines; each item's price and cost are these values plus a
# uniform random offset.
base_prices = {'Burgers': 12, 'Pizza': 15, 'Salads': 10, 'Beverages': 3, 'Desserts': 6}
base_costs = {'Burgers': 5, 'Pizza': 6, 'Salads': 4, 'Beverages': 1, 'Desserts': 2}

# Flatten the category -> items mapping into (category, item name) pairs,
# keeping the dictionary's insertion order.
catalog = [
    (category, item_name)
    for category, item_names in items_per_category.items()
    for item_name in item_names
]

menu_data = []
for seq, (category, item_name) in enumerate(catalog, start=1):
    # Price is drawn before cost for every item
    price = base_prices[category] + np.random.uniform(-2, 3)
    cost = base_costs[category] + np.random.uniform(-1, 1)
    menu_data.append({
        'item_id': f'ITEM_{seq:03d}',
        'item_name': item_name,
        'category': category,
        'price': price,
        'cost': cost,
    })
item_id = len(menu_data) + 1  # next unused item sequence number

menu_df = pd.DataFrame(menu_data)
# Per-unit margin in currency terms (price minus cost), not a percentage
menu_df['profit_margin'] = menu_df['price'] - menu_df['cost']
print(f"Created {len(menu_df)} menu items across {len(menu_categories)} categories")

# Step 3: Generate Daily Sales Data (2 years) - OPTIMIZED
#
# For every (date, location) pair this simulates a customer count, draws the
# ordered menu items and quantities, and emits one sales row per ordered item.
# Rows are converted to DataFrames in batches and concatenated into `sales_df`.
#
# The order in which random numbers are drawn is deliberately left exactly as
# it was, so a seeded run keeps producing the same dataset.
print("Generating 2 years of daily sales data...")

start_date = datetime(2022, 1, 1)
end_date = datetime(2023, 12, 31)
date_range = pd.date_range(start_date, end_date, freq='D')

# Pre-compute menu item selections for efficiency
menu_items = menu_df[['item_id', 'item_name', 'category', 'price', 'cost']].values
menu_weights = np.ones(len(menu_df))  # Equal probability for all items
menu_indices = np.arange(len(menu_df))
# Normalised once here instead of on every draw; the weights never change.
menu_probabilities = menu_weights / menu_weights.sum()

# Create some locations that are consistently better/worse performers
high_performers = random.sample(locations_df['location_id'].tolist(), 10)
low_performers = random.sample(locations_df['location_id'].tolist(), 10)
# NOTE(review): the two samples are drawn independently, so an id can appear in
# both lists. The high tier is checked first, so such a location is treated as
# a high performer. Confirm whether disjoint tiers were intended.
high_performer_ids = set(high_performers)  # sets give O(1) membership tests
low_performer_ids = set(low_performers)

# Per-location base sales (foot traffic x conversion x tier adjustment).
# None of this depends on the date, so it is computed once rather than by
# re-iterating the locations frame on every simulated day.
location_base_sales = []
for location_id, foot_traffic in zip(locations_df['location_id'],
                                     locations_df['avg_daily_foottraffic']):
    base_sales = foot_traffic * 0.3  # 30% conversion rate
    if location_id in high_performer_ids:
        base_sales *= 1.3
    elif location_id in low_performer_ids:
        base_sales *= 0.7
    location_base_sales.append((location_id, base_sales))

# Pre-compute seasonal and weekday multipliers for all dates
seasonal_multipliers = []
weekday_multipliers = []

for current_date in date_range:
    # Seasonal multiplier
    month = current_date.month
    seasonal_mult = 1.0
    if month in [6, 7, 8]:  # Summer
        seasonal_mult = 1.2
    elif month in [11, 12]:  # Holiday season
        seasonal_mult = 1.3
    seasonal_multipliers.append(seasonal_mult)

    # Weekend multiplier (weekday() 5 and 6 are Saturday and Sunday)
    weekday_mult = 1.4 if current_date.weekday() in [5, 6] else 1.0
    weekday_multipliers.append(weekday_mult)


def _finalize_sales_batch(records):
    """Turn a list of sales row dicts into a DataFrame with derived columns.

    Adds `revenue` (quantity * price), `total_cost` (quantity * cost) and
    `profit` (revenue - total_cost) to the rows in `records`.
    """
    batch_df = pd.DataFrame(records)
    batch_df['revenue'] = batch_df['quantity'] * batch_df['price']
    batch_df['total_cost'] = batch_df['quantity'] * batch_df['cost']
    batch_df['profit'] = batch_df['revenue'] - batch_df['total_cost']
    return batch_df


# Generate all sales data in batches
sales_data = []        # row dicts waiting to be converted
sales_df_parts = []    # finished batch DataFrames
batch_size = 10000     # rows per batch DataFrame

total_transactions = 0
for date_idx, current_date in enumerate(date_range):
    seasonal_mult = seasonal_multipliers[date_idx]
    weekday_mult = weekday_multipliers[date_idx]

    for location_id, base_sales in location_base_sales:
        # Apply multipliers plus +/-30% day-to-day noise
        daily_customers = int(base_sales * seasonal_mult * weekday_mult * np.random.uniform(0.7, 1.3))
        if daily_customers <= 0:
            continue

        # Each customer orders 1-3 items. Kept as one scalar draw per customer:
        # a single sized draw is not guaranteed to consume the random stream
        # identically, which would change the seeded output.
        total_items = 0
        for _ in range(daily_customers):
            total_items += np.random.randint(1, 4)

        # total_items >= daily_customers > 0 here, so no extra emptiness check
        # is needed before sampling.
        selected_menu_indices = np.random.choice(
            menu_indices,
            size=total_items,
            p=menu_probabilities
        )
        quantities = np.random.randint(1, 3, size=total_items)  # 1 or 2 units

        # Build batch of sales records
        for menu_idx, quantity in zip(selected_menu_indices, quantities):
            item_data = menu_items[menu_idx]

            sales_data.append({
                'date': current_date,
                'location_id': location_id,
                'item_id': item_data[0],
                'item_name': item_data[1],
                'category': item_data[2],
                'quantity': quantity,
                'price': item_data[3],
                'cost': item_data[4]
            })

            total_transactions += 1

            # Convert the pending rows once a full batch has accumulated
            if len(sales_data) >= batch_size:
                sales_df_parts.append(_finalize_sales_batch(sales_data))
                sales_data = []  # Clear for next batch

# Process final (partial) batch
if sales_data:
    sales_df_parts.append(_finalize_sales_batch(sales_data))

# Combine all batches; fall back to an empty frame with the expected columns
if sales_df_parts:
    sales_df = pd.concat(sales_df_parts, ignore_index=True)
else:
    sales_df = pd.DataFrame(columns=['date', 'location_id', 'item_id', 'item_name', 'category', 'quantity', 'price', 'cost', 'revenue', 'total_cost', 'profit'])

print(f"Generated {len(sales_df):,} sales transactions")

# Step 4: Add Weather Data (simplified) - OPTIMIZED
print("Adding weather data...")

# One weather row per (date, city): each date is repeated once per city and
# the city list is tiled once per date, so the two sequences line up row by row.
city_count = len(cities)
day_count = len(date_range)
n_weather_records = day_count * city_count
weather_dates = np.repeat(date_range, city_count)
weather_cities = np.tile(cities, day_count)

# Calendar month of every weather row
months = np.array([stamp.month for stamp in weather_dates])

# Boolean row masks per season; fall is whatever the other three leave over
winter_mask = np.isin(months, [12, 1, 2])
spring_mask = np.isin(months, [3, 4, 5])
summer_mask = np.isin(months, [6, 7, 8])
fall_mask = ~(winter_mask | spring_mask | summer_mask)

# (mask, mean, standard deviation) per season. The draws are made in this
# exact order - winter, spring, summer, fall - to keep seeded runs unchanged.
season_params = (
    (winter_mask, 35, 10),
    (spring_mask, 60, 10),
    (summer_mask, 80, 8),
    (fall_mask, 65, 10),
)
temperatures = np.zeros(len(months))
for season_mask, mean_temp, temp_spread in season_params:
    temperatures[season_mask] = np.random.normal(mean_temp, temp_spread, np.sum(season_mask))

# Clamp outliers into the [20, 100] range
temperatures = np.clip(temperatures, 20, 100)

# Precipitation amounts and the rainy flag are drawn independently of each other
precipitation = np.random.exponential(0.1, n_weather_records)
is_rainy = np.random.choice([0, 1], size=n_weather_records, p=[0.8, 0.2])

weather_columns = {
    'date': weather_dates,
    'city': weather_cities,
    'temperature': temperatures,
    'precipitation': precipitation,
    'is_rainy': is_rainy
}
weather_df = pd.DataFrame(weather_columns)

print(f"Generated weather data for {len(date_range)} days across {len(cities)} cities")

# Step 5: Save all datasets
print("Saving datasets...")

# Write each frame to its CSV file in the working directory, without the index
csv_outputs = (
    ('locations.csv', locations_df),
    ('menu_items.csv', menu_df),
    ('sales_data.csv', sales_df),
    ('weather_data.csv', weather_df),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)

print("All datasets saved!")

# Row counts for each dataset
print("\nDataset Summary:")
print(f"Locations: {len(locations_df):,} records")
print(f"Menu Items: {len(menu_df):,} records")
print(f"Sales Transactions: {len(sales_df):,} records")
print(f"Weather Records: {len(weather_df):,} records")

# Quick preview
print("\nQuick Data Preview:")

# Number of locations per location type (at most five rows shown)
print("\n1. Top 5 Locations by Type:")
type_counts = locations_df['location_type'].value_counts()
print(type_counts.head())

# Total revenue per menu category, largest first; skipped when there are no sales
print("\n2. Sales by Category (sample):")
if len(sales_df) > 0:
    revenue_by_category = sales_df.groupby('category')['revenue'].sum()
    category_sales = revenue_by_category.sort_values(ascending=False)
    print(category_sales)

# First and last dates present in the sales data
first_sale_date = sales_df['date'].min()
last_sale_date = sales_df['date'].max()
print("\n3. Date range:", first_sale_date, "to", last_sale_date)

print("\nDay 1 Complete! Ready for data cleaning")

Creating ChainEats Analytics Dataset...
Generating 50 restaurant locations...
Created 50 locations across 5 cities
Creating menu items...
Created 20 menu items across 5 categories
Generating 2 years of daily sales data...
Generated 13,605,203 sales transactions
Adding weather data...
Generated weather data for 730 days across 5 cities
Saving datasets...
All datasets saved!

Dataset Summary:
Locations: 50 records
Menu Items: 20 records
Sales Transactions: 13,605,203 records
Weather Records: 3,650 records

Quick Data Preview:

1. Top 5 Locations by Type:
location_type
Mall          13
Street        13
Standalone    12
Airport        6
Food Court     6
Name: count, dtype: int64

2. Sales by Category (sample):
category
Pizza        6.374138e+07
Burgers      4.672071e+07
Salads       4.668045e+07
Desserts     2.830575e+07
Beverages    1.470787e+07
Name: revenue, dtype: float64

3. Date range: 2022-01-01 00:00:00 to 2023-12-31 00:00:00

Day 1 Complete! Ready for data cleaning
