# Setup Tables and Generate Sample Data

This notebook:
1. Creates source tables (orders_partitioned and orders_liquid)
2. Generates `num_orders` sample orders (20,000 by default) with realistic distributions
3. Ensures IDENTICAL data in both tables for accurate comparison by creating `orders_liquid` as a copy of `orders_partitioned`

## Tables Created:
- **orders_partitioned**: PARTITIONED BY order_date
- **orders_liquid**: CLUSTER BY order_date

## Parameters:
- `catalog`: Target Unity Catalog (default: main)
- `schema`: Schema name (default: your_schema)
- `num_orders`: Number of orders to generate (default: 20000)

In [None]:
# Get parameters
# Declares the three notebook widgets, reads them, and validates the values
# before any of them is interpolated into SQL by the cells below.
import re

dbutils.widgets.text("catalog", "main", "Catalog Name")
dbutils.widgets.text("schema", "your_schema", "Schema Name")
dbutils.widgets.text("num_orders", "20000", "Number of Orders")

catalog = dbutils.widgets.get("catalog").strip()
schema = dbutils.widgets.get("schema").strip()
num_orders = int(dbutils.widgets.get("num_orders"))  # raises ValueError on non-numeric input

# catalog and schema are spliced verbatim into CREATE / DROP / OPTIMIZE
# statements further down, so only accept a plain identifier or a
# backtick-quoted one (with `` as the escaped backtick).
_identifier = re.compile(r"[A-Za-z0-9_]+|`(?:[^`]|``)+`")
for widget_name, widget_value in (("catalog", catalog), ("schema", schema)):
    if not _identifier.fullmatch(widget_value):
        raise ValueError(
            f"Invalid {widget_name} name {widget_value!r}: expected letters, digits and "
            "underscores, or a backtick-quoted identifier"
        )

if num_orders < 0:
    raise ValueError(f"num_orders must not be negative, got {num_orders}")

print(f"Catalog: {catalog}")
print(f"Schema: {schema}")
print(f"Number of Orders: {num_orders:,}")

In [None]:
# Create schema if it doesn't exist
# IF NOT EXISTS makes this cell safe to re-run.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252).
print(f"✓ Schema '{catalog}.{schema}' is ready")

In [None]:
# Drop existing tables for clean setup
# Both source tables are removed so the CREATE statements in the following
# cells start from scratch; IF EXISTS keeps a first run from failing.
for table_name in ("orders_partitioned", "orders_liquid"):
    spark.sql(f"DROP TABLE IF EXISTS {catalog}.{schema}.{table_name}")
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252).
print("✓ Cleaned up existing tables")

In [None]:
# Create partitioned orders table
# Explicit-schema Delta table, physically partitioned by order_date.
# total_amount and product_id are the only nullable columns.
create_partitioned_sql = f"""
CREATE TABLE {catalog}.{schema}.orders_partitioned (
    order_id STRING NOT NULL,
    customer_id STRING NOT NULL,
    order_date DATE NOT NULL,
    product_id STRING,
    quantity INT NOT NULL,
    unit_price DECIMAL(10,2) NOT NULL,
    total_amount DECIMAL(10,2),
    status STRING NOT NULL,
    created_at TIMESTAMP NOT NULL
)
USING DELTA
PARTITIONED BY (order_date)
TBLPROPERTIES (
    'delta.enableChangeDataFeed' = 'true',
    'delta.autoOptimize.optimizeWrite' = 'true',
    'delta.autoOptimize.autoCompact' = 'true'
)
"""

spark.sql(create_partitioned_sql)
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252).
print(f"✓ Created table: {catalog}.{schema}.orders_partitioned (PARTITIONED BY order_date)")

In [None]:
# Calculate date range
# The window runs from the first day of the previous month through today
# (inclusive), using the driver's local clock.
from datetime import datetime
from dateutil.relativedelta import relativedelta

today = datetime.now().date()
first_day_of_current_month = today.replace(day=1)
first_day_of_previous_month = first_day_of_current_month + relativedelta(months=-1)

# +1 because both endpoints belong to the window.
elapsed = today - first_day_of_previous_month
days_in_range = elapsed.days + 1

orders_per_day = num_orders // days_in_range

print(f"\nDate Range: {first_day_of_previous_month} to {today} ({days_in_range} days)")
print(f"Expected: ~{orders_per_day:,} orders/day")

In [None]:
# Generate sample data using PySpark DataFrame API
#
# Builds num_orders synthetic rows and inserts them (by column position) into
# the partitioned table. Column order below must match the table definition.
print(f"\nGenerating {num_orders:,} sample orders...")

from pyspark.sql import functions as F

# First day of the previous month and the number of days from it to today,
# both evaluated by Spark. F.trunc keeps the value a DATE; it is defined once
# here instead of being repeated in every branch of the date expression.
range_start = F.trunc(F.add_months(F.current_date(), -1), 'month')
span_days = F.datediff(F.current_date(), range_start)

# One uniform draw per attribute, materialised as a column so that every
# threshold in a when-chain is compared against the SAME random number.
# Calling F.rand() separately inside each .when() gives an independent draw
# per branch, which turns cumulative thresholds such as 0.60 / 0.75 / 0.87 /
# 0.95 into conditional ones and starves the later categories.
picks = spark.range(num_orders).select(
    'id',
    F.rand().alias('customer_pick'),
    F.rand().alias('date_pick'),
    F.rand().alias('product_pick'),
    F.rand().alias('quantity_pick'),
    F.rand().alias('price_pick'),
    F.rand().alias('status_pick'),
)

customer_pick = F.col('customer_pick')
date_pick = F.col('date_pick')
product_pick = F.col('product_pick')
quantity_pick = F.col('quantity_pick')
price_pick = F.col('price_pick')
status_pick = F.col('status_pick')

# The F.rand() calls that remain below pick a value WITHIN the chosen bucket,
# so they are intentionally independent of the selector columns.
df = picks.select(
    F.concat(F.lit('ORD-'), F.lpad(F.col('id').cast('string'), 10, '0')).alias('order_id'),

    # Customers: 30% from ids 1-50, 30% from 51-200, 40% from 201-500.
    F.concat(
        F.lit('CUST-'),
        F.lpad(
            F.when(customer_pick < 0.3, (F.rand() * 50 + 1).cast('int'))
             .when(customer_pick < 0.6, (F.rand() * 150 + 51).cast('int'))
             .otherwise((F.rand() * 300 + 201).cast('int')).cast('string'),
            5, '0'
        )
    ).alias('customer_id'),

    # Dates: 60% uniform over the window, 25% within the last 14 days
    # (clamped at the window start), 15% in the middle half of the window.
    F.date_add(
        range_start,
        F.when(date_pick < 0.60, (F.rand() * span_days).cast('int'))
         .when(date_pick < 0.85, F.greatest(span_days - (F.rand() * 14).cast('int'), F.lit(0)))
         .otherwise(((F.rand() * span_days / 2) + (span_days / 4)).cast('int'))
    ).alias('order_date'),

    # Products: 40% from ids 1-20, 30% from 21-50, 30% from 51-100.
    F.concat(
        F.lit('PROD-'),
        F.lpad(
            F.when(product_pick < 0.4, (F.rand() * 20 + 1).cast('int'))
             .when(product_pick < 0.7, (F.rand() * 30 + 21).cast('int'))
             .otherwise((F.rand() * 50 + 51).cast('int')).cast('string'),
            4, '0'
        )
    ).alias('product_id'),

    # Quantity: 50% single item, 30% 2-3 items, 20% 4-10 items.
    F.when(quantity_pick < 0.5, F.lit(1))
     .when(quantity_pick < 0.8, (F.rand() * 2 + 2).cast('int'))
     .otherwise((F.rand() * 7 + 4).cast('int')).alias('quantity'),

    # Unit price: 40% low (9.99-49.99), 30% mid (50-150), 30% high (150-500).
    F.when(price_pick < 0.4, F.round(9.99 + F.rand() * 40, 2))
     .when(price_pick < 0.7, F.round(50 + F.rand() * 100, 2))
     .otherwise(F.round(150 + F.rand() * 350, 2)).alias('unit_price'),

    # total_amount is deliberately left NULL by this notebook.
    F.lit(None).cast('decimal(10,2)').alias('total_amount'),

    # Status: 60% delivered, 15% shipped, 12% confirmed, 8% pending, 5% cancelled.
    F.when(status_pick < 0.60, F.lit('delivered'))
     .when(status_pick < 0.75, F.lit('shipped'))
     .when(status_pick < 0.87, F.lit('confirmed'))
     .when(status_pick < 0.95, F.lit('pending'))
     .otherwise(F.lit('cancelled')).alias('status'),

    F.current_timestamp().alias('created_at')
)

df.write.mode('append').insertInto(f"{catalog}.{schema}.orders_partitioned")
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252).
print(f"✓ Generated and inserted {num_orders:,} orders into partitioned table")

In [None]:
# Create liquid clustered table by copying from partitioned table (ensures IDENTICAL data)
# CTAS reads the rows already written to orders_partitioned, so the random
# generator is not run a second time; the table properties mirror those of
# the partitioned table.
create_liquid_sql = f"""
CREATE TABLE {catalog}.{schema}.orders_liquid
USING DELTA
CLUSTER BY (order_date)
TBLPROPERTIES (
  'delta.enableChangeDataFeed' = 'true',
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact' = 'true'
)
AS SELECT * FROM {catalog}.{schema}.orders_partitioned
"""

spark.sql(create_liquid_sql)
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252);
# the message has no placeholders, so it is a plain string.
print("✓ Created liquid clustered table by copying data (CLUSTER BY order_date)")

In [None]:
# Verify data
# Confirms that both tables really hold the same rows before announcing it.
# Raises AssertionError on any mismatch so a scheduled run stops here.
partitioned_table = f"{catalog}.{schema}.orders_partitioned"
liquid_table = f"{catalog}.{schema}.orders_liquid"

orders_partitioned_count = spark.sql(f"SELECT COUNT(*) as count FROM {partitioned_table}").collect()[0]['count']
orders_liquid_count = spark.sql(f"SELECT COUNT(*) as count FROM {liquid_table}").collect()[0]['count']

print(f"\n✓ Partitioned table: {orders_partitioned_count:,} rows")
print(f"✓ Liquid clustered table: {orders_liquid_count:,} rows")

if orders_partitioned_count != orders_liquid_count:
    raise AssertionError(
        f"Row counts differ: {partitioned_table} has {orders_partitioned_count:,} rows, "
        f"{liquid_table} has {orders_liquid_count:,} rows"
    )

# EXCEPT ALL is duplicate-aware: with equal row counts, an empty difference
# in one direction means both tables contain the same multiset of rows.
unmatched_rows = spark.sql(
    f"SELECT * FROM {partitioned_table} EXCEPT ALL SELECT * FROM {liquid_table}"
).count()
if unmatched_rows:
    raise AssertionError(
        f"{unmatched_rows:,} rows in {partitioned_table} have no identical row in {liquid_table}"
    )

print("\n✅ Both tables have IDENTICAL data!")

In [None]:
# Optimize tables
# Runs OPTIMIZE on both tables, partitioned first, then liquid.
print("\nOptimizing tables...")
for table_name in ("orders_partitioned", "orders_liquid"):
    spark.sql(f"OPTIMIZE {catalog}.{schema}.{table_name}")
# The check mark was stored as mojibake (UTF-8 bytes read as Windows-1252).
print("✓ Tables optimized")

In [None]:
# Display summary
# Prints the final report. Relies on the row counts from the verification
# cell and on the date range computed earlier in the notebook.
# The emoji below were stored as mojibake (UTF-8 bytes read as Windows-1252);
# strings without placeholders no longer carry an f-prefix.
banner = "=" * 70

print("\n" + banner)
print("✅ SETUP AND DATA GENERATION COMPLETE")
print(banner)
print("\nTables created with IDENTICAL data:")
print(f"  1. {catalog}.{schema}.orders_partitioned")
print(f"     - {orders_partitioned_count:,} rows")
print("     - PARTITIONED BY order_date")
print(f"  2. {catalog}.{schema}.orders_liquid")
print(f"     - {orders_liquid_count:,} rows")
print("     - CLUSTER BY order_date")
print(f"\nDate range: {first_day_of_previous_month} to {today}")
print("\n✨ This notebook is part of the complete dbt_workflow job")
print("   The workflow automatically runs:")
print("   1. This setup (02a)")
print("   2. Initial dbt build with --full-refresh")
print("   3. Simulate late arrivals (02b)")
print("   4. dbt build --full-refresh (fast! insert_overwrite STILL only updates affected partitions)")
print("   5. Verify sync (02c)")
print("\n💡 To run the complete workflow:")
print("   databricks bundle run dbt_workflow --target dev")
print(banner)