In [0]:
%sql

-- Create the schema that the simple demo source tables are written to.
-- IF NOT EXISTS makes this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS data_university.dlt;

In [0]:
# SIMPLE DEMO DELTA TABLE CREATION
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# -- Sales source table: one Row per order.
# Note that order_date is kept as a plain string, not a date/timestamp.
sales_data = [
    Row(order_id=order_id, customer_id=customer_id, amount=amount, order_date=order_date)
    for order_id, customer_id, amount, order_date in (
        (1, 1, 100.0, "2024-01-01"),
        (2, 2, 200.0, "2024-01-02"),
        (3, 1, 150.0, "2024-01-03"),
        (4, 3, 120.0, "2024-01-03"),
        (5, 2, 90.0, "2024-01-04"),
    )
]
sales_df = spark.createDataFrame(sales_data)
(
    sales_df.write
    .format("delta")
    .mode("overwrite")  # replace any rows left over from a previous run
    .saveAsTable("data_university.dlt.demo_sales_source")
)

# -- Customers source table: one Row per customer referenced by the orders above.
customers_data = [
    Row(customer_id=customer_id, customer_name=customer_name)
    for customer_id, customer_name in (
        (1, "Alice"),
        (2, "Bob"),
        (3, "Carol"),
    )
]
customers_df = spark.createDataFrame(customers_data)
(
    customers_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("data_university.dlt.demo_customers_source")
)

print("Demo tables created!")


In [0]:
%sql

-- Drop the schema together with everything it contains (CASCADE).
-- NOTE(review): the schema name here is "dl1", whereas the other cells of this
-- notebook use "dlt" and "dlt01" -- confirm that "dl1" is really the intended
-- target and not a typo before running this destructive statement.
DROP SCHEMA IF EXISTS data_university.dl1 CASCADE;

In [0]:
# ===================================================================
# DEMO TABLE CREATION SCRIPT
# Run this in a separate notebook before creating the DLT pipeline
# ===================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col  # Explicit import for col function
from pyspark.sql.types import *
from datetime import datetime, timedelta
import random
import builtins  # To access Python's built-in round function

# Obtain the Spark session. On Databricks one is already exposed as `spark`,
# so getOrCreate() simply hands back that existing session.
spark = SparkSession.builder.getOrCreate()

# Where the demo tables will live -- change both values to match your workspace.
catalog_name = "data_university"  # target catalog
schema_name = "dlt01"  # target schema inside that catalog

# Ensure the target schema is present (Unity Catalog) before any table is
# written into it; the statement is a no-op when the schema already exists.
create_schema_statement = f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}"
spark.sql(create_schema_statement)

# ===================================================================
# Create Sales Demo Data
# ===================================================================

# Explicit schema for the raw sales records (all fields nullable). Passing it to
# createDataFrame avoids type inference on the generated tuples.
sales_schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("unit_price", DoubleType(), True),
    StructField("order_date", TimestampType(), True),
    StructField("region", StringType(), True)
])

# Earliest possible order date; each order is offset from it by 0-180 days.
base_date = datetime(2024, 1, 1)

# Product catalogue as (product_id, product_name, category) tuples.
products = [
    ("PROD_001", "Laptop", "Electronics"),
    ("PROD_002", "Mouse", "Accessories"),
    ("PROD_003", "Keyboard", "Accessories"),
    ("PROD_004", "Monitor", "Electronics"),
    ("PROD_005", "Headphones", "Electronics"),
    ("PROD_006", "Tablet", "Electronics"),
    ("PROD_007", "Phone", "Electronics"),
    ("PROD_008", "Charger", "Accessories")
]

regions = ["North", "South", "East", "West", "Central"]

# Dedicated, seeded generator: every run of this cell now yields the same demo
# records, and the global `random` state is left untouched.
sales_rng = random.Random(42)

# Generate 200 sample sales records
sales_data = []
for i in range(200):
    product = sales_rng.choice(products)
    order_date = base_date + timedelta(days=sales_rng.randint(0, 180))

    sales_data.append((
        f"ORD_{i+1:05d}",  # order_id
        f"CUST_{sales_rng.randint(1, 50):03d}",  # customer_id
        product[0],  # product_id
        product[1],  # product_name
        product[2],  # category
        sales_rng.randint(1, 5),  # quantity
        # builtins.round on purpose: the star import from pyspark.sql.functions
        # shadows Python's round with the Spark column function.
        builtins.round(sales_rng.uniform(10, 500), 2),  # unit_price
        order_date,  # order_date
        sales_rng.choice(regions)  # region
    ))

# Create sales DataFrame
sales_df = spark.createDataFrame(sales_data, sales_schema)

# Add calculated column for total amount
sales_df_enhanced = sales_df.withColumn("total_amount", col("quantity") * col("unit_price"))

# Write to Delta table. overwriteSchema lets the overwrite succeed even when a
# table of the same name already exists with a different column layout (for
# example the simpler demo_sales_source variant), instead of failing with a
# schema-mismatch error.
sales_table_name = f"{catalog_name}.{schema_name}.demo_sales_source"
(
    sales_df_enhanced.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(sales_table_name)
)

# ===================================================================
# Create Customer Demo Data
# ===================================================================

# Dedicated, seeded generator so the customer attributes are identical on every
# run and the global `random` state is left untouched.
customer_rng = random.Random(43)

customer_tiers = ["Premium", "Standard", "Basic"]

# 50 customers, CUST_001 .. CUST_050 -- the same id range the sales records
# draw their customer_id from.
customer_data = [
    (
        f"CUST_{customer_number:03d}",  # customer_id
        f"Customer_{customer_number}",  # customer_name
        customer_rng.choice(regions),  # region
        customer_rng.choice(customer_tiers),  # tier
        customer_rng.randint(25, 65),  # age
    )
    for customer_number in range(1, 51)
]

# Explicit schema for the customer records (all fields nullable).
customer_schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("region", StringType(), True),
    StructField("tier", StringType(), True),
    StructField("age", IntegerType(), True)
])

customer_df = spark.createDataFrame(customer_data, customer_schema)

# Write customer table. overwriteSchema lets the overwrite succeed even when a
# table of the same name already exists with a different column layout,
# instead of failing with a schema-mismatch error.
customer_table_name = f"{catalog_name}.{schema_name}.demo_customers_source"
(
    customer_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(customer_table_name)
)

# Summary: report each target table together with the row count of the
# DataFrame that was written to it.
print("Demo tables created successfully!")
sales_record_count = sales_df_enhanced.count()
print(f"Sales table: {sales_table_name} - {sales_record_count} records")
customer_record_count = customer_df.count()
print(f"Customer table: {customer_table_name} - {customer_record_count} records")

# Preview the first five rows of each DataFrame under its own heading.
for preview_heading, preview_df in (
    ("\n--- Sample Sales Data ---", sales_df_enhanced),
    ("\n--- Sample Customer Data ---", customer_df),
):
    print(preview_heading)
    preview_df.show(5)