### 1. Loading from Existing Tables

* Catalog_name = "data_university"
* Schema_Name = "lakeflow"

In [0]:
# %sql

# -- Create and populate baby names table
# CREATE TABLE IF NOT EXISTS data_university.dlt.baby_names_prepared (
#     Year_Of_Birth INT,
#     First_Name STRING,
#     Count INT,
#     Gender STRING,
#     Ethnicity STRING
# ) USING DELTA;

# INSERT INTO data_university.dlt.baby_names_prepared VALUES
# (2021, 'Emma', 1250, 'F', 'WHITE NON HISPANIC'),
# (2021, 'Liam', 1180, 'M', 'WHITE NON HISPANIC'),
# (2021, 'Olivia', 1150, 'F', 'HISPANIC'),
# (2021, 'Noah', 1100, 'M', 'BLACK NON HISPANIC'),
# (2021, 'Ava', 1050, 'F', 'ASIAN AND PACIFIC ISLANDER'),
# (2020, 'Emma', 1300, 'F', 'WHITE NON HISPANIC'),
# (2020, 'Liam', 1200, 'M', 'WHITE NON HISPANIC'),
# (2019, 'Emma', 1180, 'F', 'WHITE NON HISPANIC');


In [0]:
# import dlt
# from pyspark.sql.functions import *

# @dlt.table(
#     comment="Top baby names for 2021 with count aggregation"
# )
# def top_baby_names_2021():
#     return (
#         spark.read.table("data_university.dlt.baby_names_prepared")
#             .filter(expr("Year_Of_Birth == 2021"))
#             .groupBy("First_Name")
#             .agg(sum("Count").alias("Total_Count"))
#             .sort(desc("Total_Count"))
#     )


In [0]:
# import dlt
# from pyspark.sql.functions import *

# @dlt.table(
#     name="data_university.dlt.top_baby_names_2025",
#     comment="Top baby names for 2021 with count aggregation"
# )
# def top_baby_names_2021():
#     return (
#         spark.read.table("data_university.dlt.baby_names_prepared")
#             .filter(expr("Year_Of_Birth == 2021"))
#             .groupBy("First_Name")
#             .agg(sum("Count").alias("Total_Count"))
#             .sort(desc("Total_Count"))
#     )

### 2. Loading from Cloud Object Storage with Auto Loader

#### JSON Files from S3

In [0]:
# s3://mybucket/customers/
# ├── 2024/
# │   ├── 01/
# │   │   ├── 01/
# │   │   │   ├── customers_001.json
# │   │   │   └── customers_002.json
# │   │   └── 02/
# │   │       ├── customers_003.json
# │   │       └── customers_004.json

In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, TimestampType

# Explicit schema for the customer JSON records; every field is declared nullable.
customer_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("name", StringType(), True)
    .add("email", StringType(), True)
    .add("phone", StringType(), True)
    # Nested struct holding the street/city/state/zip parts of the address.
    .add(
        "address",
        StructType()
        .add("street", StringType(), True)
        .add("city", StringType(), True)
        .add("state", StringType(), True)
        .add("zip", StringType(), True),
        True,
    )
    # Kept as a string here; can be TimestampType() if parsed.
    .add("registration_date", StringType(), True)
    .add("status", StringType(), True)
    # Nested struct of boolean preference flags.
    .add(
        "preferences",
        StructType()
        .add("newsletter", BooleanType(), True)
        .add("sms_notifications", BooleanType(), True),
        True,
    )
)


@dlt.table(
    comment="Raw customer data from S3 using Auto Loader"
)
def customers_raw():
    """Stream customer JSON files from S3 through Auto Loader.

    Returns:
        A streaming DataFrame read with the ``cloudFiles`` source, typed with
        the explicit ``customer_schema`` defined in this notebook.
    """
    source_glob = "s3://one-env-uc-external-location/demo_sourav/customer/2024/01/01/*.json"

    reader = spark.readStream.format("cloudFiles")
    # The schema is supplied explicitly, so no inference option is set.
    reader = reader.option("cloudFiles.format", "json").schema(customer_schema)
    reader = reader.option("cloudFiles.schemaLocation", "/tmp/schema/customers")
    return reader.load(source_glob)

#### JSON Files from Volume

In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, TimestampType

# Explicit schema for the customer JSON records; every field is declared nullable.
customer_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("name", StringType(), True)
    .add("email", StringType(), True)
    .add("phone", StringType(), True)
    # Nested struct holding the street/city/state/zip parts of the address.
    .add(
        "address",
        StructType()
        .add("street", StringType(), True)
        .add("city", StringType(), True)
        .add("state", StringType(), True)
        .add("zip", StringType(), True),
        True,
    )
    # Kept as a string here; can be TimestampType() if parsed.
    .add("registration_date", StringType(), True)
    .add("status", StringType(), True)
    # Nested struct of boolean preference flags.
    .add(
        "preferences",
        StructType()
        .add("newsletter", BooleanType(), True)
        .add("sms_notifications", BooleanType(), True),
        True,
    )
)


@dlt.table(
    comment="Raw customer data from a Unity Catalog volume using Auto Loader"
)
def customers_raw_volume():
    """Stream customer JSON files from a Unity Catalog volume through Auto Loader.

    Returns:
        A streaming DataFrame read with the ``cloudFiles`` source, typed with
        the explicit ``customer_schema`` defined in this notebook.
    """
    return (
        spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "json")
            .schema(customer_schema)
            # Use a schema location dedicated to this stream. The S3-based
            # customers_raw table uses /tmp/schema/customers, and two
            # independent Auto Loader streams should not share one location.
            .option("cloudFiles.schemaLocation", "/tmp/schema/customers_volume")
            .load("/Volumes/data_university/lakeflow/demo_volume/customer/2024/01/01/*.json")
    )
    

#### CSV Files from Unity Catalog Volume

In [0]:
# @dlt.table(
#     comment="Customer data from Unity Catalog volume"
# )
# def customers_csv():
#     return (
#         spark.readStream.format("cloudFiles")
#             .option("cloudFiles.format", "csv")
#             .option("header", "true")
#             .option("cloudFiles.inferColumnTypes", "true")
#             .load("/Volumes/data_university/lakeflow/customer")
#     )

### 3. Loading from Message Buses

####  Apache Kafka
Sample Kafka Topic: user_events

Configure your Kafka topic with these sample messages:

In [0]:
# // Message 1 - Page View Event
# {
#   "key": "user_123",
#   "value": {
#     "event_id": "evt_001",
#     "user_id": "user_123",
#     "event_type": "page_view",
#     "page_url": "/products/laptop",
#     "timestamp": "2024-01-01T10:30:00Z",
#     "session_id": "sess_abc123",
#     "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
#     "ip_address": "192.168.1.100"
#   }
# }

# // Message 2 - Purchase Event
# {
#   "key": "user_456",
#   "value": {
#     "event_id": "evt_002",
#     "user_id": "user_456",
#     "event_type": "purchase",
#     "product_id": "prod_laptop_001",
#     "amount": 1299.99,
#     "timestamp": "2024-01-01T10:35:00Z",
#     "session_id": "sess_def456",
#     "payment_method": "credit_card"
#   }
# }

# // Message 3 - Cart Add Event
# {
#   "key": "user_789",
#   "value": {
#     "event_id": "evt_003",
#     "user_id": "user_789",
#     "event_type": "cart_add",
#     "product_id": "prod_mouse_001",
#     "quantity": 2,
#     "timestamp": "2024-01-01T10:40:00Z",
#     "session_id": "sess_ghi789"
#   }
# }


#### DLT Code

In [0]:
# import dlt
# from pyspark.sql.functions import *

# @dlt.table(
#     comment="Raw events from Kafka topic"
# )
# def kafka_events():
#     return (
#         spark.readStream
#             .format("kafka")
#             .option("kafka.bootstrap.servers", "kafka_server:9092")
#             .option("subscribe", "user_events")
#             .option("kafka.security.protocol", "SASL_SSL")
#             .option("kafka.sasl.mechanism", "PLAIN")
#             .load()
#             .select(
#                 col("key").cast("string"),
#                 col("value").cast("string"),
#                 col("topic"),
#                 col("partition"),
#                 col("offset"),
#                 col("timestamp")
#             )
#     )

# # Parse the JSON value from Kafka messages
# @dlt.table(
#     comment="Parsed user events from Kafka"
# )
# def parsed_user_events():
#     return (
#         dlt.read_stream("kafka_events")
#             .select(
#                 col("key").alias("user_key"),
#                 from_json(col("value"), schema="event_id STRING, user_id STRING, event_type STRING, page_url STRING, product_id STRING, amount DOUBLE, quantity INT, timestamp TIMESTAMP, session_id STRING").alias("event_data")
#             )
#             .select("user_key", "event_data.*")
#     )


#### Amazon Kinesis
Sample Kinesis Stream Data:

Set up your Kinesis stream with these sample records:

In [0]:
# // Record 1
# {
#   "recordId": "rec_001",
#   "data": {
#     "sensor_id": "temp_sensor_01",
#     "location": "warehouse_a",
#     "temperature": 22.5,
#     "humidity": 45.2,
#     "timestamp": "2024-01-01T10:30:00Z",
#     "alert_threshold_exceeded": false
#   }
# }

# // Record 2
# {
#   "recordId": "rec_002",
#   "data": {
#     "sensor_id": "temp_sensor_02",
#     "location": "warehouse_b",
#     "temperature": 28.1,
#     "humidity": 52.8,
#     "timestamp": "2024-01-01T10:30:30Z",
#     "alert_threshold_exceeded": true
#   }
# }

# // Record 3
# {
#   "recordId": "rec_003",
#   "data": {
#     "sensor_id": "temp_sensor_03",
#     "location": "warehouse_c",
#     "temperature": 20.8,
#     "humidity": 40.1,
#     "timestamp": "2024-01-01T10:31:00Z",
#     "alert_threshold_exceeded": false
#   }
# }


#### DLT Code

In [0]:
# @dlt.table(
#     comment="Streaming IoT sensor data from Kinesis"
# )
# def kinesis_iot_data():
#     return (
#         spark.readStream
#             .format("kinesis")
#             .option("streamName", "iot-sensor-stream")
#             .option("region", "us-west-2")
#             .option("initialPosition", "TRIM_HORIZON")
#             .load()
#     )

#### Azure Event Hubs
Sample Event Hub Messages:

Configure your Event Hub with these sample messages:

In [0]:
# // Message 1
# {
#   "messageId": "msg_001",
#   "body": {
#     "device_id": "device_001",
#     "device_type": "temperature_sensor",
#     "reading": 23.7,
#     "unit": "celsius",
#     "location": {
#       "building": "A",
#       "floor": 2,
#       "room": "201"
#     },
#     "timestamp": "2024-01-01T10:30:00Z"
#   },
#   "properties": {
#     "priority": "normal",
#     "source": "iot_gateway_01"
#   }
# }

# // Message 2
# {
#   "messageId": "msg_002",
#   "body": {
#     "device_id": "device_002",
#     "device_type": "humidity_sensor",
#     "reading": 65.3,
#     "unit": "percentage",
#     "location": {
#       "building": "B",
#       "floor": 1,
#       "room": "101"
#     },
#     "timestamp": "2024-01-01T10:30:15Z"
#   },
#   "properties": {
#     "priority": "high",
#     "source": "iot_gateway_02"
#   }
# }


Pipeline Code:

In [0]:
# @dlt.table(
#     comment="Events from Azure Event Hubs"
# )
# def eventhub_data():
#     connection_string = "Endpoint=sb://namespace.servicebus.windows.net/;SharedAccessKeyName=policy;SharedAccessKey=key"
#     eventhub_name = "iot-events"
    
#     return (
#         spark.readStream
#             .format("eventhubs")
#             .option("eventhubs.connectionString", connection_string)
#             .option("eventhubs.eventHubName", eventhub_name)
#             .load()
#     )

### 4. Loading from External Systems

#### PostgreSQL Database
Sample PostgreSQL Schema and Data:

In [0]:
# -- Create PostgreSQL customers table
# CREATE TABLE customers (
#     customer_id VARCHAR(50) PRIMARY KEY,
#     name VARCHAR(100) NOT NULL,
#     email VARCHAR(100) UNIQUE,
#     phone VARCHAR(20),
#     address_line1 VARCHAR(200),
#     address_line2 VARCHAR(200),
#     city VARCHAR(50),
#     state VARCHAR(20),
#     postal_code VARCHAR(10),
#     country VARCHAR(50),
#     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
#     updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
#     is_active BOOLEAN DEFAULT TRUE
# );

# -- Insert sample data
# INSERT INTO customers VALUES
# ('CUST001', 'John Smith', 'john.smith@email.com', '+1-555-0123', 
#  '123 Main St', 'Apt 4B', 'New York', 'NY', '10001', 'USA', 
#  '2024-01-01 10:30:00', '2024-01-01 10:30:00', TRUE),
# ('CUST002', 'Sarah Johnson', 'sarah.j@email.com', '+1-555-0124', 
#  '456 Oak Ave', NULL, 'Los Angeles', 'CA', '90210', 'USA', 
#  '2024-01-01 11:15:00', '2024-01-01 11:15:00', TRUE),
# ('CUST003', 'Mike Davis', 'mike.davis@email.com', '+1-555-0125', 
#  '789 Pine St', 'Suite 200', 'Chicago', 'IL', '60601', 'USA', 
#  '2024-01-01 12:00:00', '2024-01-01 12:00:00', FALSE);


Pipeline Code:

In [0]:
# @dlt.table(
#     comment="Customer data from PostgreSQL"
# )
# def postgres_customers():
#     return (
#         spark.read
#             .format("postgresql")
#             .option("dbtable", "customers")
#             .option("host", "postgres-server.example.com")
#             .option("port", 5432)
#             .option("database", "retail_db")
#             .option("user", dbutils.secrets.get("db_scope", "username"))
#             .option("password", dbutils.secrets.get("db_scope", "password"))
#             .load()
#     )


#### REST API with Custom Source
Sample API Response from https://api.example.com/orders:

In [0]:
# {
#   "status": "success",
#   "data": [
#     {
#       "id": "order_001",
#       "customer_id": "CUST001",
#       "order_date": "2024-01-01T10:30:00Z",
#       "status": "completed",
#       "items": [
#         {
#           "product_id": "PROD001",
#           "name": "Laptop",
#           "quantity": 1,
#           "price": 1299.99
#         }
#       ],
#       "total_amount": 1299.99,
#       "shipping_address": {
#         "street": "123 Main St",
#         "city": "New York",
#         "state": "NY",
#         "zip": "10001"
#       }
#     },
#     {
#       "id": "order_002",
#       "customer_id": "CUST002",
#       "order_date": "2024-01-01T11:15:00Z",
#       "status": "pending",
#       "items": [
#         {
#           "product_id": "PROD002",
#           "name": "Mouse",
#           "quantity": 2,
#           "price": 29.99
#         }
#       ],
#       "total_amount": 59.98,
#       "shipping_address": {
#         "street": "456 Oak Ave",
#         "city": "Los Angeles",
#         "state": "CA",
#         "zip": "90210"
#       }
#     }
#   ],
#   "pagination": {
#     "page": 1,
#     "per_page": 100,
#     "total": 2
#   }
# }


Pipeline Code:

In [0]:
# import requests
# import json

# @dlt.table(
#     comment="Order data loaded from REST API"
# )
# def api_orders():
#     # Custom function to fetch data from API
#     def fetch_api_data():
#         headers = {"Authorization": f"Bearer {dbutils.secrets.get('api_scope', 'token')}"}
#         response = requests.get("https://api.example.com/orders", headers=headers)
#         return response.json()["data"]
    
#     # Convert to DataFrame
#     data = fetch_api_data()
#     return spark.createDataFrame(data)


### 5. Loading Static/Small Datasets
#### Reference Data Sources

Set up the static files:

In [0]:
# # Upload JSON reference data to DBFS
# dbutils.fs.put("/FileStore/reference/product_categories.json", 
# '''
# {
#   "categories": [
#     {
#       "category_id": "CAT001",
#       "name": "Electronics",
#       "description": "Electronic devices and accessories",
#       "parent_category": null,
#       "subcategories": ["CAT001_01", "CAT001_02"]
#     },
#     {
#       "category_id": "CAT001_01",
#       "name": "Computers",
#       "description": "Desktop and laptop computers",
#       "parent_category": "CAT001",
#       "subcategories": []
#     },
#     {
#       "category_id": "CAT001_02",
#       "name": "Accessories",
#       "description": "Computer accessories and peripherals",
#       "parent_category": "CAT001",
#       "subcategories": []
#     },
#     {
#       "category_id": "CAT002",
#       "name": "Clothing",
#       "description": "Apparel and fashion items",
#       "parent_category": null,
#       "subcategories": ["CAT002_01", "CAT002_02"]
#     }
#   ]
# }
# ''', True)

# # Upload CSV lookup data
# dbutils.fs.put("/FileStore/lookup/country_codes.csv",
# '''
# country_code,country_name,region,currency_code
# US,United States,North America,USD
# CA,Canada,North America,CAD
# GB,United Kingdom,Europe,GBP
# DE,Germany,Europe,EUR
# FR,France,Europe,EUR
# JP,Japan,Asia,JPY
# AU,Australia,Oceania,AUD
# BR,Brazil,South America,BRL
# IN,India,Asia,INR
# CN,China,Asia,CNY
# ''', True)

In [0]:
@dlt.table(
    comment="Static reference data from JSON file"
)
def product_categories():
    """Load the product-category reference file as one row per category.

    Returns:
        A batch DataFrame whose columns are the fields of each entry under the
        file's top-level ``categories`` key.
    """
    json_reader = spark.read.format("json").option("multiline", "true")
    raw_document = json_reader.load("/FileStore/reference/product_categories.json")

    # Emit one row per entry of `categories`, held in a single `category` column ...
    per_category = raw_document.select(explode(col("categories")).alias("category"))
    # ... then promote that column's fields to top-level columns.
    return per_category.select("category.*")

@dlt.table(
    comment="Country lookup table from CSV"
)
def country_codes():
    """Load the country-code lookup CSV as a batch DataFrame.

    Returns:
        A DataFrame read from the CSV with the first row treated as the header
        and column types inferred by Spark.
    """
    csv_reader = spark.read.format("csv")
    csv_reader = csv_reader.option("header", "true")
    csv_reader = csv_reader.option("inferSchema", "true")
    return csv_reader.load("/FileStore/lookup/country_codes.csv")