

## dimension products

In [0]:
# The products dimension is published as "maven_uc.gold_dlt.dim_products_v2" because a STREAMING_TABLE
# named "maven_uc.gold_dlt.dim_products" already exists and reusing that name makes the pipeline fail.

import dlt
from pyspark.sql.functions import col

@dlt.table(
    name="maven_uc.gold_dlt.dim_products_v2",
    comment="Dimension table for products"
)
def dim_products():
    """Build the products dimension from ``maven_uc.silver_dlt.products_silver``.

    The table is published under the name given to ``@dlt.table``
    (``dim_products_v2``), not under this function's name.

    Returns:
        A DataFrame with the product identifier and descriptive columns, the
        retail price and cost cast to ``double``, and ``ingestion_ts``.
    """
    products_silver = dlt.read("maven_uc.silver_dlt.products_silver")

    # Price and cost are cast to double; every other column passes through as-is.
    # NOTE(review): the casts carry no explicit alias, so the output column names
    # rely on Spark's naming of cast expressions — confirm the published schema.
    retail_price = col("product_retail_price").cast("double")
    unit_cost = col("product_cost").cast("double")

    return products_silver.select(
        "product_id",
        "product_brand",
        "product_name",
        "product_sku",
        retail_price,
        unit_cost,
        "ingestion_ts",
    )

## dimension regions

In [0]:
@dlt.table(
    name="maven_uc.gold_dlt.dim_regions",
    comment="Sales regions dimension"
)
@dlt.expect_or_drop("valid_region_id", "region_id IS NOT NULL")
def dim_regions():
    """Build the sales-regions dimension from ``maven_uc.silver_dlt.regions_silver``.

    The ``valid_region_id`` expectation requires ``region_id IS NOT NULL``.

    Returns:
        A DataFrame restricted to the region key, its descriptive columns and
        the ingestion timestamp.
    """
    # NOTE(review): dim_products selects `ingestion_ts` while this table selects
    # `ingestion_timestamp` — confirm both names match their silver schemas.
    region_columns = [
        "region_id",
        "sales_region",
        "sales_district",
        "ingestion_timestamp",
    ]
    regions_silver = dlt.read("maven_uc.silver_dlt.regions_silver")
    return regions_silver.select(*region_columns)

## dimension calendars

In [0]:
from pyspark.sql.functions import col, year, month, quarter, dayofmonth, to_date, trim

@dlt.table(
    name="maven_uc.gold_dlt.dim_calenders",
    comment="Dimension table for calendars"
)
# NOTE(review): the `valid_date` expectation below is currently disabled, so rows
# whose date fails to parse are not filtered out — confirm whether to restore it.
# @dlt.expect_or_drop("valid_date", "date IS NOT NULL")
def dim_calenders():
    """Build the calendar dimension from ``maven_uc.silver_dlt.calenders_silver``.

    The source ``date`` column is trimmed and parsed with the ``yyyy-MM-dd``
    pattern; day, month, quarter and year are derived from the parsed value.

    Returns:
        A DataFrame with the columns ``date``, ``day``, ``month``, ``quarter``
        and ``year``, in that order.
    """
    calendar_silver = dlt.read("maven_uc.silver_dlt.calenders_silver")

    # One parsed-date expression feeds every derived column.
    parsed_date = to_date(trim(col("date")), "yyyy-MM-dd")

    return calendar_silver.select(
        parsed_date.alias("date"),
        dayofmonth(parsed_date).alias("day"),
        month(parsed_date).alias("month"),
        quarter(parsed_date).alias("quarter"),
        year(parsed_date).alias("year"),
    )

## dimension customers

In [0]:
@dlt.table(
    name="maven_uc.gold_dlt.dim_customers",
    comment="Dimension table for customers"
)
@dlt.expect_or_drop("valid_customer_id", "customer_id IS NOT NULL")
def dim_customers():
    """Build the customers dimension from ``maven_uc.silver_dlt.customers_silver``.

    All source columns are kept; five of them are replaced in place with typed
    versions. The ``valid_customer_id`` expectation requires
    ``customer_id IS NOT NULL``.

    Returns:
        The silver customers DataFrame with ``yearly_income`` as
        ``decimal(12,2)``, ``homeowner`` and ``member_card`` as comparisons
        against ``"Yes"``, and ``birthdate`` / ``acct_open_date`` cast to date.
    """
    customers_silver = dlt.read("maven_uc.silver_dlt.customers_silver")

    # Replacement expressions keyed by the column they overwrite; applied in
    # insertion order.
    # NOTE(review): the two flags are true only for the exact string "Yes" —
    # confirm the silver layer encodes them that way (not e.g. "Y"/"N").
    typed_columns = {
        "yearly_income": col("yearly_income").cast("decimal(12,2)"),
        "homeowner": col("homeowner") == "Yes",
        "member_card": col("member_card") == "Yes",
        "birthdate": col("birthdate").cast("date"),
        "acct_open_date": col("acct_open_date").cast("date"),
    }

    customers_typed = customers_silver
    for column_name, expression in typed_columns.items():
        customers_typed = customers_typed.withColumn(column_name, expression)
    return customers_typed

## dimension stores

In [0]:
import dlt
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

@dlt.table(
    name="maven_uc.gold_dlt.dim_stores",
    comment="Dimension table for stores"
)
@dlt.expect_or_drop("valid_store_id", "store_id IS NOT NULL")
def dim_stores():
    """Build the stores dimension from ``maven_uc.silver_dlt.stores_silver``.

    The ``valid_store_id`` expectation requires ``store_id IS NOT NULL``.

    Returns:
        A DataFrame with the store key, region key and descriptive columns,
        followed by the two date columns converted with ``to_date`` and the two
        square-footage columns cast to integer.
    """
    stores_silver = dlt.read("maven_uc.silver_dlt.stores_silver")

    # Typed columns are built directly in the projection and aliased back to
    # their source names.
    first_opened = to_date(col("first_opened_date")).alias("first_opened_date")
    last_remodel = to_date(col("last_remodel_date")).alias("last_remodel_date")
    total_area = col("total_sqft").cast(IntegerType()).alias("total_sqft")
    grocery_area = col("grocery_sqft").cast(IntegerType()).alias("grocery_sqft")

    return stores_silver.select(
        "store_id",
        "region_id",
        "store_type",
        "store_name",
        "store_street_address",
        "store_city",
        "store_state",
        "store_country",
        "store_phone",
        first_opened,
        last_remodel,
        total_area,
        grocery_area,
    )