In [0]:
%load_ext autoreload
%autoreload 2

import sys
from config.table_config import PROJECT_PATH

# Make the project root importable, guarding against duplicate sys.path entries
# when this cell is re-run.
# NOTE(review): config.table_config is imported before this append runs, so the
# `config` package must already be importable at that point; the append
# presumably exists for `utils` or other project modules — confirm.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Star imports: names used later in this notebook (e.g. SILVER_CUSTOMERS_TABLE,
# CATALOG_NAME, GOLD_BASE, GoldTransformation, configure_spark) are not defined
# in this file and presumably come from these two packages — verify which one
# exports each before renaming or removing anything.
from config import *
from utils import *
from pyspark.sql.functions import col, year, current_date

# `spark` is not created in this notebook; presumably it is the session injected
# by the notebook runtime. What configure_spark() sets is defined in the project
# code and is not visible here.
configure_spark(spark)

In [0]:
def customers_dimension_transform(source_dfs):
    """
    Transform silver.customers to dim_customers.

    Args:
        source_dfs: Mapping of source table identifier to DataFrame. Must
            contain the SILVER_CUSTOMERS_TABLE key.

    Returns:
        A DataFrame projecting one row per input row with these columns:
        - Keys: customer_key (surrogate key, same as customer_id for
          Type 1 SCD) and customer_id (natural key)
        - Attributes: customer_name, email, phone, gender, dob, age,
          nationality
        - Geography: city, state, country, zipcode
        - Business: company
        - Source lineage: data_source

    Notes:
        age is the number of completed years between dob and the date the
        transform runs, stored as an integer (NULL when dob is NULL). It is
        a snapshot taken at run time and only changes when the table is
        rebuilt.
    """
    # Imported locally because the notebook's import cell only brings in
    # col, year and current_date from pyspark.sql.functions.
    from pyspark.sql.functions import floor, months_between

    customers_df = source_dfs[SILVER_CUSTOMERS_TABLE]

    # Subtracting calendar years alone overstates the age by one for everyone
    # whose birthday has not yet occurred this year. Counting whole months
    # between the two dates and dividing by 12 yields completed years instead.
    # The explicit cast to date keeps any time-of-day component in dob from
    # leaking into the month fraction; the final cast keeps the column an
    # integer, matching the type of the previous year-difference expression.
    birth_date = col("dob").cast("date")
    age_in_years = floor(months_between(current_date(), birth_date) / 12).cast("int")

    dim_customers = customers_df.select(
        # Keys
        col("customer_id").alias("customer_key"),
        col("customer_id"),

        # Attributes
        col("customer_name"),
        col("email"),
        col("phone"),
        col("gender"),
        col("dob"),
        age_in_years.alias("age"),
        col("nationality"),

        # Geography
        col("city"),
        col("state"),
        col("country"),
        col("zipcode"),

        # Business
        col("company"),

        # Source lineage
        col("data_source")
    )

    return dim_customers

In [0]:
# Build and run the gold-layer job for dim_customers.
# GoldTransformation is a project class not defined in this notebook; the notes
# below describe only what is passed to it, not what it does with the values.
GoldTransformation(
    spark=spark,
    table_display_name="dim_customers",
    table_type="dimension",
    source_tables=[SILVER_CUSTOMERS_TABLE],
    # The "gold" schema name is hard-coded here and again in gold_schema below;
    # keep the two in sync if the schema is ever renamed.
    target_table=f"{CATALOG_NAME}.gold.dim_customers"
).run(
    catalog_name=CATALOG_NAME,
    gold_schema="gold",
    gold_container=GOLD_BASE,
    # Callback defined earlier in this notebook; it indexes its argument by
    # SILVER_CUSTOMERS_TABLE, so run() presumably passes a mapping built from
    # source_tables — confirm against GoldTransformation.
    transform_logic=customers_dimension_transform,
    clustering_cols=["country", "state"],
    # NOTE(review): scd_type=1 with write_mode="overwrite" presumably replaces
    # the table contents on each run rather than merging — confirm.
    scd_type=1,
    write_mode="overwrite"
)