In [0]:
%load_ext autoreload
%autoreload 2

import sys
# NOTE: this import runs before sys.path is extended below, so the `config`
# package must already be importable when the cell starts.
from config.table_config import PROJECT_PATH

# Make the project root importable; the membership check keeps re-running
# this cell from appending the same entry to sys.path again.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Wildcard imports: names used later in this notebook that are not defined
# here (SILVER_SELLERS_TABLE, CATALOG_NAME, GOLD_BASE, GoldTransformation,
# configure_spark) presumably come from these two modules — TODO confirm.
# Order matters: anything exported by both is taken from `utils`.
from config import *
from utils import *
from pyspark.sql.functions import col, datediff, current_date

# `spark` is not created anywhere in this notebook; presumably the runtime
# injects the session as a global — verify when running elsewhere.
configure_spark(spark)

In [0]:
def sellers_dimension_transform(source_dfs):
    """
    Project the silver sellers table into the dim_sellers column layout.

    Args:
        source_dfs: Collection indexed by source table name; the entry
            stored under SILVER_SELLERS_TABLE is the DataFrame projected.

    Returns:
        The DataFrame produced by a single ``select`` with these columns,
        in this order:
        seller_key, seller_id, company_name, business_type, rating,
        active_date, days_active, address, state, country, zipcode,
        data_source.

    Notes:
        - seller_key is a plain copy of seller_id (the natural key).
        - days_active is the only derived column: the day difference
          between current_date() and seller_active_date, so its value
          depends on the date the transform runs.
    """
    silver_sellers = source_dfs[SILVER_SELLERS_TABLE]

    # Key columns: the natural key is exposed twice, once under the
    # dimension-key name and once under its original name.
    key_columns = [
        col("seller_id").alias("seller_key"),
        col("seller_id"),
    ]

    # Descriptive attributes, with the "seller_" prefix dropped.
    attribute_columns = [
        col("seller_company_name").alias("company_name"),
        col("seller_business_type").alias("business_type"),
        col("seller_rating").alias("rating"),
        col("seller_active_date").alias("active_date"),
    ]

    # Derived measure: days elapsed from seller_active_date to today.
    derived_columns = [
        datediff(current_date(), col("seller_active_date")).alias("days_active"),
    ]

    # Geography attributes.
    geography_columns = [
        col("seller_address").alias("address"),
        col("seller_state").alias("state"),
        col("seller_country").alias("country"),
        col("seller_zipcode").alias("zipcode"),
    ]

    # Source-system marker, carried through unchanged.
    lineage_columns = [col("data_source")]

    projection = (
        key_columns
        + attribute_columns
        + derived_columns
        + geography_columns
        + lineage_columns
    )
    return silver_sellers.select(*projection)

In [0]:
# Describe the gold-layer job: read the silver sellers table and target the
# dim_sellers table in the gold schema of the configured catalog.
dim_sellers_job = GoldTransformation(
    spark=spark,
    table_display_name="dim_sellers",
    table_type="dimension",
    source_tables=[SILVER_SELLERS_TABLE],
    target_table=f"{CATALOG_NAME}.gold.dim_sellers",
)

# Execute the job with sellers_dimension_transform as the projection logic.
# NOTE(review): scd_type=1 with write_mode="overwrite" presumably means the
# table is fully replaced on each run — confirm against GoldTransformation.
dim_sellers_job.run(
    catalog_name=CATALOG_NAME,
    gold_schema="gold",
    gold_container=GOLD_BASE,
    transform_logic=sellers_dimension_transform,
    clustering_cols=["country", "state"],
    scd_type=1,
    write_mode="overwrite",
)