In [0]:
%load_ext autoreload
%autoreload 2

import sys
from config.table_config import PROJECT_PATH

# Put the project root on the module search path (once) so the project-local
# imports below resolve; the membership check keeps re-runs of this cell from
# appending duplicates.
# NOTE(review): `config.table_config` is imported *before* this runs, so
# `config` must already be importable by some other means — confirm which of
# the later imports actually depends on this entry.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

from config import *
from utils import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, to_date, upper, trim, regexp_replace

# Apply the project's Spark settings to the notebook-provided session.
configure_spark(spark)

# Emit the run banner: a rule, the layer title, a rule, then source and target.
banner_rule = "=" * 60
for banner_line in (
    banner_rule,
    "SILVER LAYER: Customers Transformation",
    banner_rule,
    f"Source: {BRONZE_CUSTOMERS_TABLE}",
    f"Target: {SILVER_CUSTOMERS_TABLE}",
):
    log_info(banner_line)

In [0]:
def customers_transform(df: DataFrame) -> DataFrame:
    """
    Apply the customer-specific Silver-layer transformations.

    Every step overwrites the column of the same name:

    - ``dob``: parsed with ``to_date``.
      NOTE(review): no explicit format is passed, so parsing relies on Spark's
      default rules — confirm the bronze values match them.
    - ``country`` / ``state``: trimmed, then upper-cased.
    - ``customer_name`` / ``email`` / ``city``: trimmed.
    - ``credit_card_no``: every digit that still has at least four digits
      somewhere after it is replaced with ``*``, so only the last four digits
      remain readable. Non-digit separators are kept as they are, e.g.
      ``1234-5678-9012-3456`` -> ``****-****-****-3456``.
    - ``paypal_account``: every character that has at least three characters
      before it and an ``@`` somewhere after it is replaced with ``*``.
      NOTE(review): values with three or fewer characters before the ``@``,
      or with no ``@`` at all, pass through unmasked — confirm this is
      acceptable for the data.

    Args:
        df: DataFrame containing all of the columns listed above.

    Returns:
        DataFrame with the listed columns replaced by their transformed values.
    """
    # The lookahead skips non-digit separators (`\D*`) between digits. The
    # previous pattern `\d(?=\d{4})` required four *contiguous* digits after
    # the match, so card numbers written in groups ("1234-5678-...",
    # "1234 5678 ...") were never masked at all.
    card_mask_pattern = r"\d(?=(?:\D*\d){4})"

    # Keep the first three characters and everything from the last "@" onward.
    paypal_mask_pattern = r"(?<=.{3}).(?=.*@)"

    return (
        df
        .withColumn("dob", to_date(col("dob")))
        .withColumn("country", upper(trim(col("country"))))
        .withColumn("state", upper(trim(col("state"))))
        .withColumn("customer_name", trim(col("customer_name")))
        .withColumn("email", trim(col("email")))
        .withColumn("city", trim(col("city")))
        .withColumn(
            "credit_card_no",
            regexp_replace(col("credit_card_no"), card_mask_pattern, "*"),
        )
        .withColumn(
            "paypal_account",
            regexp_replace(col("paypal_account"), paypal_mask_pattern, "*"),
        )
    )

In [0]:
# Describe the job: which bronze table is read and which silver table is written.
customers_silver_job = SilverTransformation(
    spark=spark,
    table_display_name="CUSTOMERS",
    source_table=BRONZE_CUSTOMERS_TABLE,
    target_table=SILVER_CUSTOMERS_TABLE,
)

# Execute it. `.run(...)` stays the final expression of the cell, so whatever
# it returns is still what the notebook displays.
customers_silver_job.run(
    catalog_name=CATALOG_NAME,
    silver_schema=SILVER_SCHEMA,
    silver_container=SILVER_BASE,
    key_columns=["customer_id"],
    drop_cols=None,
    # `country` is upper-cased and trimmed by customers_transform.
    partition_cols=["country"],
    bronze_partition_col=None,
    custom_transform=customers_transform,
    # Parallel processing (presumably the Spark partition count used by the
    # job — confirm in SilverTransformation.run).
    num_partitions=16,
)