# Init

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim, length

# Reading From Bronze Table

In [0]:
# Source table in the bronze layer that this notebook cleans.
BRONZE_TABLE = "workspace.bronze.erp_cust_az12"

# Load the bronze table into a DataFrame; every later cell transforms `df`.
df = spark.table(BRONZE_TABLE)

# Data Transformations

## Column renaming

In [0]:
# Source column name -> target column name used from here on.
RENAME_MAP = dict(
    CID="customer_id",
    BDATE="birth_date",
    GEN="gender",
)

# Apply the renames one at a time, rebinding `df` after each step.
for source_column, target_column in RENAME_MAP.items():
    df = df.withColumnRenamed(source_column, target_column)

## Birth date validation against the current date

In [0]:
# Null out birth dates that lie after today's date; all other values
# (including existing nulls) are kept as they are.
birth_date_col = col("birth_date")
is_future_birth = birth_date_col > F.current_date()

df = df.withColumn(
    "birth_date",
    F.when(is_future_birth, F.lit(None)).otherwise(birth_date_col),
)

## Customer ID cleanup - removing the leading "NAS" prefix from every row

In [0]:
# Strip a leading "NAS" prefix from customer_id; values without the prefix
# (and nulls) pass through unchanged.
# - The value is trimmed first so that padded values such as " NAS123" are
#   still matched; the notebook's generic trimming step only runs after this
#   cell, so without the trim such rows would keep their prefix.
# - An anchored regexp_replace replaces the when/startswith/substring
#   construct. NOTE(review): F.substring appears to accept a Column for its
#   length argument only in newer PySpark releases - confirm against the
#   runtime in use; regexp_replace with literal arguments avoids the question.
df = df.withColumn(
    "customer_id",
    F.regexp_replace(F.trim(col("customer_id")), "^NAS", ""),
)

## Trimming

In [0]:
# Collect the names of all string-typed columns from the current schema.
string_column_names = [
    schema_field.name
    for schema_field in df.schema.fields
    if isinstance(schema_field.dataType, StringType)
]

# Replace each string column with its whitespace-trimmed version.
for column_name in string_column_names:
    df = df.withColumn(column_name, trim(col(column_name)))

# Sanity check before writing to the Delta silver table

In [0]:
# Render the transformed DataFrame for a visual check before it is written.
# NOTE(review): .display() is provided by the notebook runtime, not by core
# PySpark - presumably Databricks; confirm if this runs elsewhere.
df.display()

In [0]:
# Persist the cleaned DataFrame as the silver Delta table, overwriting any
# existing contents of that table.
# saveAsTable() returns None, so its result is deliberately NOT assigned back
# to `df`; this keeps `df` usable if this cell is re-run or if later cells
# reference the DataFrame.
df.write.mode("overwrite").format("delta").saveAsTable("workspace.silver.erp_cust_az12")

# Sanity check for silver table

In [0]:
%sql
-- Peek at a handful of rows from the freshly written silver table.
SELECT *
FROM workspace.silver.erp_cust_az12
LIMIT 10