## Initialization


In [0]:
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.functions import trim, col
from pyspark.sql.types import StringType
     

## Read Bronze table

In [0]:
# Load the bronze-layer table that every later cell transforms.
df = spark.read.table("workspace.bronze.erp_cust_az12")

## Trimming

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
# The column list is taken from the schema once, before any column is replaced.
string_columns = [
    f.name for f in df.schema.fields if isinstance(f.dataType, StringType)
]
for column_name in string_columns:
    df = df.withColumn(column_name, trim(col(column_name)))
     

## Customer ID Cleanup

In [0]:
# Inspect the trimmed data before the customer-id cleanup.
display(df)

In [0]:
# Drop the leading "NAS" prefix from customer ids; ids without it are unchanged.
# An anchored regex replaces the when/substring combination because
# F.substring() only accepts a Column for its length argument on newer PySpark
# releases, whereas regexp_replace works the same on all of them.
# "^" matches only at the very start of the value, so a "NAS" that appears
# later in an id is left alone; null ids stay null.
df = df.withColumn("cid", F.regexp_replace(col("cid"), r"^NAS", ""))


In [0]:
# Inspect the data after the customer-id cleanup.
display(df)

In [0]:
# Per-column null profile: when() yields a non-null marker only for null cells,
# and count() counts non-null values, so each output column is a null tally.
null_count_exprs = []
for column_name in df.columns:
    null_marker = F.when(col(column_name).isNull(), column_name)
    null_count_exprs.append(F.count(null_marker).alias(column_name))

null_counts = df.select(null_count_exprs)
display(null_counts)

## Birthdate Validation

In [0]:
# Profile the range of birth dates to spot implausible values before validation.
# The result columns are labelled as birth dates; the previous "*_end_date"
# aliases mislabelled what the BDATE column holds.
display(
    df.agg(
        F.min(col("BDATE")).alias("min_birth_date"),
        F.max(col("BDATE")).alias("max_birth_date"),
    )
)

In [0]:
# Blank out birth dates that lie in the future.
# Only dates on or before today are kept. when() without otherwise() yields
# null for every other row, so future dates become null and null dates stay null.
not_in_future = col("bdate") <= F.current_date()
df = df.withColumn("bdate", F.when(not_in_future, col("bdate")))
     

## Gender Normalization

In [0]:
# Count of non-null values per column (count() skips nulls).
non_null_exprs = [F.count(col(name)).alias(name) for name in df.columns]
col_counts = df.agg(*non_null_exprs)
display(col_counts)

## Null in Gender

In [0]:
# Fill missing gender values with the most frequent non-null value (the mode).
#
# Nulls are filtered out before ranking: groupBy() keeps NULL as a group of its
# own, so if NULL were the largest group the "mode" would be None and the fill
# below would change nothing.
# NOTE(review): blank strings left behind by the trimming step are not treated
# as missing here — confirm whether they should be.
gen_mode_row = (
    df.filter(col("GEN").isNotNull())
      .groupBy("GEN")
      .count()
      .orderBy(F.desc("count"), F.asc("GEN"))  # second key makes ties deterministic
      .first()
)

# first() returns None when the column has no non-null value at all; in that
# case there is nothing to impute with and the column is left as it is.
mode_gen = gen_mode_row["GEN"] if gen_mode_row is not None else None
if mode_gen is not None:
    df = df.withColumn("GEN", F.coalesce(col("GEN"), F.lit(mode_gen)))
display(df)

## Null in bdate

In [0]:
# Forward-fill missing birth dates: every null takes the nearest preceding
# non-null value; leading nulls (nothing before them) stay null.
#
# This runs as a Spark window function instead of a toPandas() round trip, so
# the table is never collected onto the driver and the column types are not
# re-inferred by createDataFrame().
# A temporary id column pins the current row order so "preceding" means the
# same thing it did for the pandas ffill().
# NOTE(review): a window without partitionBy processes all rows in a single
# partition — acceptable for a small dimension table, confirm for larger ones.
# NOTE(review): the filled value comes from a neighbouring customer row, not
# from the customer itself — confirm this imputation is really intended.
ROW_ORDER_COL = "_row_order"
ffill_window = (
    Window.orderBy(ROW_ORDER_COL)
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df = (
    df.withColumn(ROW_ORDER_COL, F.monotonically_increasing_id())
      .withColumn("bdate", F.last(col("bdate"), ignorenulls=True).over(ffill_window))
      .drop(ROW_ORDER_COL)
)
display(df)

## Renaming Columns

In [0]:
# Source column name -> silver-layer column name.
RENAME_MAP = dict(
    cid="customer_number",
    bdate="birth_date",
    gen="gender",
)

# Apply the renames one at a time; withColumnRenamed is a no-op for a name
# that is not present in the frame.
for source_name in RENAME_MAP:
    df = df.withColumnRenamed(source_name, RENAME_MAP[source_name])
     

## Sanity checks of dataframe

In [0]:
# Show a 10-row sample of the final frame before it is written.
sample_rows = df.limit(10)
display(sample_rows)

## Writing Silver Table

In [0]:
# Persist the cleaned frame as a Delta table in the silver layer,
# replacing whatever the table held before.
(
    df.write
      .format("delta")
      .mode("overwrite")
      .saveAsTable("workspace.silver.erp_customers")
)

## Sanity checks of silver table

In [0]:
%sql
-- Spot-check the first rows of the freshly written silver table.
SELECT *
FROM workspace.silver.erp_customers
LIMIT 10