# Init

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col

# Read Bronze table

In [0]:
# Load the raw ERP location table from the bronze layer.
df = spark.read.table("workspace.bronze.erp_loc_a101")

# Transformations

## Trimming

In [0]:
# Trim leading/trailing whitespace from every top-level string column.
# This is done in a single select() rather than one withColumn() call per
# column: each withColumn() adds its own projection, and the PySpark docs warn
# that calling it in a loop can produce very large query plans.
# Non-string columns are passed through unchanged; column order and names
# are preserved.
df = df.select(
    [
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
)

## Customer ID Cleanup

In [0]:
# Strip every hyphen from the customer ID.
cid_without_hyphens = F.regexp_replace(F.col("cid"), "-", "")
df = df.withColumn("cid", cid_without_hyphens)

## Country Normalization - noticed some countries being spelled as a country code instead of their full name

In [0]:
%sql
-- List the distinct raw country values present in the bronze table.
SELECT DISTINCT CNTRY
FROM workspace.bronze.erp_loc_a101

In [0]:
# Map the country codes seen in the data to full country names. Empty strings
# and NULLs become "n/a"; any other value is kept as it is.
raw_country = col("CNTRY")
is_germany = raw_country == "DE"
is_united_states = raw_country.isin("US", "USA")
is_missing = raw_country.isNull() | (raw_country == "")

normalized_country = (
    F.when(is_germany, "Germany")
    .when(is_united_states, "United States")
    .when(is_missing, "n/a")
    .otherwise(raw_country)
)
df = df.withColumn("cntry", normalized_country)

## Renaming columns

In [0]:
# Source column name -> name used in the silver table.
RENAME_MAP = {
    "cid": "customer_number",
    "cntry": "country",
}

# Apply the renames one at a time, in the order they are declared above.
for source_column in RENAME_MAP:
    df = df.withColumnRenamed(source_column, RENAME_MAP[source_column])

# sanity check before writing to delta silver table

In [0]:
# Render the transformed DataFrame in the notebook for a visual check before it is written.
df.display()

# writing to silver table 

In [0]:
# Save the cleaned DataFrame as a Delta table in the silver schema,
# overwriting whatever the table currently holds.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("workspace.silver.erp_customer_location")

# sanity check for silver table

In [0]:
%sql
-- Peek at a few rows of the freshly written silver table.
SELECT *
FROM workspace.silver.erp_customer_location
LIMIT 10