In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *
import datetime

##### **LOAD BRONZE CUST_INFO TO SILVER CUST_INFO**

In [0]:
# Batch snapshot of the Bronze cust_info Delta table (spark.read, not a stream).
df_cust_info = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/bronze_crm_erp/bronze_volume/bronze_crm_data/cust_info/data/")
)

# Abbreviation -> full description lookups for marital status and gender.
marital_status_map = {"M": "Married", "S": "Single"}
gender_map = {"F": "Female", "M": "Male"}

# Cleansing and standardisation applied to the batch DataFrame for the Silver layer.
silver_df = (
    df_cust_info
    # Customer id as an integer.
    .withColumn("cst_id", col("cst_id").cast("int"))
    # cst_key is passed through unchanged.
    .withColumn("cst_key", col("cst_key"))
    # Strip leading/trailing whitespace from the name columns.
    .withColumn("cst_firstname", trim(col("cst_firstname")))
    .withColumn("cst_lastname", trim(col("cst_lastname")))
    # Marital status: upper-case and trim, expand the codes from the lookup,
    # then label nulls as "Unknown". Both steps are scoped to this column only.
    .withColumn("cst_marital_status", trim(upper(col("cst_marital_status"))))
    .replace(marital_status_map, subset=["cst_marital_status"])
    .fillna({"cst_marital_status": "Unknown"})
    # Gender: same treatment, scoped to 'cst_gndr' only.
    .withColumn("cst_gndr", trim(upper(col("cst_gndr"))))
    .replace(gender_map, subset=["cst_gndr"])
    .fillna({"cst_gndr": "Unknown"})
    # Creation date: string (yyyy-MM-dd) -> DateType.
    .withColumn("cst_create_date", to_date(col("cst_create_date"), "yyyy-MM-dd"))
    # Timestamp of when the row was processed into the DWH.
    .withColumn("DWH_create_date", current_timestamp())
    # Remove the '_rescued_data' column if it is present
    # (presumably added by Auto Loader during Bronze ingestion — confirm).
    .drop(col("_rescued_data"))
)


# Write transformed data to the Silver Delta table:
# - Append new records
# - Allow schema evolution if new columns appear. For an append this is the
#   `mergeSchema` option; `overwriteSchema` only applies to mode("overwrite"),
#   so the previous setting never enabled schema evolution here.
# NOTE(review): silver_df is built from a batch read of the whole Bronze table,
# so each re-run of this cell appends every row again — confirm this is intended.
(
    silver_df.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("append")
    .saveAsTable("workspace.silver_crm_erp.cust_info")
)

##### **LOAD BRONZE PRD_INFO TO SILVER PRD_INFO**

In [0]:
# Read the Bronze prd_info Delta table as a batch DataFrame (snapshot of current data).
df_prd_info = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/bronze_crm_erp/bronze_volume/bronze_crm_data/prd_info/data/")
)

# 'prd_key' split on '-'; the parts are used to derive 'cat_id' and the rebuilt 'prd_key'.
split_col = split(col("prd_key"), "-")

# Product line code -> description.
product_map = {
    "R": "Road",
    "S": "Other Sales",
    "M": "Mountain",
    "T": "Touring",
}

# Window per product key, ordered by product start date.
# NOTE: the spec is only applied in the `.over(...)` call further down, i.e.
# after 'prd_key' has been rebuilt and 'prd_start_dt' has been converted to a date.
window_spec = Window.partitionBy(col("prd_key")).orderBy(col("prd_start_dt"))

# Transformations for the Silver layer.
silver_prd_df = (
    df_prd_info
    # Product id as an integer.
    .withColumn("prd_id", col("prd_id").cast("int"))
    # Category id: first two parts of 'prd_key', joined with '_'.
    .withColumn("cat_id", concat_ws("_", split_col.getItem(0), split_col.getItem(1)))
    # Product key: all parts from the 3rd onward, re-joined with '-'.
    # NOTE(review): the slice length is size - 2, which is negative for a key
    # without any '-' separator — confirm such keys cannot occur in Bronze.
    .withColumn("prd_key", concat_ws("-", slice(split_col, 3, size(split_col) - 2)))
    # Product name is passed through unchanged.
    .withColumn("prd_nm", col("prd_nm"))
    # Cost as an integer; nulls become 0 (integer literal, matching the column type).
    .withColumn("prd_cost", col("prd_cost").cast("int"))
    .na.fill({"prd_cost": 0})
    # Product line: upper-case and trim, expand the codes, nulls -> "Unknown".
    .withColumn("prd_line", trim(upper(col("prd_line"))))
    .na.replace(product_map, subset=["prd_line"])
    .na.fill({"prd_line": "Unknown"})
    # Start date: string (yyyy-MM-dd) -> DateType.
    .withColumn("prd_start_dt", to_date(col("prd_start_dt"), "yyyy-MM-dd"))
    # End date: start date of the next row in the window; when there is no
    # next row the current date is used instead.
    .withColumn(
        "prd_end_dt",
        coalesce(lead(col("prd_start_dt"), 1).over(window_spec), current_date()),
    )
    # Timestamp marking when the row was processed.
    .withColumn("DWH_update_date", current_timestamp())
    # Remove the '_rescued_data' column if it is present.
    .drop(col("_rescued_data"))
)

"""
Reorder columns so that 'cat_id' appears right after 'prd_id'.
"""
def reorder_columns(silver_prd_df,insert_col,after_col):
  """Return the DataFrame with `insert_col` placed directly after `after_col`.

  All other columns keep their relative order.

  Args:
    silver_prd_df: DataFrame whose columns are reordered; only its `.columns`
      attribute and `.select(...)` method are used.
    insert_col: Name of the column to move.
    after_col: Name of the column that `insert_col` must follow.

  Returns:
    The result of `silver_prd_df.select(...)` on the reordered column list.

  Raises:
    ValueError: If `insert_col` and `after_col` are the same name, or if
      either of them is not a column of `silver_prd_df`.
  """
  columns = silver_prd_df.columns

  if insert_col == after_col:
    raise ValueError(f"insert_col and after_col must differ, got {insert_col!r} for both")

  missing = [name for name in (insert_col, after_col) if name not in columns]
  if missing:
    raise ValueError(f"Column(s) not found in DataFrame: {missing}")

  # Take 'insert_col' out of its current position so it is not duplicated.
  remaining = [name for name in columns if name != insert_col]

  # Position immediately after 'after_col'.
  anchor = remaining.index(after_col) + 1

  ordered = remaining[:anchor] + [insert_col] + remaining[anchor:]
  return silver_prd_df.select(ordered)

# Place 'cat_id' directly after 'prd_id' before writing.
silver_prd_df = reorder_columns(silver_prd_df, "cat_id", "prd_id")


# Write transformed data to the Silver Delta table:
#   - Append new records
#   - Allow schema evolution if new columns appear. For an append this is the
#     `mergeSchema` option; `overwriteSchema` only applies to mode("overwrite"),
#     so the previous setting never enabled schema evolution here.
# NOTE(review): silver_prd_df is built from a batch read of the whole Bronze
# table, so each re-run appends every row again — confirm this is intended.
(
    silver_prd_df.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("append")
    .saveAsTable("workspace.silver_crm_erp.prd_info")
)