In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *

## **Read Orders Data From Bronze Layer**

In [0]:
# Load the raw orders dataset (Parquet files) from the bronze layer.
orders = spark.read.format("parquet").load("/Volumes/sampleproject/practice/bronze/orders")

#### **Transformations**

In [0]:
# Step 1 - standardise the string keys: strip surrounding whitespace from the
# ids (and upper-case product_id), then drop rows that are exact duplicates
# across all columns.
orders_clean = (
    orders
    .withColumn("customer_id", trim("customer_id"))
    .withColumn("order_id", trim("order_id"))
    .withColumn("product_id", upper(trim("product_id")))
    .dropDuplicates()
)

# Step 2 - cast the measure and date columns to their target types.
orders_casted = (
    orders_clean
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("total_amount", col("total_amount").cast("double"))
    .withColumn("order_date", col("order_date").cast("date"))
)

# Step 3 - null handling: a row without an order_id is discarded, missing
# measures become zero, and a missing customer_id is labelled "Unknown".
orders_nulls_handled = (
    orders_casted
    .na.drop(subset=["order_id"])
    .na.fill({"quantity": 0, "total_amount": 0.0})
    .withColumn("customer_id", coalesce(col("customer_id"), lit("Unknown")))
)

# Ranking window: orders are ranked inside each order year, highest
# total_amount first. The "year" column is added in the chain below, before
# the window is applied.
windowSpec = Window.partitionBy("year").orderBy(col("total_amount").desc())

# Step 4 - derive the order year, the per-year dense rank by total_amount,
# and a processing timestamp.
orders_final = (
    orders_nulls_handled
    .withColumn("year", year(col("order_date")))
    .withColumn("rnk", dense_rank().over(windowSpec))
    .withColumn("processed_at", current_timestamp())
)

## **Write Transformed Orders Data Into Silver Layer**

In [0]:
# Persist the transformed orders to the silver layer as a Delta table,
# replacing whatever is already stored at that location.
(
    orders_final.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/sampleproject/practice/silver/orders")
)

## **Read Customers Data From Bronze Layer**

In [0]:
# Load the raw customers dataset (Parquet files) from the bronze layer.
customers = spark.read.format("parquet").load("/Volumes/sampleproject/practice/bronze/customers")

#### **Transformations**

In [0]:
# Step 1 - normalise the text columns (trim whitespace, title-case names and
# city, lower-case email) and keep a single row per customer_id.
customers_clean = (
    customers
    .withColumn("customer_id", trim(col("customer_id")))
    .withColumn("first_name", initcap(trim(col("first_name"))))
    .withColumn("last_name", initcap(trim(col("last_name"))))
    .withColumn("email", lower(trim(col("email"))))
    .withColumn("city", initcap(trim(col("city"))))
    .dropDuplicates(["customer_id"])
)

# Step 2 - rows without a customer_id are dropped; missing name parts are
# replaced with "Unknown" so the concatenation below never yields null.
customers_nulls = (
    customers_clean
    .dropna(subset=["customer_id"])
    .withColumn("first_name", coalesce(col("first_name"), lit("Unknown")))
    .withColumn("last_name", coalesce(col("last_name"), lit("Unknown")))
)

# Step 3 - derive full_name and email_domain, then drop the name parts.
# The domain is only extracted when the email actually contains "@": for an
# address without "@" the split array has a single element, and indexing [1]
# can raise an out-of-bounds error when ANSI mode is enabled instead of
# returning null. Guarding with when() (no otherwise) yields null for those
# rows and for null emails, which matches the non-ANSI result.
customers_enriched = (
    customers_nulls
    .withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name")))
    .withColumn(
        "email_domain",
        when(col("email").contains("@"), split(col("email"), "@")[1]),
    )
    .drop("first_name", "last_name")
)

# Step 4 - simple email sanity flag: True when the email contains "@" and is
# longer than 5 characters, False otherwise (including null emails).
customers_checked = customers_enriched.withColumn(
    "is_valid_email",
    when((col("email").contains("@")) & (length(col("email")) > 5), True)
    .otherwise(False),
)

## **Write Transformed Customers Data Into Silver Layer**

In [0]:
# Persist the transformed customers to the silver layer as a Delta table,
# replacing whatever is already stored at that location.
(
    customers_checked.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/sampleproject/practice/silver/customers")
)

## **Read Products Data From Bronze Layer**

In [0]:
# Load the raw products dataset (Parquet files) from the bronze layer.
products = spark.read.format("parquet").load("/Volumes/sampleproject/practice/bronze/products")

#### **Transformations**

In [0]:
# Step 1 - normalise the text columns and keep one row per product.
# product_id is trimmed AND upper-cased so that it lines up with the orders
# table, whose product_id is also upper(trim(...)); without this, joins on
# product_id would miss ids that differ only by letter case.
# Rows with a null product_id are dropped (as is done for the key columns of
# the orders, customers and regions tables) before de-duplicating on the key.
products_clean = (
    products
    .withColumn("product_id", upper(trim(col("product_id"))))
    .withColumn("product_name", initcap(trim(col("product_name"))))
    .withColumn("brand", initcap(trim(col("brand"))))
    .withColumn("category", upper(trim(col("category"))))
    .dropna(subset=["product_id"])
    .dropDuplicates(["product_id"])
)

# Step 2 - cast price to double and replace missing values with defaults.
# A missing price becomes 0.0, which is flagged as INVALID_PRICE below.
products_safe = (
    products_clean
    .withColumn("price", col("price").cast(DoubleType()))
    .fillna({"price": 0.0})
    .withColumn("product_name", coalesce(col("product_name"), lit("Unknown Product")))
    .withColumn("brand", coalesce(col("brand"), lit("Generic")))
    .withColumn("category", coalesce(col("category"), lit("UNCATEGORIZED")))
)

# Step 3 - bucket products by price: < 50 Budget, 50 to < 200 Mid-Range,
# everything else Premium. The second branch is only reached when the first
# one did not match, so it does not need to re-test the lower bound.
products_enriched = products_safe.withColumn(
    "price_tier",
    when(col("price") < 50, "Budget")
    .when(col("price") < 200, "Mid-Range")
    .otherwise("Premium"),
)

# Step 4 - data-quality flag: a zero or negative price is marked invalid.
products_final = products_enriched.withColumn(
    "data_quality_flag",
    when(col("price") <= 0, "INVALID_PRICE").otherwise("OK"),
)


## **Write Transformed Products Data Into Silver Layer**

In [0]:
# Persist the transformed products to the silver layer as a Delta table,
# replacing whatever is already stored at that location.
(
    products_final.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/sampleproject/practice/silver/products")
)

## **Read Regions Data From Bronze Layer**

In [0]:
# Load the raw regions dataset (Parquet files) from the bronze layer.
regions = spark.read.format("parquet").load("/Volumes/sampleproject/practice/bronze/regions")

#### **Transformations**

In [0]:
# Step 1 - trim the id and name columns, then normalise the region name:
# the literal value "TWest" is mapped to "West" (presumably a known typo in
# the source data - confirm), every other value is title-cased.
regions_clean = (
    regions
    .withColumn("region_id", trim("region_id"))
    .withColumn("region", trim("region"))
    .withColumn(
        "region",
        when(col("region") == "TWest", "West").otherwise(initcap(col("region"))),
    )
)

# Step 2 - one row per region_id, no null ids, and a placeholder name for
# regions whose name is missing.
regions_safe = (
    regions_clean
    .dropDuplicates(["region_id"])
    .na.drop(subset=["region_id"])
    .na.fill({"region": "Unknown Region"})
)

# Step 3 - derived attributes: region_code is the first three characters of
# the region name in upper case; zone_type groups North/South as "Vertical",
# East/West as "Horizontal" and anything else as "Other".
regions_final = (
    regions_safe
    .withColumn("region_code", upper(col("region").substr(1, 3)))
    .withColumn(
        "zone_type",
        when(col("region").isin(["North", "South"]), "Vertical")
        .when(col("region").isin(["East", "West"]), "Horizontal")
        .otherwise("Other"),
    )
)


## **Write Transformed Regions Data Into Silver Layer**

In [0]:
# Persist the transformed regions to the silver layer as a Delta table,
# replacing whatever is already stored at that location.
(
    regions_final.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/sampleproject/practice/silver/regions")
)