## Transformation

**Importing Libraries**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:

from pyspark.sql import functions as F

months = ["Oct", "Nov"]

# Container roots are loop-invariant; only the month folder changes per pass.
_BRONZE_ROOT = "abfss://bronze@ecommerceprojectdl.dfs.core.windows.net"
_SILVER_ROOT = "abfss://silver@ecommerceprojectdl.dfs.core.windows.net"

# Sentinel written into the category columns when no value is available.
_UNKNOWN_FILL = {
    "category_code": "unknown",
    "main_category": "unknown",
    "sub_category": "unknown",
    "item_category": "unknown",
}


def _blank_to_null(column):
    """Trim a string column and turn empty / whitespace-only values into NULL.

    Trimming alone leaves whitespace-only input as an empty string, which
    neither ``coalesce`` nor ``fillna`` treats as missing.
    """
    trimmed = F.trim(column)
    # `when` without `otherwise` yields NULL when the condition is false or
    # NULL, so blank and NULL inputs both come out as NULL.
    return F.when(trimmed != "", trimmed)


def _to_silver(bronze_df):
    """Return the cleaned, de-duplicated silver DataFrame for one month.

    Casts the id/price columns, normalises the text columns, derives
    ``event_date`` / ``event_time_only`` from ``event_time``, splits
    ``category_code`` on "." into up to three category levels, drops exact
    duplicate rows and fills missing category values with "unknown".
    """
    cleaned = (
        bronze_df
        .withColumn("event_time", F.to_timestamp("event_time"))
        .withColumn("product_id", F.col("product_id").cast(LongType()))
        .withColumn("category_id", F.col("category_id").cast(LongType()))
        .withColumn("price", F.col("price").cast(DoubleType()))
        .withColumn("user_id", F.col("user_id").cast(LongType()))
        .withColumn("event_type", F.lower(F.trim(F.col("event_type"))))
        # Trim first, then default: a blank brand becomes "unknown", not "".
        .withColumn(
            "brand",
            F.lower(F.coalesce(_blank_to_null(F.col("brand")), F.lit("unknown"))),
        )
        # A blank category_code becomes NULL so the fill below applies to it
        # and to every level derived from it.
        .withColumn("category_code", _blank_to_null(F.col("category_code")))
        .withColumn("event_date", F.to_date("event_time"))
        .withColumn("event_time_only", F.date_format("event_time", "HH:mm:ss"))
    )

    # Temporary array column; try_element_at is 1-based and returns NULL for
    # a level that is not present instead of raising.
    with_levels = (
        cleaned
        .withColumn("parts", F.split(F.col("category_code"), r"\."))
        .withColumn("main_category", F.try_element_at(F.col("parts"), F.lit(1)))
        .withColumn("sub_category", F.try_element_at(F.col("parts"), F.lit(2)))
        .withColumn("item_category", F.try_element_at(F.col("parts"), F.lit(3)))
        .drop("parts")
    )

    return with_levels.dropDuplicates().fillna(_UNKNOWN_FILL)


for month in months:
    print(f"📂 Processing month: {month.upper()}")

    input_path = f"{_BRONZE_ROOT}/month_{month}"
    output_path = f"{_SILVER_ROOT}/month_{month}/"

    # 1️⃣ Read
    # NOTE(review): inferSchema costs an extra pass over the CSV input; it is
    # kept because columns that are not cast explicitly rely on it.
    df = (spark.read.format("csv")
          .option("header", True)
          .option("inferSchema", True)
          .load(input_path))

    # 2️⃣ Transform + 3️⃣ Fill Nulls
    df_silver = _to_silver(df)

    # 4️⃣ Write as Delta (replaces any previous output for this month)
    df_silver.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ {month.upper()} Silver layer saved to {output_path}")

print("🎯 All months successfully processed to Silver Layer!")


📂 Processing month: OCT
✅ OCT Silver layer saved to abfss://silver@ecommerceprojectdl.dfs.core.windows.net/month_Oct/
📂 Processing month: NOV
✅ NOV Silver layer saved to abfss://silver@ecommerceprojectdl.dfs.core.windows.net/month_Nov/
🎯 All months successfully processed to Silver Layer!
