In [0]:
%run ./00_setup_and_config

In [0]:
# Load the bronze (raw) Delta table. `spark` and `raw_delta_path` are not defined
# in this notebook; presumably they come from the %run'd setup notebook — verify.
try:
    df_bronze = spark.read.format("delta").load(raw_delta_path)
except Exception as e:
    # Log which path failed, then re-raise with the original traceback intact so
    # the notebook stops here instead of continuing without a bronze frame.
    print(f"Error reading from {raw_delta_path}: {e}")
    raise

In [0]:
# Quick visual sanity check: print the first five bronze rows.
df_bronze.show(n=5)


In [0]:
from pyspark.sql.functions import col, lit, to_timestamp, to_date, when, dayofweek, dayofmonth, hour, month, year, weekofyear
from pyspark.sql.types import IntegerType, DoubleType, TimestampType

In [0]:
# Build the silver frame from bronze:
#   1. discard rows with a missing InvoiceNo or CustomerID,
#   2. cast Quantity / CustomerID to integers and UnitPrice to double,
#   3. parse InvoiceDate with the "M/d/yyyy H:mm" pattern into a timestamp,
#   4. keep only rows whose Quantity and UnitPrice are both strictly positive.
df_silver = (
    df_bronze
    .dropna(subset=["InvoiceNo", "CustomerID"])
    .withColumn("Quantity", col("Quantity").cast(IntegerType()))
    .withColumn("UnitPrice", col("UnitPrice").cast(DoubleType()))
    .withColumn("CustomerID", col("CustomerID").cast(IntegerType()))
    .withColumn("InvoiceDate", to_timestamp(col("InvoiceDate"), "M/d/yyyy H:mm"))
    .where((col("Quantity") > 0) & (col("UnitPrice") > 0))
)

In [0]:
# Confirm the casts took effect (schema) and eyeball the first five cleaned rows.
df_silver.printSchema()
df_silver.show(n=5)

In [0]:
# Data-quality check: count silver rows whose InvoiceDate is NULL, i.e. rows where
# the to_timestamp() parse in the cleaning step did not yield a value.
# InvoiceDate is a timestamp column at this point, so it can never equal the empty
# string; comparing it with "" only forces an implicit string-to-timestamp cast
# (which can raise under ANSI SQL mode), so only the NULL test is kept.
null_invoice_date_silver_count = df_silver.filter(
    col("InvoiceDate").isNull()
).count()
print(null_invoice_date_silver_count)

In [0]:
# Enrich the silver frame with a line total and calendar attributes derived from
# the InvoiceDate timestamp. Re-running this cell is harmless: withColumn replaces
# a column that already exists under the same name.
df_silver = (
    df_silver
    # Monetary value of the invoice line.
    .withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
    # Date-only view of the timestamp.
    .withColumn("Invoice_Date", to_date(col("InvoiceDate")))
    .withColumn("Hour_of_day", hour(col("InvoiceDate")))
    # Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    .withColumn("Day_of_week", dayofweek(col("InvoiceDate")))
    .withColumn("Day_of_month", dayofmonth(col("InvoiceDate")))
    .withColumn("Week", weekofyear(col("InvoiceDate")))
    # Month and Year are also used as partition columns when the table is written.
    .withColumn("Month", month(col("InvoiceDate")))
    .withColumn("Year", year(col("InvoiceDate")))
)
    

In [0]:
# Inspect the enriched frame: first five rows, then the schema including the
# newly derived columns.
df_silver.show(n=5)
df_silver.printSchema()

In [0]:
# Persist the silver frame as a Delta table partitioned by Year and Month.
# mode("overwrite") replaces any existing data at the path, and overwriteSchema
# lets the stored schema be replaced as well. `silver_delta_path` is not defined
# in this notebook; presumably it comes from the %run'd setup notebook — verify.
try:
    (
        df_silver.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("Year", "Month")
        .save(silver_delta_path)
    )
except Exception as e:
    # Log the failing path, then re-raise: swallowing the error would let the
    # notebook (or a scheduled job) finish "successfully" without a silver table.
    print(f"Error writing to {silver_delta_path}: {e}")
    raise
    