In [0]:
# Session-level Spark SQL tuning for this notebook.
# The AQE toggle below is commented out, so this cell leaves
# spark.sql.adaptive.enabled at whatever the cluster already uses;
# uncomment it to force AQE on explicitly.
# spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "200",  # tune as needed
)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, date_format, col

# Read the raw orders CSV (adjust path). The header row supplies column names
# and inferSchema asks Spark to infer the column types from the data.
orders_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/FileStore/olist/olist_orders_dataset.csv")
)

# Derive purchase_date from the purchase timestamp, plus a yyyy-MM month key
# that is used as the partition column for the bronze table.
orders = (
    orders_raw
    .withColumn("purchase_date", to_date(col("order_purchase_timestamp")))
    .withColumn("purchase_yyyy_mm", date_format(col("purchase_date"), "yyyy-MM"))
)

# Write as Delta to bronze (maxRecordsPerFile caps rows per output file).
# save() is the action that actually runs the write: a chain that stops at
# partitionBy() only builds a DataFrameWriter and persists nothing.
(
    orders.write
    .format("delta")
    .mode("overwrite")
    .option("maxRecordsPerFile", 500000)
    .partitionBy("purchase_yyyy_mm")
    # NOTE(review): target path assumed from the /mnt/delta/bronze/customers
    # layout read elsewhere in this notebook -- confirm before running.
    .save("/mnt/delta/bronze/orders")
)

In [0]:
from pyspark.sql.functions import trim, lower, initcap

# Load bronze customers, then normalize the free-text location columns:
# lower-case, strip surrounding whitespace, and capitalize each word.
customers = spark.read.format("delta").load("/mnt/delta/bronze/customers")
for text_col in ("customer_city", "customer_state"):
    # NOTE(review): if customer_state holds two-letter codes (e.g. "SP"),
    # initcap turns them into "Sp" -- confirm this is the intended format.
    customers = customers.withColumn(text_col, initcap(trim(lower(col(text_col)))))


In [0]:
# Bare expression as the cell's last line: the notebook renders the
# customers DataFrame object itself as this cell's output.
customers

In [0]:
# Render the normalized customers DataFrame for visual inspection.
# display() is not defined in the visible cells -- presumably the notebook
# environment's built-in rich-display helper; confirm it exists on the target runtime.
display(customers)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Rank each customer's rows, newest creation_date first.
# NOTE(review): rows that share the same creation_date within a customer_id
# have no tie-breaker here, so which one gets rn == 1 is not deterministic --
# confirm whether a secondary sort key is needed.
w = Window.partitionBy("customer_id").orderBy(desc("creation_date"))
ranked_customers = customers.withColumn("rn", row_number().over(w))

# Keep only the top-ranked row per customer_id and drop the helper column.
dedup_customers = ranked_customers.filter(col("rn") == 1).drop("rn")

# Persist the de-duplicated customers to the silver layer, replacing prior contents.
(
    dedup_customers.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/delta/silver/customers")
)
