In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Session builder with the Delta Lake SQL extension and Delta catalog registered.
builder = (
    SparkSession.builder
    .appName("Olist Analytics")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


:: loading settings :: url = jar:file:/home/saraballkoci/miniconda3/envs/data_pipeline/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/saraballkoci/.ivy2.5.2/cache
The jars for the packages stored in: /home/saraballkoci/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-94827cc5-eab4-4eee-8db4-d9d5be72068e;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 401ms :: artifacts dl 18ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            mo

In [2]:
from pyspark.sql.functions import sum, to_timestamp, year, month, dayofmonth
from pyspark.sql.window import Window


# Silver-layer enriched orders; the cells below derive the gold tables from this frame.
silver_df = spark.read.format("delta").load("../delta/silver/orders_enriched")

# Per-customer running frame: every row from the first purchase up to the current one,
# ordered by purchase timestamp.
window_spec = (
    Window.partitionBy("customer_id")
    .orderBy("order_purchase_timestamp")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Running total of total_price over the frame above.
running_total = sum("total_price").over(window_spec)

# Attach the running total plus the calendar columns later used as partition keys.
cumulative_sales = (
    silver_df
    .withColumn("cumulative_total", running_total)
    .withColumn("year", year("order_purchase_timestamp"))
    .withColumn("month", month("order_purchase_timestamp"))
    .withColumn("day", dayofmonth("order_purchase_timestamp"))
)


In [3]:
from pyspark.sql.functions import sum,avg, to_timestamp, year, month, dayofmonth
from pyspark.sql.window import Window

# Three-row trailing frame per product category: the two preceding rows plus the
# current one, ordered by purchase timestamp.
rolling_window = (
    Window.partitionBy("product_category_name")
    .orderBy("order_purchase_timestamp")
    .rowsBetween(-2, Window.currentRow)
)

# Mean delivery_time over the trailing frame.
trailing_avg_delivery = avg("delivery_time").over(rolling_window)

# Attach the rolling average plus the calendar columns later used as partition keys.
rolling_avg_delivery = (
    silver_df
    .withColumn("rolling_avg_delivery_time", trailing_avg_delivery)
    .withColumn("year", year("order_purchase_timestamp"))
    .withColumn("month", month("order_purchase_timestamp"))
    .withColumn("day", dayofmonth("order_purchase_timestamp"))
)


In [8]:
# avg and count are imported explicitly in this cell: no other visible cell imports
# count, and avg was only available if an earlier cell had already run, so on a fresh
# kernel (Restart & Run All) the aggregation below would fail with NameError.
from pyspark.sql.functions import year, month, dayofmonth, sum, avg, count

# First, add year/month/day BEFORE aggregating so they can be used as grouping keys.
# withColumn replaces a same-named column, so re-running this cell gives the same frame.
silver_df = (
    silver_df
    .withColumn("year", year("order_purchase_timestamp"))
    .withColumn("month", month("order_purchase_timestamp"))
    .withColumn("day", dayofmonth("order_purchase_timestamp"))
)

# Then aggregate the KPIs per category / seller / customer state / purchase day.
kpi_summary = silver_df.groupBy(
    "product_category_name", "seller_id", "customer_state", "year", "month", "day"
).agg(
    sum("total_price").alias("total_sales"),
    avg("delivery_time").alias("avg_delivery_time"),
    # count("order_id") counts rows with a non-null order_id, not distinct orders.
    # NOTE(review): if the silver table holds several rows per order, this over-counts
    # orders -- confirm the table grain before relying on order_count.
    count("order_id").alias("order_count"),
)


In [9]:
from pyspark.sql.functions import sum,avg, to_timestamp, year, month, dayofmonth



# Persist each gold table as Delta, replacing any previous run and partitioning
# by the year/month/day columns derived from the purchase timestamp.
for gold_df, gold_path in (
    (cumulative_sales, "../delta/gold/cumulative_sales"),
    (rolling_avg_delivery, "../delta/gold/rolling_avg_delivery"),
    (kpi_summary, "../delta/gold/kpi_summary"),
):
    (
        gold_df.write.format("delta")
        .mode("overwrite")
        .partitionBy("year", "month", "day")
        .save(gold_path)
    )


                                                                                

In [10]:

# Re-load the gold tables from disk so the queries below run against the
# persisted Delta data rather than the in-memory lineage.
cumulative_sales, rolling_avg_delivery, kpi_summary = [
    spark.read.format("delta").load(gold_path)
    for gold_path in (
        "../delta/gold/cumulative_sales",
        "../delta/gold/rolling_avg_delivery",
        "../delta/gold/kpi_summary",
    )
]


In [11]:

# Expose each gold DataFrame to Spark SQL under a view named after the table.
for view_name, gold_df in (
    ("cumulative_sales", cumulative_sales),
    ("rolling_avg_delivery", rolling_avg_delivery),
    ("kpi_summary", kpi_summary),
):
    gold_df.createOrReplaceTempView(view_name)


In [12]:
# Total sales per product category, highest first.
sales_by_category = spark.sql("""
    SELECT product_category_name, ROUND(SUM(total_sales), 2) AS total_sales
    FROM kpi_summary
    GROUP BY product_category_name
    ORDER BY total_sales DESC
""")
sales_by_category.show(truncate=False)

# Delivery time per seller, fastest first.
# NOTE(review): this averages the per-group averages stored in kpi_summary, so every
# (category, state, day) group weighs the same regardless of how many rows it covers.
delivery_by_seller = spark.sql("""
    SELECT seller_id, ROUND(AVG(avg_delivery_time), 2) AS avg_delivery_time
    FROM kpi_summary
    GROUP BY seller_id
    ORDER BY avg_delivery_time ASC
""")
delivery_by_seller.show(truncate=False)

# Order counts per customer state, largest first.
orders_by_state = spark.sql("""
    SELECT customer_state, SUM(order_count) AS total_orders
    FROM kpi_summary
    GROUP BY customer_state
    ORDER BY total_orders DESC
""")
orders_by_state.show(truncate=False)


                                                                                

+---------------------------------+-----------+
|product_category_name            |total_sales|
+---------------------------------+-----------+
|relogios_presentes               |171906.7   |
|beleza_saude                     |146353.44  |
|cama_mesa_banho                  |116902.36  |
|utilidades_domesticas            |102508.33  |
|automotivo                       |87831.31   |
|esporte_lazer                    |85452.98   |
|moveis_decoracao                 |79336.71   |
|informatica_acessorios           |75848.5    |
|ferramentas_jardim               |48234.9    |
|bebes                            |47162.35   |
|telefonia                        |42180.17   |
|brinquedos                       |41479.78   |
|cool_stuff                       |40750.48   |
|moveis_escritorio                |37010.85   |
|perfumaria                       |36607.99   |
|pet_shop                         |33189.95   |
|construcao_ferramentas_construcao|30726.47   |
|eletronicos                      |22457

                                                                                

+--------------------------------+-----------------+
|seller_id                       |avg_delivery_time|
+--------------------------------+-----------------+
|734def04b237117a09321dd6d8f3f2a2|1.0              |
|cf654707ed6a99f112c7ec3d6f314b8b|1.0              |
|655220df33262c7e0c4949a147366f94|1.0              |
|ae45df84722b1d15c5f32b23a095746c|1.0              |
|b2a6d334e2833acea353624840e25a0e|1.0              |
|6561d6bf844e464b4019442692b40e02|1.0              |
|30a81d8cf85fb2ada1b1b094c9583a95|1.5              |
|0ddefe3c7a032b91f4e25b9c3a08fca1|2.0              |
|70126eecc6aa1274392a1743866e9678|2.0              |
|099095b050cfe8eb1ddff5317587e96e|2.0              |
|e5cbe890e679490127e9a390b46bbd20|2.0              |
|a06c8ff043abea0528ee44171e2140a2|2.0              |
|5f1dc28029d2c244352a68107ec2b542|2.0              |
|c89cf7c468a48af70aada384e722f9e2|2.0              |
|e88165a185134e13fdfc85d4fa654db8|2.0              |
|d9e8c084b68fe958861d8f2c21202e6b|2.0         



+--------------+------------+
|customer_state|total_orders|
+--------------+------------+
|SP            |4386        |
|RJ            |1216        |
|MG            |1098        |
|RS            |456         |
|PR            |440         |
|BA            |397         |
|SC            |335         |
|DF            |231         |
|ES            |217         |
|GO            |208         |
|PE            |206         |
|CE            |139         |
|MT            |122         |
|PA            |115         |
|MS            |88          |
|PB            |69          |
|PI            |64          |
|RN            |63          |
|MA            |53          |
|SE            |43          |
+--------------+------------+
only showing top 20 rows


                                                                                