In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["spark.python.worker.reuse"] = "true"

In [2]:
import sys
import time
from pyspark.sql.functions import broadcast
sys.path.append(os.path.abspath("../src")) 


In [3]:
from bronze import create_spark_session, ingest_csv
from silver import curate_sales
from gold import create_daily_sales_state
spark = create_spark_session("OlistPipeline")

In [4]:
# Check AQE
print("Adaptive Query Execution Enabled:", spark.conf.get("spark.sql.adaptive.enabled"))

Adaptive Query Execution Enabled: true


In [5]:
# Define paths
base_input = "../data/"
bronze_output = "../delta/bronze/"

In [6]:

# Bronze ingestion for all Olist tables
datasets = {
    "customers": "olist_customers_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "order_items": "olist_order_items_dataset.csv",
    "order_payments": "olist_order_payments_dataset.csv",
    "order_reviews": "olist_order_reviews_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "category_translation": "product_category_name_translation.csv"
}


In [7]:
for name, filename in datasets.items():
    input_path = f"{base_input}{filename}"
    output_path = f"{bronze_output}{name}"

    print(f"\n[INFO] Ingesting {name} → {output_path}")
    
    if name == "orders":  # partition only orders
        ingest_csv(spark, input_path, output_path, partition_col="order_purchase_month", target_file_rows=50000)
    else:
        ingest_csv(spark, input_path, output_path, target_file_rows=50000)

    # Read back from Delta & show
    df = spark.read.format("delta").load(output_path)
    print(f"[INFO] Showing {name} table (first 5 rows):")
    df.show(5, truncate=False)



[INFO] Ingesting customers → ../delta/bronze/customers
[INFO] Ingested ../data/olist_customers_dataset.csv → ../delta/bronze/customers
[INFO] Files written under ../delta/bronze/customers:
   part-00000-d12838bd-6ece-428b-a2d9-b0bf657bb83c-c000.snappy.parquet → 3.36 MB
   part-00001-74bd0934-7728-4d9c-834f-279c7051601f-c000.snappy.parquet → 3.35 MB
[INFO] Showing customers table (first 5 rows):
+--------------------------------+--------------------------------+------------------------+--------------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city       |customer_state|
+--------------------------------+--------------------------------+------------------------+--------------------+--------------+
|6ae91e04653dc3a0c205363ffff0081f|1dde89b524bd2ce5480df99d22539136|28911                   |cabo frio           |RJ            |
|ff84d0fa669f79bdc394cadf5d651cf2|4fd1b2e7c09c60c05762f31151d464f8|13236             

In [8]:
print("Bronze layer  ingestion completed successfully!")

Bronze layer  ingestion completed successfully!


In [9]:
silver_output = "../delta/silver/"


In [10]:
# Curate Silver Layer
curate_sales(spark, bronze_base=bronze_output, silver_base=silver_output)

 Silver layer sales curated and saved at ../delta/silver//sales


In [11]:
# Verify Silver
print("\n[INFO] Silver Layer Sales Table:")
sales_df = spark.read.format("delta").load(f"{silver_output}/sales")
sales_df.show(5, truncate=False)


[INFO] Silver Layer Sales Table:
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+-------------+-------------------+-----+-------------+-----------+------------------+------------+--------------------+-------------+--------------------------------+------------------------+-------------------+--------------+----------------------+--------------+------------+----------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------------+
|product_id                      |seller_id                       |customer_id                     |order_id                        |order_status|order_purchase_timestamp|order_approved_at  |order_d

In [12]:
sales_df.select("seller_id", "seller_salt").distinct().show(10)

+--------------------+-----------+
|           seller_id|seller_salt|
+--------------------+-----------+
|0ea22c1cfbdc755f8...|          2|
|b0b346d3a89f5eb4c...|          0|
|41b39e28db005d973...|          1|
|0b90b6df587eb8360...|          4|
|a5ff20ff766e7f50b...|          0|
|1430239a858e7682b...|          2|
|de722cd6dad950a92...|          1|
|1f9ab4708f3056ede...|          2|
|7142540dd4c91e223...|          1|
|0747d5bb69f0586cc...|          0|
+--------------------+-----------+
only showing top 10 rows



In [13]:
# Unique city/state samples for customers & sellers
silver_path = "../delta/silver/"
customers_df = spark.read.format("delta").load(silver_path + "customers")
sellers_df = spark.read.format("delta").load(silver_path + "sellers")

print("Customer city samples:", [row.customer_city for row in customers_df.select("customer_city").distinct().limit(5).collect()])
print("Customer state samples:", [row.customer_state for row in customers_df.select("customer_state").distinct().limit(5).collect()])

print("Seller city samples:", [row.seller_city for row in sellers_df.select("seller_city").distinct().limit(5).collect()])
print("Seller state samples:", [row.seller_state for row in sellers_df.select("seller_state").distinct().limit(5).collect()])

Customer city samples: ['igrejinha', 'aguas de sao pedro', 'camacari', 'arapiraca', 'pote']
Customer state samples: ['pi', 'pr', 'rj', 'pb', 'ro']
Seller city samples: ['igrejinha', 'brusque', 'buritama', 'sao joao de meriti', 'garca']
Seller state samples: ['pi', 'pr', 'rj', 'pb', 'ro']


In [14]:
print("Duplicate customers:", customers_df.count() - customers_df.dropDuplicates(["customer_id"]).count())
print("Duplicate sellers:", sellers_df.count() - sellers_df.dropDuplicates(["seller_id"]).count())

Duplicate customers: 0
Duplicate sellers: 0


In [15]:
sales_df = spark.read.format("delta").load(silver_path + "sales")

print("Sample seller_id_salted:")
sales_df.select("seller_salt").distinct().show(10, truncate=False)

Sample seller_id_salted:
+-----------+
|seller_salt|
+-----------+
|1          |
|3          |
|4          |
|2          |
|0          |
+-----------+



In [16]:
gold_base = "../delta/gold/"

In [17]:
# Run Gold marts
create_daily_sales_state(spark, silver_path, gold_base)

Gold table created at ../delta/gold//daily_sales_state (customer + seller state revenue)


In [18]:
# Verify Gold table
df_gold = spark.read.format("delta").load(f"{gold_base}/daily_sales_state")
print("Sample Gold table:")
df_gold.show(5, truncate=False)


Sample Gold table:
+----------+--------------+------------+-----------------+
|order_date|customer_state|seller_state|daily_revenue    |
+----------+--------------+------------+-----------------+
|2017-01-23|go            |pr          |374.28           |
|2017-01-23|mg            |mg          |146.93           |
|2017-01-23|mg            |sp          |1151.58          |
|2017-01-23|mt            |sp          |343.88           |
|2017-01-23|pa            |mg          |89.47999999999999|
+----------+--------------+------------+-----------------+
only showing top 5 rows



In [19]:
# Load sales table
sales = spark.read.format("delta").load(f"{silver_path}/sales")

In [20]:
import time
# AQE OFF
spark.conf.set("spark.sql.adaptive.enabled", "false")
start = time.time()
sales.groupBy("customer_state").sum("price").show()
print("AQE OFF runtime:", time.time() - start)

# AQE ON
spark.conf.set("spark.sql.adaptive.enabled", "true")
start = time.time()
sales.groupBy("customer_state").sum("price").show()
print("AQE ON runtime:", time.time() - start)

+--------------+------------------+
|customer_state|        sum(price)|
+--------------+------------------+
|            pi| 90156.87999999996|
|            pr| 685703.6699999984|
|            rj|1849589.9499999874|
|            pb|120634.06000000007|
|            ro| 45176.64999999999|
|            ba| 526261.0800000001|
|          NULL|439617.68000000075|
|            ms|116679.15999999999|
|            mg|1592170.1999999825|
|            go| 301968.0600000005|
|            sc| 516302.6900000002|
|            es|273704.54000000056|
|            rs| 759279.2699999973|
|            rn| 92823.27000000002|
|            pe| 263488.0700000005|
|            ce|235778.63000000032|
|            ap|           13454.3|
|            sp|  5270066.25000016|
|            al| 80083.78999999996|
|            ac|          16146.34|
+--------------+------------------+
only showing top 20 rows

AQE OFF runtime: 3.533615827560425
+--------------+------------------+
|customer_state|        sum(price)|
+--

In [22]:
# Shuffle Join 
start_time = time.time()
df1 = sales.join(customers_df, "customer_id")
df1.count()
duration_shuffle = time.time() - start_time
print(f"Shuffle Join took: {duration_shuffle:.2f} seconds")


Shuffle Join took: 1.79 seconds


In [24]:
# 2. Broadcast Join
start_time = time.time()
df2 = sales.join(broadcast(customers_df), "customer_id")
df2.count()
duration_broadcast = time.time() - start_time
print(f"Broadcast Join took: {duration_broadcast:.2f} seconds")

Broadcast Join took: 1.74 seconds


In [None]:
df = sales.join(customers_df, "customer_id")
df.explain("formatted")  # Detailed plan


In [None]:
spark.stop()