In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["spark.python.worker.reuse"] = "true"

In [2]:
import sys
sys.path.append(os.path.abspath("../src")) 
from bronze import create_spark_session, ingest_csv
from silver import curate_sales

spark = create_spark_session("OlistPipeline")

In [3]:
# Check AQE
print("Adaptive Query Execution Enabled:", spark.conf.get("spark.sql.adaptive.enabled"))

Adaptive Query Execution Enabled: true


In [4]:
# Define paths
base_input = "../data/"
bronze_output = "../delta/bronze/"

In [5]:

# Bronze ingestion for all Olist tables
datasets = {
    "customers": "olist_customers_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "order_items": "olist_order_items_dataset.csv",
    "order_payments": "olist_order_payments_dataset.csv",
    "order_reviews": "olist_order_reviews_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "category_translation": "product_category_name_translation.csv"
}


In [6]:
for name, filename in datasets.items():
    input_path = f"{base_input}{filename}"
    output_path = f"{bronze_output}{name}"

    print(f"\n[INFO] Ingesting {name} → {output_path}")
    
    if name == "orders":  # partition only orders
        ingest_csv(spark, input_path, output_path, partition_col="order_purchase_month", target_file_rows=50000)
    else:
        ingest_csv(spark, input_path, output_path, target_file_rows=50000)

    # Read back from Delta & show
    df = spark.read.format("delta").load(output_path)
    print(f"[INFO] Showing {name} table (first 5 rows):")
    df.show(5, truncate=False)




[INFO] Ingesting customers → ../delta/bronze/customers
[INFO] Ingested ../data/olist_customers_dataset.csv → ../delta/bronze/customers
[INFO] Files written under ../delta/bronze/customers:
   part-00000-02c6ad1c-aea8-4d34-a24e-827acddad19b-c000.snappy.parquet → 3.36 MB
   part-00001-b2b5979f-2a0d-4642-b280-79bee37d5a3e-c000.snappy.parquet → 3.35 MB
[INFO] Showing customers table (first 5 rows):
+--------------------------------+--------------------------------+------------------------+-------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city|customer_state|
+--------------------------------+--------------------------------+------------------------+-------------+--------------+
|15357da0aa538a1e5efabe63f0d71095|57f66dc16e5edbe9417e1e21c783b5a9|13010                   |campinas     |SP            |
|47aa66905ec3df607310af4988d34bca|58aaf012cc88d1de4b976f45e059ddf5|4844                    |sao paulo    |SP     

In [7]:
silver_output = "../delta/silver/"

In [11]:
# Curate Silver Layer
curate_sales(spark, bronze_base=bronze_output, silver_base=silver_output)

Py4JJavaError: An error occurred while calling o277.load.
: java.lang.IllegalStateException: No active or default Spark session found
	at org.apache.spark.sql.SparkSession$.$anonfun$active$2(SparkSession.scala:1202)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession$.$anonfun$active$1(SparkSession.scala:1202)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession$.active(SparkSession.scala:1201)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.getTable(DeltaDataSource.scala:69)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:92)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:140)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:210)
	at scala.Option.flatMap(Option.scala:271)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
	at sun.reflect.GeneratedMethodAccessor164.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Verify Silver
print("\n[INFO] Silver Layer Sales Table:")
sales_df = spark.read.format("delta").load(f"{silver_output}/sales")
sales_df.show(5, truncate=False)

In [None]:
spark.stop()
print("Bronze layer ingestion completed successfully!")
