In [1]:
import os
# Point PySpark's driver and worker processes at the interpreter that is running
# this kernel. Using sys.executable instead of a hard-coded absolute path keeps the
# notebook runnable on any machine / virtual environment.
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
# NOTE(review): "spark.python.worker.reuse" is a Spark configuration key, not an
# environment variable; setting it here presumably has no effect on Spark. Kept for
# backward compatibility -- confirm, and move it into the session config if needed.
os.environ["spark.python.worker.reuse"] = "true"

In [2]:
import sys
# Make the project's ``src`` directory importable. The path is resolved against the
# current working directory, so the notebook has to be run from its own folder.
src_dir = os.path.abspath("../src")
if src_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(src_dir)
from bronze import create_spark_session, ingest_csv

# Build the Spark session for this notebook through the project's bronze helper
app_name = "OlistPipeline"
spark = create_spark_session(app_name)

In [3]:
# Report whether Adaptive Query Execution is switched on for this session
aqe_enabled = spark.conf.get("spark.sql.adaptive.enabled")
print("Adaptive Query Execution Enabled:", aqe_enabled)

Adaptive Query Execution Enabled: true


In [4]:
# Locations used by the ingestion loop: raw CSV folder and Delta bronze root.
# Both are relative paths and keep their trailing slash because file/table names
# are appended to them by plain string concatenation.
base_input, base_output = "../data/", "../../delta/bronze/"

In [5]:

# Registry of Olist source files for the bronze layer.
# Each pair is (bronze table name, CSV file name); the order here is the order in
# which the tables are ingested.
datasets = dict(
    (
        ("customers", "olist_customers_dataset.csv"),
        ("orders", "olist_orders_dataset.csv"),
        ("order_items", "olist_order_items_dataset.csv"),
        ("order_payments", "olist_order_payments_dataset.csv"),
        ("order_reviews", "olist_order_reviews_dataset.csv"),
        ("products", "olist_products_dataset.csv"),
        ("sellers", "olist_sellers_dataset.csv"),
        ("geolocation", "olist_geolocation_dataset.csv"),
        ("category_translation", "product_category_name_translation.csv"),
    )
)


In [6]:
# Walk through every registered dataset: ingest its CSV into the bronze location,
# then read the written Delta table back and preview it as a quick visual check.
for name, filename in datasets.items():
    input_path = base_input + filename
    output_path = base_output + name

    print(f"\n[INFO] Ingesting {name} → {output_path}")

    # Only the orders table is ingested with a partition column; every other
    # table is ingested without one.
    ingest_options = {"target_file_rows": 50000}
    if name == "orders":
        ingest_options["partition_col"] = "order_purchase_month"
    ingest_csv(spark, input_path, output_path, **ingest_options)

    # Load what was just written and show the first rows untruncated
    df = spark.read.format("delta").load(output_path)
    print(f"[INFO] Showing {name} table (first 5 rows):")
    df.show(5, truncate=False)




[INFO] Ingesting customers → ../../delta/bronze/customers
[INFO] Ingested ../data/olist_customers_dataset.csv → ../../delta/bronze/customers
[INFO] Files written under ../../delta/bronze/customers:
   part-00000-51912f9c-60ca-4c56-b232-eddd4cb66913-c000.snappy.parquet → 3.36 MB
   part-00001-9273b344-85cb-4d51-adca-f72628c5295f-c000.snappy.parquet → 3.35 MB
[INFO] Showing customers table (first 5 rows):
+--------------------------------+--------------------------------+------------------------+--------------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city       |customer_state|
+--------------------------------+--------------------------------+------------------------+--------------------+--------------+
|6ae91e04653dc3a0c205363ffff0081f|1dde89b524bd2ce5480df99d22539136|28911                   |cabo frio           |RJ            |
|ff84d0fa669f79bdc394cadf5d651cf2|4fd1b2e7c09c60c05762f31151d464f8|13236    

In [7]:
# Shut down the Spark session now that the ingestion loop has finished,
# then confirm completion to the reader.
spark.stop()
completion_message = "Bronze layer ingestion completed successfully!"
print(completion_message)


Bronze layer ingestion completed successfully!
