In [1]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\ranjan\Desktop\spark-olist-pipeline\venv\Scripts\python.exe"
os.environ["spark.python.worker.reuse"] = "true"

In [2]:
import sys
sys.path.append(os.path.abspath("../src")) 
from bronze import create_spark_session, ingest_csv

spark = create_spark_session("OlistPipeline")

In [3]:
# Check AQE
print("Adaptive Query Execution Enabled:", spark.conf.get("spark.sql.adaptive.enabled"))

Adaptive Query Execution Enabled: true


In [4]:
# Define paths
base_input = "../data/"
base_output = "../../delta/bronze/"

In [5]:

# Bronze ingestion for all Olist tables
datasets = {
    "customers": "olist_customers_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "order_items": "olist_order_items_dataset.csv",
    "order_payments": "olist_order_payments_dataset.csv",
    "order_reviews": "olist_order_reviews_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "category_translation": "product_category_name_translation.csv"
}


In [6]:
for input_file, output_path, partition_col in datasets:
    print(f"[INFO] Ingesting {input_file} → {output_path}")

    if "orders" in input_file:  # partition only orders
        ingest_csv(spark, input_file, output_path, partition_col="order_purchase_month", target_file_rows=50000)
    else:
        ingest_csv(spark, input_file, output_path, target_file_rows=50000)

    df = spark.read.format("delta").load(output_path)
    print(f"[INFO] Showing table for {output_path}:")
    df.show(5, truncate=False)


[INFO] Ingesting ../data/olist_orders_dataset.csv → delta/bronze/orders
[INFO] Added derived partition column 'order_purchase_month'
[INFO] Ingested ../data/olist_orders_dataset.csv → delta/bronze/orders
[INFO] Files written under delta/bronze/orders:
   part-00000-10720bbd-fd53-4f96-b75a-7681dff75a41.c000.snappy.parquet → 0.00 MB
   part-00001-95bf9905-d999-46e9-a601-1570c128bf7e.c000.snappy.parquet → 0.00 MB
   part-00000-ce559434-a51c-420f-a79e-92376c360971.c000.snappy.parquet → 0.02 MB
   part-00001-6f0e0fbf-3804-432d-a2b2-1e1721614728.c000.snappy.parquet → 0.02 MB
   part-00000-567c5c4a-9888-4e36-b428-8c5b2ec48e24.c000.snappy.parquet → 0.00 MB
   part-00000-f3219322-9cf1-40a3-ab71-f91751d92434.c000.snappy.parquet → 0.04 MB
   part-00001-25101aac-a7ba-4d5d-931d-0d19b3aab8e1.c000.snappy.parquet → 0.04 MB
   part-00000-22569943-53bf-448d-be65-964682fd0f15.c000.snappy.parquet → 0.09 MB
   part-00001-4ada27c2-e368-4fb0-8611-38115fc989f3.c000.snappy.parquet → 0.09 MB
   part-00000-0c5ae

In [7]:
spark.stop()
print("Bronze layer ingestion completed successfully!")


Bronze layer ingestion completed successfully!
