Setup Spark with Delta Lake

In [1]:
from pathlib import Path
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Session builder for the bronze ingestion job, with the Delta Lake SQL
# extension and the Delta catalog registered as the default Spark catalog.
builder = (
    SparkSession.builder
    .appName("Bronze Layer Ingestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the delta helper finish configuring the builder, then start the session
# (getOrCreate returns the already-running session if there is one).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

Define Input & Output Paths

In [2]:
# Source and destination locations, relative to the notebook's working directory:
#   input_path       - folder holding the raw CSV datasets
#   output_base_path - root folder under which one Delta table per dataset is written
input_path = Path("..") / "data" / "raw_csvs"
output_base_path = Path("..") / "delta" / "bronze"

Load & Save Each CSV as a Delta Table

In [None]:
# Ingest every CSV under input_path into its own Delta table under
# output_base_path, using the file name (without extension) as the table name.
#
# Path.glob yields entries in filesystem-dependent order, so the list is sorted
# to keep the ingestion order and the progress log reproducible between runs.
csv_files = sorted(input_path.glob("*.csv"))

try:
    # Fail loudly instead of reporting success when nothing matched; with a
    # relative input_path this usually means the working directory is wrong.
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV files found in {input_path.resolve()}")

    for csv_file in csv_files:
        table_name = csv_file.stem  # Use the file name without extension
        print(f"Ingesting {table_name}...")

        # First row is the header; column types are inferred from the data.
        df = (
            spark.read
            .option("header", "true")
            .option("inferSchema", "true")
            .csv(str(csv_file))
        )

        # Save DataFrame as a Delta table; "overwrite" replaces whatever a
        # previous run wrote at the same location, so the cell can be re-run.
        df.write.format("delta").mode("overwrite").save(
            str(output_base_path / table_name))

    print("✅ All files ingested into Delta tables.")
finally:
    # Always release the Spark session, even if a read or write failed above.
    spark.stop()
    print("Spark session closed!")

Ingesting olist_customers_dataset...
Ingesting olist_orders_dataset...
Ingesting olist_order_items_dataset...
Ingesting olist_order_payments_dataset...
Ingesting olist_order_reviews_dataset...
Ingesting olist_products_dataset...
Ingesting olist_sellers_dataset...
✅ All files ingested into Delta tables.
