In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# -------------------------------
# Initialize Spark Session
# -------------------------------
# getOrCreate() returns the session that is already active when there is one
# and only builds a new one (labelled with the appName below) otherwise.
spark = (
    SparkSession.builder
    .appName("SuperstoreSalesPipeline")
    .getOrCreate()
)

In [0]:
# -------------------------------
# Step 1: Data Ingestion
# -------------------------------
# Location of the raw CSV export that feeds the rest of the notebook.
input_path = "/Volumes/workspace/data/data/Sample - Superstore.csv"

# header=True      -> column names are taken from the first line of the file
# inferSchema=True -> Spark derives each column's type from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(input_path)
)

In [0]:
# -------------------------------
# Step 2: Data Transformation
# -------------------------------
# Derive calendar parts from the order date so later steps can group or
# partition by them. withColumn replaces a column of the same name, so
# re-running this cell does not duplicate "Year" / "Month".
# NOTE(review): year()/month() need a date-compatible input. With inferSchema
# the type of "Order Date" depends on how dates are formatted in the file;
# confirm it is not read as a plain string in a non-ISO format, which would
# produce null Year/Month values.
order_date = col("Order Date")
df = (
    df
    .withColumn("Year", year(order_date))
    .withColumn("Month", month(order_date))
)

# -------------------------------
# Step 3: Store as Parquet
# -------------------------------
# Write into a dedicated subdirectory. The earlier target,
# "/Volumes/workspace/data/data", is the directory that contains the source CSV
# (see input_path). mode("overwrite") replaces whatever already exists at the
# target path, so writing there put the input file at risk on every run and
# could break a re-run of the notebook.
flat_path = "/Volumes/workspace/data/data/sales_flat_parquet"

(
    df.write
    .mode("overwrite")   # replace the previous run's Parquet output only
    .format("parquet")
    .save(flat_path)
)

# -------------------------------
# Step 4: Create Delta Table (Flat)
# -------------------------------
# Registers an empty Delta table (only if it does not exist yet) with an
# explicit schema.
#
# The table gets its own prefix inside the bucket. It previously pointed at the
# bucket root ('s3a://databricks-bucket-69/'), which also contains the path of
# the partitioned table ('s3a://databricks-bucket-69/sales_partitioned_v3'), so
# the two table locations overlapped.
#
# NOTE(review): because of IF NOT EXISTS, a table already created at the old
# location is left untouched by this statement; drop/recreate it deliberately
# if it needs to move.
# NOTE(review): nothing in the visible cells loads `df` into this table, and the
# DataFrame's column names come from the CSV header (e.g. "Order Date") while
# the DDL uses names without spaces (e.g. OrderDate) - a rename/cast step is
# presumably needed before inserting; confirm.
spark.sql("""
    CREATE TABLE IF NOT EXISTS workspace.data.sales_flat (
        RowID INT,
        OrderID STRING,
        OrderDate DATE,
        ShipDate DATE,
        ShipMode STRING,
        CustomerID STRING,
        CustomerName STRING,
        Segment STRING,
        Country STRING,
        City STRING,
        State STRING,
        PostalCode STRING,
        Region STRING,
        ProductID STRING,
        Category STRING,
        SubCategory STRING,
        ProductName STRING,
        Sales DOUBLE,
        Quantity INT,
        Discount DOUBLE,
        Profit DOUBLE,
        Year INT,
        Month INT
    )
    USING DELTA
    LOCATION 's3a://databricks-bucket-69/sales_flat'
""")

In [0]:
# -------------------------------
# Step 5: Create Partitioned Table
# -------------------------------
# DDL for a Delta table partitioned by (Year, Month) and stored at an explicit
# path. The statement is a no-op when the table already exists.
# Its column list (SaleID, Product, Amount, Year, Month) is separate from the
# flat table's schema; no visible cell writes data into it.
partitioned_table_ddl = """
    CREATE TABLE IF NOT EXISTS workspace.data.sales_partitioned_v3 (
        SaleID INT,
        Product STRING,
        Amount DOUBLE,
        Year INT,
        Month INT
    )
    USING DELTA
    OPTIONS (
      path 's3a://databricks-bucket-69/sales_partitioned_v3'
    )
    PARTITIONED BY (Year, Month)
"""

spark.sql(partitioned_table_ddl)