# In [None]:
# Databricks notebook source
# COMMAND ----------
# %md
# # ETL Process for Superstore Data
# This notebook performs an ETL process on Superstore data, including loading, transforming, and saving data.

# COMMAND ----------
#
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import col, concat_ws, year, datediff, when, sum, avg

# Request INFO-level logging and create the logger used by the rest of this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Assume the Spark session is already available as 'spark'

# COMMAND ----------
#
# Full state name -> two-letter abbreviation, one entry per line
state_mapping = {
    "California": "CA",
    "New York": "NY",
    "Texas": "TX",
    "Florida": "FL",
    "Illinois": "IL",
    "Pennsylvania": "PA",
    "Ohio": "OH",
    "Georgia": "GA",
    "North Carolina": "NC",
}

# COMMAND ----------
#
# Superstore ETL: load the regional order extracts, normalise Orders Central,
# union all regions, flag returned order lines, roll sales up per region and
# year, and persist both result sets as Delta tables. Any failure is logged
# with its traceback and re-raised so the notebook run is reported as failed.
try:
    # Load Orders Central
    logger.info("Loading Orders Central data")
    orders_central = spark.read.csv("dbfs:/mnt/data/Orders_Central.csv", header=True, inferSchema=True)
    # Column objects have no fillna(); coalesce substitutes "Central" for null regions.
    orders_central = orders_central.withColumn(
        "Region", F.coalesce(F.col("Region"), F.lit("Central"))
    )

    # COMMAND ----------
    # Fix Dates: build real date columns from the year/month/day parts, drop
    # the parts, and rename Discounts -> Discount and Product -> Product Name.
    logger.info("Fixing dates for Orders Central")
    order_date = F.concat_ws(
        "-", F.col("Order Year"), F.col("Order Month"), F.col("Order Day")
    ).cast("date")
    ship_date = F.concat_ws(
        "-", F.col("Ship Year"), F.col("Ship Month"), F.col("Ship Day")
    ).cast("date")
    orders_central = (
        orders_central
        .withColumn("Order Date", order_date)
        .withColumn("Ship Date", ship_date)
        .drop("Order Year", "Order Month", "Order Day", "Ship Year", "Ship Month", "Ship Day")
        .withColumnRenamed("Discounts", "Discount")
        .withColumnRenamed("Product", "Product Name")
    )

    # COMMAND ----------
    # Remove Nulls
    logger.info("Removing null Order IDs")
    orders_central = orders_central.filter(F.col("Order ID").isNotNull())

    # COMMAND ----------
    # Fix Data Type
    # NOTE(review): Discount is cast to string here but averaged further down
    # (Spark casts implicitly) - confirm string is really the intended type.
    logger.info("Fixing data types for Orders Central")
    orders_central = (
        orders_central
        .withColumn("Discount", F.col("Discount").cast("string"))
        .withColumn("Sales", F.col("Sales").cast("double"))
    )

    # COMMAND ----------
    # Rename States: replace full state names with their abbreviations
    logger.info("Renaming states in Orders Central")
    orders_central = orders_central.replace(state_mapping, subset=["State"])

    # COMMAND ----------
    # Load and Pivot Quotas: unpivot the 2014-2017 columns into (Year, Quota) rows
    # NOTE(review): `quota` is not referenced again in this section - confirm
    # whether a later step uses it or a join against the roll-up is missing.
    logger.info("Loading and pivoting Quota data")
    quota = spark.read.format("com.crealytics.spark.excel").option("header", "true").load("dbfs:/mnt/data/Quota.xlsx")
    quota = quota.selectExpr("Region", "stack(4, '2014', `2014`, '2015', `2015`, '2016', `2016`, '2017', `2017`) as (Year, Quota)")

    # COMMAND ----------
    # Load Orders West
    logger.info("Loading Orders West data")
    orders_west = spark.read.csv("dbfs:/mnt/data/Orders_West.csv", header=True, inferSchema=True)

    # COMMAND ----------
    # Load Orders East
    logger.info("Loading Orders East data")
    orders_east = spark.read.format("com.crealytics.spark.excel").option("header", "true").load("dbfs:/mnt/data/Orders_East.xlsx")

    # COMMAND ----------
    # Load Orders South
    logger.info("Loading Orders South data")
    orders_south = spark.read.csv("dbfs:/mnt/data/orders_south_2015.csv", header=True, inferSchema=True)

    # COMMAND ----------
    # Union Orders
    # union() matches columns by position; Orders Central had its date columns
    # re-created (and therefore appended last), so positions no longer line up
    # with the raw extracts. Match by name instead, null-filling any column
    # that a given extract does not have.
    logger.info("Unioning all order datasets")
    orders_all = orders_central
    for regional_orders in (orders_west, orders_east, orders_south):
        orders_all = orders_all.unionByName(regional_orders, allowMissingColumns=True)

    # COMMAND ----------
    # Load Returns
    logger.info("Loading Returns data")
    returns = spark.read.format("com.crealytics.spark.excel").option("header", "true").load("dbfs:/mnt/data/return reasons_new.xlsx")

    # COMMAND ----------
    # Orders and Returns
    logger.info("Joining Orders and Returns data")
    returned_col = F.col("Return Reason").isNotNull()
    days_to_ship_col = F.datediff(F.col("Ship Date"), F.col("Order Date"))
    discount_col = F.when(F.col("Discount").isNull(), F.lit(0)).otherwise(F.col("Discount"))
    year_of_sale_col = F.year(F.col("Order Date"))

    # Orders must be the left side: every order line is kept and gets a return
    # reason only when one exists. Joining from `returns` would keep only the
    # returned lines and drop every order that was never returned.
    orders_returns = (
        orders_all.join(returns, ["Order ID", "Product ID"], "left_outer")
        .withColumn("Returned", returned_col)
        .withColumn("Days to Ship", days_to_ship_col)
        .withColumn("Discount", discount_col)
        .withColumn("Year of Sale", year_of_sale_col)
    )

    # COMMAND ----------
    # Roll Up Sales per region and year of sale
    # F.sum / F.avg are used explicitly so the aggregation does not depend on
    # the module-level names that shadow Python's builtins.
    logger.info("Aggregating sales data")
    annual_performance = orders_returns.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )

    # COMMAND ----------
    # Save Outputs
    # NOTE(review): both frames carry column names containing spaces; Delta
    # may reject those unless column mapping is enabled on the target tables -
    # confirm against the workspace configuration.
    logger.info("Saving Annual Regional Performance data")
    spark.sql("DROP TABLE IF EXISTS catalog.target_db.Annual_Regional_Performance")
    annual_performance.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.Annual_Regional_Performance")

    logger.info("Saving Superstore Sales data")
    spark.sql("DROP TABLE IF EXISTS catalog.target_db.Superstore_Sales")
    orders_returns.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.Superstore_Sales")

except Exception:
    # Log with traceback, then re-raise: swallowing the error would let a
    # failed run be reported as successful.
    logger.exception("An error occurred during the ETL process")
    raise
