# Databricks notebook source
# COMMAND ----------
# %md
# # ETL Process for Orders and Quota Data
# This notebook performs an ETL process on orders and quota data, including data standardization, cleaning, and aggregation.
#
# Flow:
#   * load the regional orders, quota and returns tables;
#   * standardize and clean the Central orders so they line up with the other regions;
#   * unpivot the quota table to one row per (Region, Year);
#   * union all regional orders, left-join the returns onto them and derive helper columns;
#   * aggregate by Region / Year of Sale and join the quota;
#   * write both the detailed and the aggregated result as Delta tables.
#
# Expects the Databricks-provided `spark` session to be available as a global.

# COMMAND ----------

# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType
# NOTE: `sum` imported below shadows the Python builtin for the rest of the
# notebook; the aggregation step uses the explicit `F.sum` / `F.avg` instead.
from pyspark.sql.functions import lit, concat_ws, to_date, col, regexp_replace, when, year, datediff, split, sum, avg

# COMMAND ----------

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

# Define state abbreviations mapping
state_abbreviations = {
    "California": "CA",
    "New York": "NY",
    # Add other state mappings as needed
}

# COMMAND ----------

# The whole pipeline lives in one try block (and therefore one cell) so that
# any failure is logged with its traceback and then re-raised.
try:
    # Load Orders data from Unity Catalog tables
    central_orders = spark.table("catalog.db.orders_central")
    west_orders = spark.table("catalog.db.orders_west")
    east_orders = spark.table("catalog.db.orders_east")
    south_orders = spark.table("catalog.db.orders_south_2015")

    # Load Quota data
    quota_data = spark.table("catalog.db.quota")

    # Load Returns data
    returns_data = spark.table("catalog.db.return_reasons")

    # Step 1: Data Standardization
    # Central stores dates as separate year/month/day columns and uses
    # different names for the discount and product columns; build real date
    # columns, rename, and drop the now-redundant date parts.
    logger.info("Starting data standardization...")
    central_orders = (
        central_orders
        .withColumn("Region", lit("Central"))
        .withColumn(
            "Order Date",
            to_date(concat_ws("-", col("Order Year"), col("Order Month"), col("Order Day"))),
        )
        .withColumn(
            "Ship Date",
            to_date(concat_ws("-", col("Ship Year"), col("Ship Month"), col("Ship Day"))),
        )
        .withColumnRenamed("Discounts", "Discount")
        .withColumnRenamed("Product", "Product Name")
        .drop("Order Year", "Order Month", "Order Day", "Ship Year", "Ship Month", "Ship Day")
    )

    # Step 2: Remove Nulls
    logger.info("Removing null Order IDs...")
    central_orders = central_orders.filter(col("Order ID").isNotNull())

    # Step 3: Fix Data Types
    # Discount is range-filtered and averaged further down, so it has to be
    # numeric (values that cannot be parsed become NULL and are set to 0 in
    # step 7). Sales is stripped of every character that is not a digit or a
    # dot before the cast.
    logger.info("Fixing data types...")
    central_orders = (
        central_orders
        .withColumn("Discount", col("Discount").cast(DoubleType()))
        .withColumn("Sales", regexp_replace(col("Sales"), "[^0-9.]", "").cast(DoubleType()))
    )

    # Step 4: Rename States
    logger.info("Renaming states...")
    central_orders = central_orders.replace(state_abbreviations, subset=["State"])

    # Step 5: Pivot Quotas
    # Unpivot the per-year columns into (Year, Quota) rows. `stack` emits the
    # year labels as strings, so cast Year to an integer to match the integer
    # produced by year() for "Year of Sale".
    logger.info("Pivoting quota data...")
    quota_data = quota_data.select(
        col("Region"),
        F.expr("stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`)").alias("Year", "Quota"),
    ).withColumn("Year", col("Year").cast(IntegerType()))

    # Step 6: Union Orders
    # Match columns by name: the Central frame has had columns appended,
    # renamed and dropped above, so a positional union could silently put
    # values into the wrong columns.
    logger.info("Unioning orders data...")
    all_orders = (
        central_orders
        .unionByName(west_orders)
        .unionByName(east_orders)
        .unionByName(south_orders)
    )

    # Step 7: Join Orders and Returns
    # Orders are the left side so that orders without a return are kept and
    # "Returned?" can actually be False.
    # NOTE(review): assumes return_reasons has at most one row per
    # (Product ID, Order ID); duplicates would multiply order rows - confirm.
    logger.info("Joining orders and returns data...")
    discount_as_double = col("Discount").cast(DoubleType())
    orders_returns = (
        all_orders.join(returns_data, ["Product ID", "Order ID"], "left_outer")
        .withColumn("Returned?", col("Return Reason").isNotNull())
        .withColumn("Days to Ship", datediff(col("Ship Date"), col("Order Date")))
        # Missing (or non-numeric) discounts count as "no discount".
        .withColumn("Discount", when(discount_as_double.isNull(), lit(0.0)).otherwise(discount_as_double))
        .withColumn("Year of Sale", year(col("Order Date")))
    )

    # Define condition for valid discounts: drop rows whose discount lies in
    # the inclusive range 17-18.
    valid_discount_condition = ~(col("Discount").between(17, 18))
    orders_returns = orders_returns.filter(valid_discount_condition)

    # Step 8: Clean Notes/Approver
    # Notes is split on commas: the first piece becomes "Return Notes", the
    # second "Approver" (NULL when Notes is NULL or has no second piece).
    logger.info("Cleaning notes and approver...")
    notes_parts = split(col("Notes"), ",")
    orders_returns = (
        orders_returns
        .withColumn("Return Notes", notes_parts[0])
        .withColumn("Approver", notes_parts[1])
        .drop("Notes")
    )

    # Step 9: Roll Up Sales
    logger.info("Rolling up sales data...")
    aggregated_sales = orders_returns.groupBy("Region", "Year of Sale").agg(
        F.sum("Profit").alias("Total Profit"),
        F.sum("Sales").alias("Total Sales"),
        F.sum("Quantity").alias("Total Quantity"),
        F.avg("Discount").alias("Average Discount"),
    )

    # Step 10: Join Quota and Orders
    # aggregated_sales has "Year of Sale" while the quota frame has "Year";
    # align the names for the join only, leaving quota_data itself untouched.
    logger.info("Joining quota and aggregated sales data...")
    final_data = aggregated_sales.join(
        quota_data.withColumnRenamed("Year", "Year of Sale"),
        ["Region", "Year of Sale"],
        "inner",
    )

    # Output Handling
    # NOTE(review): several output columns contain spaces (e.g. "Total Sales",
    # "Order Date"); Delta rejects such names unless column mapping is enabled
    # for the table - confirm the workspace/table defaults.
    logger.info("Writing final data to Unity Catalog tables...")
    spark.sql("DROP TABLE IF EXISTS catalog.db.annual_regional_performance")
    final_data.write.format("delta").mode("overwrite").saveAsTable("catalog.db.annual_regional_performance")

    spark.sql("DROP TABLE IF EXISTS catalog.db.superstore_sales")
    orders_returns.write.format("delta").mode("overwrite").saveAsTable("catalog.db.superstore_sales")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Log with traceback, then re-raise so the notebook/job run is marked as
    # failed instead of finishing "successfully" with missing output.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
