In [None]:
import logging
from pyspark.sql import functions as F

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Orders/returns/quota preparation job.
#
# Reads the regional order tables, the returns table and the quota table from
# the genai_demo.citi schema, cleans them, and writes two Delta tables:
#   * genai_demo.citi.superstore_sales             - order lines + return info
#   * genai_demo.citi.annual_regional_performance  - quota vs. yearly roll-up
#
# Relies on a `spark` session being provided by the runtime (it is not created
# here). Any failure is logged with its traceback and re-raised so the
# notebook/job run is marked as failed instead of silently succeeding.
try:
    # Load data from Unity Catalog tables
    orders_central_df = spark.table("genai_demo.citi.orders_central")
    orders_east_df = spark.table("genai_demo.citi.orders_east")
    orders_south_2015_df = spark.table("genai_demo.citi.orders_south_2015")
    orders_south_2016_df = spark.table("genai_demo.citi.orders_south_2016")
    orders_south_2017_df = spark.table("genai_demo.citi.orders_south_2017")
    orders_south_2018_df = spark.table("genai_demo.citi.orders_south_2018")
    orders_west_df = spark.table("genai_demo.citi.orders_west")
    quota_df = spark.table("genai_demo.citi.quota")
    returns_df = spark.table("genai_demo.citi.returns")

    # Fix Dates: Standardize date fields and column names.
    # The year/month/day parts are joined as "Y-M-D" and parsed with the
    # single-letter pattern "yyyy-M-d", which accepts both padded ("01") and
    # unpadded ("1") month/day values. The stricter "yyyy-MM-dd" pattern does
    # not parse unpadded parts under Spark 3's datetime parser.
    orders_central_df = orders_central_df.withColumn("Region", F.lit("Central")) \
        .withColumn("Order Date", F.to_date(F.concat_ws("-", F.col("Order Year"), F.col("Order Month"), F.col("Order Day")), "yyyy-M-d")) \
        .withColumn("Ship Date", F.to_date(F.concat_ws("-", F.col("Ship Year"), F.col("Ship Month"), F.col("Ship Day")), "yyyy-M-d")) \
        .drop("Order Year", "Order Month", "Order Day", "Ship Year", "Ship Month", "Ship Day") \
        .withColumnRenamed("Discounts", "Discount") \
        .withColumnRenamed("Product", "Product Name")

    # Remove Nulls: Exclude rows with null Order ID
    orders_central_df = orders_central_df.filter(F.col("Order ID").isNotNull())

    # Fix Data Type: Ensure correct data types.
    # Sales is stripped of every character other than digits and "." before
    # the cast to double (so currency symbols/prefixes are removed).
    orders_central_df = orders_central_df.withColumn("Discount", F.col("Discount").cast("string")) \
        .withColumn("Sales", F.regexp_replace(F.col("Sales"), "[^0-9.]", "").cast("double"))

    # Rename States: Standardize state names to abbreviations
    # NOTE(review): the mapping lists western states but is applied to the
    # Central table only - confirm this is the intended table and direction.
    state_mapping = {
        "Arizona": "AZ", "California": "CA", "Colorado": "CO", "Idaho": "ID",
        "Montana": "MT", "New Mexico": "NM", "Oregon": "OR", "Washington": "WA", "Utah": "UT"
    }
    orders_central_df = orders_central_df.replace(state_mapping, subset=["State"])

    # Pivot Quotas: Unpivot quota data to create Year and Quota fields.
    # The backticks make stack() read the *columns* named 2015..2018; without
    # them the bare numbers are integer literals and Quota would simply repeat
    # the year. Assumes the quota table has one column per year - TODO confirm.
    quota_df = quota_df.selectExpr("Region", "stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`) as (Year, Quota)")

    # Clean Notes/Approver: Split Notes into Return Notes and Approver, standardize names, and remove original Notes
    # The text before the first "-" becomes Return Notes, the text between the
    # first and second "-" becomes Approver (null when Notes has no "-").
    returns_df = returns_df.withColumn("Return Notes", F.trim(F.split(F.col("Notes"), "-").getItem(0))) \
        .withColumn("Approver", F.trim(F.split(F.col("Notes"), "-").getItem(1))) \
        .drop("Notes")

    approver_mapping = {
        "M/ Gomez": "M Gomez", "M. Gomez": "M Gomez", "S Kelly": "S Kelly", "S. Kelly": "S Kelly",
        "F Azad": "F Azad", "F. Azad": "F Azad", "L Smith": "L Smith", "L. Smith": "L Smith",
        "G Lindsay": "G Lindsay", "G. Lindsay": "G Lindsay", "R Chen": "R Chen", "R. Chen": "R Chen",
        "K Lawrence": "K Lawrence", "K. Lawrence": "K Lawrence", "L Jenkins": "L Jenkins", "L. Jenkins": "L Jenkins",
        "R Duchesne": "R Duchesne", "R. Duchesne": "R Duchesne"
    }
    returns_df = returns_df.replace(approver_mapping, subset=["Approver"])

    # All Orders: Union all order datasets into a single DataFrame.
    # unionByName matches columns by name. The Central table had columns
    # dropped and appended above, so a positional union() would line its
    # values up under the wrong columns of the other tables.
    all_orders_df = orders_central_df.unionByName(orders_east_df).unionByName(orders_south_2015_df) \
        .unionByName(orders_south_2016_df).unionByName(orders_south_2017_df).unionByName(orders_south_2018_df).unionByName(orders_west_df)

    # Orders + Returns: Join orders with returns data.
    # Left join keeps every order line; lines without a matching return get a
    # null Return Reason and are therefore flagged "No" below. (A right join
    # would keep only returned orders.)
    # NOTE(review): the filter drops rows whose Discount is in [17, 18) -
    # presumably known-bad values; confirm against the data owner.
    orders_returns_df = all_orders_df.join(returns_df, ["Order ID", "Product ID"], "left") \
        .withColumn("Returned?", F.when(F.col("Return Reason").isNotNull(), "Yes").otherwise("No")) \
        .withColumn("Days to Ship", F.datediff(F.col("Ship Date"), F.col("Order Date"))) \
        .withColumn("Discount", F.when(F.col("Discount").isNull(), 0).otherwise(F.col("Discount"))) \
        .withColumn("Year of Sale", F.year(F.col("Order Date"))) \
        .filter(~((F.col("Discount") >= 17) & (F.col("Discount") < 18)))

    # Roll Up Sales: Aggregate sales data per Region and Year of Sale
    rollup_sales_df = orders_returns_df.groupBy("Region", "Year of Sale") \
        .agg(F.sum("Profit").alias("Profit"), F.sum("Sales").alias("Sales"), F.sum("Quantity").alias("Quantity"), F.avg("Discount").alias("Discount"))

    # Quota + Orders: Join quota with aggregated sales data.
    # Both sides carry a Region column; the aggregate's copy is dropped after
    # the join because a table cannot be saved with duplicate column names.
    quota_orders_df = quota_df.join(rollup_sales_df, (quota_df.Region == rollup_sales_df.Region) & (quota_df.Year == rollup_sales_df["Year of Sale"]), "inner") \
        .drop(rollup_sales_df.Region)

    # Write to Unity Catalog target tables
    spark.sql("DROP TABLE IF EXISTS genai_demo.citi.superstore_sales")
    orders_returns_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.superstore_sales")

    spark.sql("DROP TABLE IF EXISTS genai_demo.citi.annual_regional_performance")
    quota_orders_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.citi.annual_regional_performance")

    logger.info("Data migration completed successfully.")

except Exception as e:
    # Log with the full traceback, then propagate so the run is reported as
    # failed rather than finishing "successfully" with missing output tables.
    logger.exception("Error during data migration: %s", e)
    raise
