In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pipeline overview: read customers and transactions, derive per-transaction
# metrics (effective price, total value, tenure, product group, z-score,
# outlier flag), persist the enriched rows as a Delta table, then build
# several summary DataFrames from the enriched data.
try:
    # Load data from Unity Catalog tables
    customers_df = spark.table("catalog.source_db.customers")
    transactions_df = spark.table("catalog.source_db.transactions")

    # Step 1: Valid Transactions Filtering
    # Rows where either comparison evaluates to NULL (NULL Sales or NULL
    # Product) are dropped as well, because filter() keeps only TRUE rows.
    valid_txns_df = transactions_df.filter(
        (F.col("Sales") > 0) & (F.col("Product") != "")
    )
    logger.info("Filtered valid transactions.")

    # Step 2: Effective Price Calculation
    # Discount is applied as a percentage (divided by 100).
    trans_step2_df = valid_txns_df.withColumn(
        "EffectivePrice", F.col("Sales") * (1 - F.col("Discount") / 100)
    )
    logger.info("Calculated effective price.")

    # Step 3: Total Value Calculation
    trans_step3_df = trans_step2_df.withColumn(
        "TotalValue", F.col("EffectivePrice") * F.col("Quantity")
    )
    logger.info("Calculated total value.")

    # Step 4: Full Data Join
    # Left join keeps every transaction; Region/JoinDate are NULL when no
    # customer row matches.
    full_data_df = trans_step3_df.join(customers_df, "CustomerID", "left").select(
        trans_step3_df["*"], customers_df["Region"], customers_df["JoinDate"]
    )
    logger.info("Performed full data join.")

    # Step 5: Tenure Days Calculation
    trans_step5_df = full_data_df.withColumn(
        "TenureDays", F.datediff(F.col("TransDate"), F.col("JoinDate"))
    )
    logger.info("Calculated tenure days.")

    # Step 6: Tenure Category Assignment
    # NOTE(review): a NULL TenureDays (unmatched customer or missing date)
    # fails both comparisons and therefore falls through to "Loyal" --
    # confirm this is the intended bucket for unknown tenure.
    trans_step6_df = trans_step5_df.withColumn(
        "TenureCategory",
        F.when(F.col("TenureDays") < 180, "New")
        .when(F.col("TenureDays") < 365, "Medium")
        .otherwise("Loyal"),
    )
    logger.info("Assigned tenure category.")

    # Step 7: High Value Flag
    trans_step7_df = trans_step6_df.withColumn(
        "HighValueFlag", F.col("TotalValue") > 2000
    )
    logger.info("Flagged high value transactions.")

    # Step 8: Product Group Assignment
    trans_step8_df = trans_step7_df.withColumn(
        "ProductGroup",
        F.when(F.col("Product").isin("A", "C"), "Core").otherwise("Non-Core"),
    )
    logger.info("Assigned product group.")

    # Step 9: Z-score Standardization
    # Mean and standard deviation are computed per ProductGroup.
    windowSpec = Window.partitionBy("ProductGroup")
    group_mean = F.avg("TotalValue").over(windowSpec)
    group_std = F.stddev("TotalValue").over(windowSpec)
    # Guard the division: a group whose TotalValue values are all identical
    # has a standard deviation of 0, and dividing by zero raises under ANSI
    # SQL mode. Without a matching branch, when() yields NULL, which is what
    # the unguarded division produces when ANSI mode is off.
    standardized_df = trans_step8_df.withColumn(
        "ZScoreTotalValue",
        F.when(group_std > 0, (F.col("TotalValue") - group_mean) / group_std),
    )
    logger.info("Standardized total value using Z-score.")

    # Step 10: Outlier Detection
    # A row is an outlier when it lies more than 2 standard deviations from
    # its product-group mean; the flag is NULL when the z-score is NULL.
    enhanced_final_data_df = standardized_df.withColumn(
        "OutlierFlag", F.abs(F.col("ZScoreTotalValue")) > 2
    )
    logger.info("Detected outliers.")

    # Output Handling: Write to Unity Catalog tables
    # NOTE(review): the table is dropped before the overwrite, so it briefly
    # does not exist and any prior table history is discarded -- confirm this
    # is intended rather than relying on mode("overwrite") alone.
    spark.sql("DROP TABLE IF EXISTS catalog.target_db.enhanced_final_data")
    enhanced_final_data_df.write.format("delta").mode("overwrite").saveAsTable(
        "catalog.target_db.enhanced_final_data"
    )
    logger.info("Saved enhanced final data to Unity Catalog.")

    # Additional Outputs: Summary Statistics, Frequency Analysis, Correlation Analysis
    # NOTE(review): the DataFrames below are only defined here; nothing in
    # this block writes, collects, or displays them.

    # Summary Statistics by Region and Product Group
    summary_stats_df = enhanced_final_data_df.groupBy("Region", "ProductGroup").agg(
        F.mean("TotalValue").alias("MeanTotalValue"),
        F.sum("TotalValue").alias("SumTotalValue"),
        F.mean("Quantity").alias("MeanQuantity"),
        F.sum("Quantity").alias("SumQuantity"),
        F.mean("Sales").alias("MeanSales"),
        F.sum("Sales").alias("SumSales"),
    )
    logger.info("Calculated summary statistics by region and product group.")

    # Tenure Category Frequency by Region
    tenure_freq_df = enhanced_final_data_df.groupBy("TenureCategory", "Region").count()
    logger.info("Calculated tenure category frequency by region.")

    # Correlation Analysis
    # DataFrame.corr() requires exactly two column names, so calling it with
    # no arguments fails. Instead, compute the Pearson correlation for every
    # unordered pair of the analysis columns in a single aggregation; the
    # result is a one-row DataFrame with one "Corr_<left>_<right>" column per
    # pair.
    corr_columns = ["Sales", "Discount", "Quantity", "TotalValue"]
    correlation_df = enhanced_final_data_df.agg(
        *[
            F.corr(left, right).alias(f"Corr_{left}_{right}")
            for idx, left in enumerate(corr_columns)
            for right in corr_columns[idx + 1:]
        ]
    )
    logger.info("Performed correlation analysis.")

    # Summary by Outlier Flag
    outlier_summary_df = enhanced_final_data_df.groupBy("OutlierFlag").agg(
        F.mean("Sales").alias("MeanSales"),
        F.stddev("Sales").alias("StdDevSales"),
        F.mean("TotalValue").alias("MeanTotalValue"),
        F.stddev("TotalValue").alias("StdDevTotalValue"),
        F.mean("Quantity").alias("MeanQuantity"),
        F.stddev("Quantity").alias("StdDevQuantity"),
    )
    logger.info("Calculated summary by outlier flag.")

except Exception as e:
    # Top-level boundary: log with the full traceback, then re-raise so the
    # failure is not swallowed.
    logger.exception("An error occurred: %s", e)
    raise
