In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Module-level logger for this workflow; the root logger is configured to
# emit INFO and above so the step-by-step progress messages are visible.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

try:
    # End-to-end ETL: enrich valid transactions with customer attributes and
    # derived metrics, persist the enriched rows, then persist two aggregate
    # outputs. Any failure is logged with its traceback and re-raised.

    # Fully qualified output tables, named once so each write and the
    # read-back further down cannot drift apart.
    enhanced_table = "genai_demo.sas.enhanced_final_data"
    summary_table = "genai_demo.sas.summary_stats"
    tenure_freq_table = "genai_demo.sas.tenure_category_freq"

    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    customers_df = spark.table("genai_demo.sas.customers")
    transactions_df = spark.table("genai_demo.sas.transactions")

    # Step 1: Valid Transactions Filtering
    # Rows where Sales or Product is NULL are dropped as well: the comparison
    # evaluates to NULL, and filter() keeps only rows where it is true.
    logger.info("Filtering valid transactions...")
    valid_txns_df = transactions_df.filter((F.col("Sales") > 0) & (F.col("Product") != ""))

    # Step 2: Effective Price Calculation
    # Discount is applied as a percentage (divided by 100).
    logger.info("Calculating effective price...")
    trans_step2_df = valid_txns_df.withColumn("EffectivePrice", F.col("Sales") * (1 - F.col("Discount") / 100))

    # Step 3: Total Value Calculation
    logger.info("Calculating total value...")
    trans_step3_df = trans_step2_df.withColumn("TotalValue", F.col("EffectivePrice") * F.col("Quantity"))

    # Step 4: Full Data Join
    # Left join keeps every transaction; Region and JoinDate are NULL when no
    # customer row matches on CustomerID.
    logger.info("Joining transaction data with customer information...")
    full_data_df = trans_step3_df.join(customers_df, "CustomerID", "left").select(
        trans_step3_df["*"], customers_df["Region"], customers_df["JoinDate"]
    )

    # Step 5: Tenure Days Calculation
    # datediff(end, start): days elapsed from JoinDate to TransDate.
    logger.info("Calculating tenure days...")
    trans_step5_df = full_data_df.withColumn("TenureDays", F.datediff(F.col("TransDate"), F.col("JoinDate")))

    # Step 6: Tenure Category Assignment
    # NOTE(review): when TenureDays is NULL (no matching customer, or a NULL
    # date) both comparisons are NULL and the row falls through to "Loyal".
    # Confirm that this is the intended category for unknown tenure.
    logger.info("Assigning tenure categories...")
    trans_step6_df = trans_step5_df.withColumn(
        "TenureCategory",
        F.when(F.col("TenureDays") < 180, "New")
        .when(F.col("TenureDays") < 365, "Medium")
        .otherwise("Loyal")
    )

    # Step 7: High Value Flag
    logger.info("Flagging high-value transactions...")
    trans_step7_df = trans_step6_df.withColumn("HighValueFlag", F.col("TotalValue") > 2000)

    # Step 8: Product Group Assignment
    logger.info("Assigning product groups...")
    trans_step8_df = trans_step7_df.withColumn(
        "ProductGroup",
        F.when(F.col("Product").isin("A", "C"), "Core").otherwise("Non-Core")
    )

    # Final Data Preparation
    logger.info("Preparing final data...")
    final_data_df = trans_step8_df

    # Sorting for Standardization
    # The window below partitions by ProductGroup on its own, so the
    # standardized values do not depend on this sort; it is kept so the
    # written data is laid out as before.
    logger.info("Sorting data by ProductGroup...")
    sorted_final_data_df = final_data_df.orderBy("ProductGroup")

    # Z-score Standardization (per ProductGroup)
    logger.info("Standardizing TotalValue...")
    window_spec = Window.partitionBy("ProductGroup")
    group_mean = F.mean("TotalValue").over(window_spec)
    group_std = F.stddev("TotalValue").over(window_spec)
    standardized_df = sorted_final_data_df.withColumn(
        "StandardizedTotalValue",
        # Guard the division: a group whose TotalValue is constant has a
        # standard deviation of 0, which raises a divide-by-zero error when
        # ANSI SQL mode is enabled. With the guard the z-score is NULL in
        # that case (and when the deviation itself is NULL), which is what
        # non-ANSI mode already produced.
        F.when(group_std != 0, (F.col("TotalValue") - group_mean) / group_std)
    )

    # Outlier Detection
    # A NULL z-score is not an outlier: the comparison is NULL, so the row
    # takes the otherwise(0) branch.
    logger.info("Detecting outliers...")
    enhanced_final_data_df = standardized_df.withColumn(
        "OutlierFlag",
        F.when(F.abs(F.col("StandardizedTotalValue")) > 2, 1).otherwise(0)
    )

    # Write the processed data to Unity Catalog tables
    # A Delta overwrite replaces the contents atomically. No DROP TABLE is
    # issued first: dropping would leave a window with no table at all, lose
    # the table if the write then failed, and discard the table's history.
    # overwriteSchema lets the replacement carry a changed schema.
    logger.info("Writing enhanced final data to Unity Catalog table...")
    (
        enhanced_final_data_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(enhanced_table)
    )

    # Additional Outputs: Summary Statistics and Frequency Analysis
    # Aggregate from the table that was just written rather than from the
    # lazy DataFrame: this avoids re-running the whole pipeline for each
    # output and keeps the aggregates consistent with the persisted rows.
    logger.info("Generating summary statistics and frequency analysis...")
    persisted_df = spark.table(enhanced_table)
    summary_stats_df = persisted_df.groupBy("Region", "ProductGroup").agg(
        F.mean("TotalValue").alias("MeanTotalValue"),
        F.sum("TotalValue").alias("SumTotalValue"),
        F.mean("Quantity").alias("MeanQuantity"),
        F.sum("Quantity").alias("SumQuantity"),
        F.mean("Sales").alias("MeanSales"),
        F.sum("Sales").alias("SumSales")
    )

    tenure_category_freq_df = persisted_df.groupBy("TenureCategory", "Region").count()

    # Write additional outputs to Unity Catalog tables
    logger.info("Writing summary statistics to Unity Catalog table...")
    (
        summary_stats_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(summary_table)
    )

    logger.info("Writing tenure category frequency analysis to Unity Catalog table...")
    (
        tenure_category_freq_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(tenure_freq_table)
    )

    logger.info("ETL workflow completed successfully.")

except Exception as e:
    # Top-level boundary: logger.exception records the message at ERROR level
    # together with the traceback, then the error is re-raised so the run
    # is still reported as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
