In [None]:
import logging
from pyspark.sql.functions import col, when, datediff, current_date, avg, count, max as spark_max
from pyspark.sql import functions as F

# Module-level logger for this script, plus a request for INFO-level
# output via the root logger configuration.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Customer ETL: read the five source tables, join demographics -> policy ->
# claims, attach per-customer claim aggregates and derived metrics, enrich with
# AI/ML insights and scores, and overwrite catalog.db.final_output.
#
# All joins are inner joins, so a customer missing from any one of the inputs
# (no policy, no claim, no score row, no insight row) is absent from the output.
try:
    # Load data from Unity Catalog tables
    policy_df = spark.table("catalog.db.policy")
    claims_df = spark.table("catalog.db.claims")
    demographics_df = spark.table("catalog.db.demographics")
    scores_df = spark.table("catalog.db.scores")
    aiml_insights_df = spark.table("catalog.db.aiml_insights")

    # Select relevant fields
    demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State",
        "Postal_Code", "Date_of_Birth", "Gender", "Marital_Status", "Occupation",
        "Income_Level", "Customer_Segment"
    )

    claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
        "Claim_Amount", "Claim_Payout"
    )

    policy_df = policy_df.select(
        "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status", "Policy_Start_Date",
        "Policy_End_Date", "Policy_Term", "Policy_Premium", "Total_Premium_Paid",
        "Renewal_Status", "Policy_Addons"
    )

    scores_df = scores_df.select(
        "Customer_ID", "Credit_Score", "Fraud_Score", "Customer_Risk_Score"
    )

    aiml_insights_df = aiml_insights_df.select(
        "Customer_ID", "Churn_Probability", "Next_Best_Offer",
        "Claims_Fraud_Probability", "Revenue_Potential"
    )

    # Data Integration
    # Join demographics and policy data on Customer_ID
    demo_policy_df = demographics_df.join(policy_df, "Customer_ID", "inner")

    # Join the result with claims data on Policy_ID (one row per claim)
    demo_policy_claims_df = demo_policy_df.join(claims_df, "Policy_ID", "inner")

    # Aggregation and Summarization
    summary_df = demo_policy_claims_df.groupBy("Customer_ID").agg(
        count("Claim_ID").alias("Total_Claims"),
        spark_max("Claim_Date").alias("Recent_Claim_Date"),
        avg("Claim_Amount").alias("Average_Claim_Amount"),
        # The joined frame has one row per claim, so a plain count of
        # Policy_ID would just repeat Total_Claims; count distinct policies.
        F.countDistinct("Policy_ID").alias("Policy_Count")
    )

    # Join summarized data with detailed customer data
    detailed_df = demo_policy_claims_df.join(summary_df, "Customer_ID", "inner")

    # Custom Calculations
    detailed_df = detailed_df.withColumn(
        # Approximate, fractional age: elapsed days divided by a 365-day year.
        "Age", datediff(current_date(), col("Date_of_Birth")) / 365
    ).withColumn(
        # Per-claim ratio; falls back to 0 when the premium is 0 or null.
        "Claim_To_Premium_Ratio", when(col("Total_Premium_Paid") != 0, col("Claim_Amount") / col("Total_Premium_Paid")).otherwise(0)
    ).withColumn(
        "Claims_Per_Policy", when(col("Policy_Count") != 0, col("Total_Claims") / col("Policy_Count")).otherwise(0)
    ).withColumn(
        # The next three columns are fixed literals applied to every row.
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

    # Comprehensive Data Consolidation
    final_df = detailed_df.join(aiml_insights_df, "Customer_ID", "inner").join(scores_df, "Customer_ID", "inner")

    # Write the final output to Unity Catalog table.
    # A single Delta overwrite (with schema replacement) is used instead of
    # DROP TABLE + write, so a failed write does not leave the table missing.
    (
        final_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("catalog.db.final_output")
    )

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Log with traceback, then re-raise so the job/notebook run is marked as
    # failed instead of silently finishing after an error.
    logger.exception("Error during ETL process: %s", e)
    raise
