In [None]:
# Databricks notebook source
# COMMAND ----------
import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, max, avg, when, lit, datediff, current_date

# COMMAND ----------
# Configure logging
# basicConfig() only takes effect when the root logger has no handlers yet.
# If the hosting runtime has already installed handlers, the call is a no-op
# and the root level stays at its default, which would drop the INFO progress
# messages emitted below. Setting the level on this module's logger as well
# makes the INFO messages pass the logger-level check in either case.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------
# Customer 360 ETL: load the source tables, join demographics with policies
# and claims, aggregate per-customer claim metrics, derive ratios, enrich with
# AI/ML insights and scores, and overwrite the Customer 360 Delta table.
#
# Handle to the cached demographics projection. It stays None when the run
# fails before the cache is created, so the cleanup in `finally` knows
# whether there is anything to release.
selected_demographics_df = None

try:
    # Data Loading
    logger.info("Loading data from Unity Catalog tables...")
    policy_df = spark.table("genai_demo.jnj.policy")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    # Data Selection and Standardization
    logger.info("Selecting and standardizing relevant fields...")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    # Cache the selected demographics DataFrame as it is used in two joins
    # below (policy integration and the final per-customer join).
    selected_demographics_df.cache()

    # Data Integration
    # NOTE(review): both joins are inner, so customers without a policy or
    # without any claim are absent from the output table - confirm that this
    # is intended for a "360" view.
    logger.info("Integrating datasets...")
    customer_policy_df = selected_demographics_df.join(policy_df, "Customer_ID", "inner")
    customer_policy_claims_df = customer_policy_df.join(claims_df, "Policy_ID", "inner")

    # Data Aggregation and Summarization
    # The F.-qualified functions are used on purpose: the bare `max` imported
    # at the top of the file shadows Python's builtin.
    logger.info("Aggregating data to calculate key metrics...")
    aggregated_df = customer_policy_claims_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        # The joined frame has one row per claim, so a plain count of
        # Policy_ID would just repeat Total_Claims; count distinct policies.
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )

    # Per-customer premium total, needed by Claim_To_Premium_Ratio. It is
    # computed from the policy table directly (not from the claim-level join)
    # so that a policy with several claims is not counted several times.
    # NOTE(review): assumes `total_premium_paid` is a column of the policy
    # table - confirm against the table schema.
    premium_df = policy_df.groupBy("Customer_ID").agg(
        F.sum("total_premium_paid").alias("total_premium_paid")
    )

    # Custom Calculations and Derived Metrics
    logger.info("Calculating custom metrics...")
    final_df = (
        aggregated_df
        .join(selected_demographics_df, "Customer_ID", "inner")
        .join(premium_df, "Customer_ID", "left")
        .withColumn(
            # Whole years elapsed since the date of birth. months_between is
            # calendar-aware, unlike dividing a day count by 365, which
            # drifts by one day per leap year.
            "Age",
            F.floor(F.months_between(current_date(), col("Date_of_Birth")) / 12).cast("int"),
        )
        .withColumn(
            # A zero or null premium falls through to otherwise(0), which
            # avoids a division by zero.
            "Claim_To_Premium_Ratio",
            when(
                col("total_premium_paid") != 0,
                col("Average_Claim_Amount") / col("total_premium_paid"),
            ).otherwise(0),
        )
        .withColumn(
            "Claims_Per_Policy",
            when(col("Policy_Count") != 0, col("Total_Claims") / col("Policy_Count")).otherwise(0),
        )
        # The three columns below are constant placeholders, identical for
        # every customer.
        .withColumn("Retention_Rate", lit(0.85))
        .withColumn("Cross_Sell_Opportunities", lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", lit("Premium Vehicle Coverage"))
    )

    # Predictive Insights Integration
    # Left joins keep every customer even when no insight/score row exists.
    logger.info("Integrating AI/ML insights...")
    enriched_df = final_df.join(aiml_insights_df, "Customer_ID", "left").join(scores_df, "Customer_ID", "left")

    # Output Generation
    logger.info("Writing the final DataFrame to the Customer_360 table...")
    enriched_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.jnj.customer_360_view")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is reported
    # as failed instead of silently finishing after an error.
    logger.exception("An error occurred during the ETL process")
    raise

finally:
    # Release the cached demographics data whether or not the run succeeded.
    if selected_demographics_df is not None:
        selected_demographics_df.unpersist()
