In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Customer 360 View
# MAGIC This notebook performs an ETL process to create a comprehensive Customer 360 view by integrating claims, demographics, policy, scores, and AIML insights data from Unity Catalog tables.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import count, max, avg, datediff, current_date, when, lit

# COMMAND ----------
# MAGIC
# Logging setup: obtain this module's logger, then request INFO-level output
# on the root logger (basicConfig does nothing if the root logger already has
# handlers attached).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# MAGIC
try:
    # NOTE: the whole pipeline is kept in a single notebook cell. Lines of the
    # form `# COMMAND ----------` delimit cells, and one try/except statement
    # cannot be split across cells, so no separators appear inside this block.

    # --- Extract: load the source tables from Unity Catalog ----------------
    logger.info("Loading data from Unity Catalog tables...")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    policy_df = spark.table("genai_demo.jnj.policy")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    # --- Project: keep only the columns used downstream --------------------
    logger.info("Selecting relevant fields from demographics data for customer insights...")
    selected_demographics_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    logger.info("Selecting relevant fields from claims data for claims analysis...")
    selected_claims_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status", "Claim_Amount", "Claim_Payout"
    )

    # --- Join: customer -> policy -> claim ---------------------------------
    # Both joins are inner joins, so customers without a policy and policies
    # without a claim do not appear in joined_df.
    logger.info("Joining demographics and policy data on Customer_ID to link customer and policy information...")
    demographics_policy_df = selected_demographics_df.join(policy_df, "Customer_ID", "inner")

    logger.info("Joining the result with claims data on Policy_ID to associate claims with policies...")
    joined_df = demographics_policy_df.join(selected_claims_df, "Policy_ID", "inner")

    # --- Aggregate: per-customer claim metrics -----------------------------
    logger.info("Aggregating data to calculate metrics such as total claims and average claim amount...")
    summarized_df = joined_df.groupBy("Customer_ID").agg(
        count("Claim_ID").alias("Total_Claims"),
        # Every joined row carries a non-null Policy_ID (it is the inner-join
        # key), so a plain count would return the number of rows rather than
        # the number of policies. Count distinct policies instead.
        # Only policies with at least one claim are present after the joins.
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        max("Claim_Date").alias("Recent_Claim_Date"),
        avg("Claim_Amount").alias("Average_Claim_Amount"),
    )

    # Attach the per-customer aggregates back onto every detail row.
    logger.info("Joining summarized data with the previous join result on Customer_ID for comprehensive metrics...")
    final_joined_df = summarized_df.join(joined_df, "Customer_ID", "inner")

    # --- Derive: additional metrics ----------------------------------------
    logger.info("Calculating additional metrics for customer insights...")
    # Approximate age in (fractional) years: days since birth divided by 365.
    age_calculation = datediff(current_date(), F.col("Date_of_Birth")) / 365
    # Both ratios fall back to 0 when the denominator is not strictly positive
    # (including NULL), which avoids division by zero.
    # Total_Premium_Paid is not selected above; presumably it comes from
    # policy_df -- verify against the table schema.
    claim_to_premium_ratio = when(
        F.col("Total_Premium_Paid") > 0,
        F.col("Claim_Amount") / F.col("Total_Premium_Paid"),
    ).otherwise(0)
    claims_per_policy = when(
        F.col("Policy_Count") > 0,
        F.col("Total_Claims") / F.col("Policy_Count"),
    ).otherwise(0)

    final_joined_df = final_joined_df.withColumn("Age", age_calculation)
    final_joined_df = final_joined_df.withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio)
    final_joined_df = final_joined_df.withColumn("Claims_Per_Policy", claims_per_policy)
    # The next three columns are constant placeholders, not computed values.
    final_joined_df = final_joined_df.withColumn("Retention_Rate", lit(0.85))
    final_joined_df = final_joined_df.withColumn("Cross_Sell_Opportunities", lit("MultiPolicy Discount, Home Coverage Addon"))
    final_joined_df = final_joined_df.withColumn("Upsell_Potential", lit("Premium Vehicle Coverage"))

    # --- Enrich: scores and AIML insights ----------------------------------
    # Inner joins again: customers missing from either table are dropped.
    logger.info("Combining data from multiple sources including AIML insights and scores for a 360-degree customer view...")
    customer_360_df = final_joined_df.join(scores_df, "Customer_ID", "inner").join(aiml_insights_df, "Customer_ID", "inner")

    # --- Load: publish the Customer 360 table ------------------------------
    # Overwrite in place instead of DROP + recreate: a failed write then leaves
    # the previous version of the table intact, and the table's Delta history
    # is kept. `overwriteSchema` lets the overwrite replace a changed schema,
    # which is what the explicit DROP was compensating for.
    logger.info("Writing the final DataFrame to the Customer 360 view table in Databricks...")
    (
        customer_360_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.jnj.customer_360_view")
    )

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with the full traceback, then re-raise so the notebook/job run is
    # reported as failed instead of silently finishing "successfully".
    logger.exception("An error occurred during the ETL process")
    raise
