In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Insurance Data
# MAGIC This notebook performs an ETL process on insurance data, integrating various datasets and performing transformations to create a comprehensive customer view.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Configure logging
# basicConfig only installs a handler when the root logger has none yet, so
# this call has no effect if logging was already configured before this cell.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every step function in this notebook.
logger = logging.getLogger(__name__)

# COMMAND ----------
# MAGIC
# Step 1: Data Ingestion
def load_data():
    """Read the five source tables of the insurance ETL.

    Returns:
        A 5-tuple of DataFrames in the order
        (policy, claims, demographics, scores, aiml_insights).
    """
    logger.info("Loading data from Unity Catalog tables...")
    # Order matters: callers unpack the result positionally.
    source_tables = (
        "catalog.insurance_db.policy",
        "catalog.insurance_db.claims",
        "catalog.insurance_db.demographics",
        "catalog.insurance_db.scores",
        "catalog.insurance_db.aiml_insights",
    )
    # `spark` is not defined in this file; presumably it is the session object
    # supplied by the notebook runtime.
    return tuple(spark.table(table_name) for table_name in source_tables)

# Read every source table once; the following cells reuse these DataFrames.
policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_data()

# COMMAND ----------
# MAGIC
# Step 2: Data Selection
def select_data(demographics_df, claims_df, policy_df):
    """Project each source DataFrame down to the columns used by later steps.

    Args:
        demographics_df: Customer demographics DataFrame.
        claims_df: Claims DataFrame.
        policy_df: Policy DataFrame.

    Returns:
        A 3-tuple (demographics, claims, policy) of the projected DataFrames.
    """
    logger.info("Selecting relevant fields from each dataset...")

    demographics_fields = ["Customer_ID", "Customer_Name", "Email", "Phone_Number", "Date_of_Birth"]
    claims_fields = ["Claim_ID", "Policy_ID", "Claim_Date", "Claim_Amount"]
    policy_fields = ["policy_id", "customer_id", "policy_type", "policy_premium", "total_premium_paid"]

    demographics_subset = demographics_df.select(*[F.col(field) for field in demographics_fields])
    claims_subset = claims_df.select(*[F.col(field) for field in claims_fields])
    policy_subset = policy_df.select(*[F.col(field) for field in policy_fields])

    return demographics_subset, claims_subset, policy_subset

# Narrow the three tables that get joined to the columns picked in select_data.
selected_demographics_df, selected_claims_df, selected_policy_df = select_data(demographics_df, claims_df, policy_df)

# COMMAND ----------
# MAGIC
# Step 3: Data Integration
def integrate_data(selected_demographics_df, selected_policy_df, selected_claims_df):
    """Inner-join demographics to policies and policies to claims.

    Because both joins are inner joins, the result only contains customers
    that have at least one policy with at least one claim.

    The right-hand join keys (`customer_id` from policies, `Policy_ID` from
    claims) are dropped after each join. They carry the same values as the
    left-hand keys, and keeping both would leave two columns whose names differ
    only by case. Spark resolves column names case-insensitively by default,
    so any later by-name reference to `Customer_ID` or `policy_id` would be
    rejected as ambiguous.

    Args:
        selected_demographics_df: Demographics with a `Customer_ID` column.
        selected_policy_df: Policies with `policy_id` and `customer_id` columns.
        selected_claims_df: Claims with a `Policy_ID` column.

    Returns:
        The joined DataFrame with a single `Customer_ID` and a single
        `policy_id` column.
    """
    logger.info("Joining datasets on key identifiers...")

    policy_customer_key = selected_policy_df["customer_id"]
    claim_policy_key = selected_claims_df["Policy_ID"]

    # Drop by Column reference, one column per call: dropping by name would
    # remove both spellings, and older PySpark versions only accept a single
    # Column argument in drop().
    customer_policies_df = selected_demographics_df.join(
        selected_policy_df,
        selected_demographics_df["Customer_ID"] == policy_customer_key,
        "inner",
    ).drop(policy_customer_key)

    joined_df = customer_policies_df.join(
        selected_claims_df,
        selected_policy_df["policy_id"] == claim_policy_key,
        "inner",
    ).drop(claim_policy_key)
    return joined_df

# Combine customers, their policies and the claims on those policies.
joined_df = integrate_data(selected_demographics_df, selected_policy_df, selected_claims_df)

# COMMAND ----------
# MAGIC
# Step 4: Aggregation and Custom Calculations
def _drop_duplicate_columns(df):
    """Return `df` without columns whose name repeats an earlier one, ignoring case."""
    seen = set()
    positional_names = []
    duplicates = []
    for position, name in enumerate(df.columns):
        key = name.lower()
        if key in seen:
            placeholder = f"__duplicate_{position}"
            positional_names.append(placeholder)
            duplicates.append(placeholder)
        else:
            seen.add(key)
            positional_names.append(name)
    if not duplicates:
        return df
    # toDF renames by position, which is the only way to address columns that
    # share a name; the renamed duplicates can then be dropped by name.
    return df.toDF(*positional_names).drop(*duplicates)


def aggregate_and_calculate(joined_df):
    """Build one row of claim and policy metrics per customer.

    Every derived column is computed from values produced by the aggregation
    itself. Referencing row-level columns such as `Date_of_Birth` or
    `Claim_Amount` after the `groupBy` cannot work, because those columns no
    longer exist once the data is grouped.

    Output columns:
        Customer_ID, Total_Claims, Average_Claim_Amount, Recent_Claim_Date,
        Age, Claim_To_Premium_Ratio, Claims_Per_Policy, Retention_Rate,
        Cross_Sell_Opportunities, Upsell_Potential.

    Definitions used here:
        * Age: whole years between `Date_of_Birth` and the current date.
        * Claim_To_Premium_Ratio: sum of the customer's claim amounts divided
          by the sum of `total_premium_paid` over the customer's distinct
          policies; NULL when that premium sum is zero or NULL.
        * Claims_Per_Policy: Total_Claims divided by the number of distinct
          policies of the customer.
        NOTE(review): the ratio definitions are an interpretation; confirm
        them against the business definition.

    Retention_Rate, Cross_Sell_Opportunities and Upsell_Potential are constant
    literals applied to every row.

    Args:
        joined_df: Claim-level rows carrying `Customer_ID`, `Date_of_Birth`,
            `policy_id`, `total_premium_paid`, `Claim_ID`, `Claim_Date` and
            `Claim_Amount`. Join keys duplicated under a different letter case
            are tolerated; the first occurrence is kept.

    Returns:
        A DataFrame with one row per `Customer_ID` and the columns listed above.
    """
    logger.info("Performing aggregation and custom calculations...")
    source_df = _drop_duplicate_columns(joined_df)

    claim_metrics_df = source_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount"),
        F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
    )

    # A policy appears once per claim in the joined data, so collapse to one
    # row per policy before summing premiums; otherwise the premium would be
    # counted once per claim.
    policy_metrics_df = (
        source_df.select("Customer_ID", "policy_id", "total_premium_paid")
        .dropDuplicates(["Customer_ID", "policy_id"])
        .groupBy("Customer_ID")
        .agg(
            F.count("policy_id").alias("Policy_Count"),
            F.sum("total_premium_paid").alias("Total_Premium_Paid"),
        )
    )

    # months_between counts calendar months, so leap years do not skew the age
    # the way a fixed 365-day year does.
    age_expr = F.expr(
        "floor(months_between(current_date(), to_date(Date_of_Birth, 'yyyy-MM-dd')) / 12)"
    ).cast(IntegerType())
    # NULLIF turns a zero denominator into NULL so the division yields NULL
    # instead of failing under ANSI SQL mode.
    claim_to_premium_ratio_expr = F.expr("Total_Claim_Amount / NULLIF(Total_Premium_Paid, 0)")
    claims_per_policy_expr = F.expr("Total_Claims / NULLIF(Policy_Count, 0)")

    enriched_df = (
        claim_metrics_df.join(policy_metrics_df, "Customer_ID", "inner")
        .withColumn("Age", age_expr)
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio_expr)
        .withColumn("Claims_Per_Policy", claims_per_policy_expr)
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on"))
        .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
        # Helper inputs are not part of the published customer view.
        .drop("Date_of_Birth", "Total_Claim_Amount", "Policy_Count", "Total_Premium_Paid")
    )
    return enriched_df

# Derive the per-customer metrics from the joined data.
enriched_df = aggregate_and_calculate(joined_df)

# COMMAND ----------
# MAGIC
# Step 5: Advanced Data Enrichment
def enrich_data(enriched_df, scores_df, aiml_insights_df):
    """Attach scores and AI/ML insights to the per-customer metrics.

    Both joins are inner joins on `Customer_ID`, so a customer missing from
    either `scores_df` or `aiml_insights_df` is absent from the result.

    Args:
        enriched_df: Per-customer metrics keyed by `Customer_ID`.
        scores_df: Scores keyed by `Customer_ID`.
        aiml_insights_df: AI/ML insights keyed by `Customer_ID`.

    Returns:
        The DataFrame produced by joining the three inputs.
    """
    logger.info("Integrating AI/ML insights and scores data...")
    with_scores_df = enriched_df.join(scores_df, on="Customer_ID", how="inner")
    with_insights_df = with_scores_df.join(aiml_insights_df, on="Customer_ID", how="inner")
    return with_insights_df

# Add the scores and AI/ML insight columns to the customer metrics.
final_df = enrich_data(enriched_df, scores_df, aiml_insights_df)

# COMMAND ----------
# MAGIC
# Step 6: Output Generation
def write_output(final_df):
    """Replace the contents of `catalog.insurance_db.customer_360` with `final_df`.

    The table is overwritten in place rather than dropped and recreated.
    Dropping first would discard the Delta table's history and the grants
    defined on it, and would leave no table at all if the subsequent write
    failed. A Delta overwrite is a single atomic commit, so readers see either
    the previous or the new contents. `overwriteSchema` lets the overwrite
    also replace the table schema when the DataFrame's columns have changed.

    Args:
        final_df: The DataFrame to publish.
    """
    logger.info("Writing the final dataset to Unity Catalog table...")
    (
        final_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("catalog.insurance_db.customer_360")
    )
    logger.info("ETL process completed successfully.")

# Publish the result; this is the only step that writes data.
write_output(final_df)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Conclusion
# MAGIC The ETL process has been successfully completed, and the final dataset is stored in the Unity Catalog table `customer_360`.
