In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Insurance Data
# MAGIC This notebook performs an ETL process on insurance data, including data ingestion, transformation, integration of AI/ML insights, and output to a Unity Catalog table.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import lit, when, datediff, current_date

# Configure root logging at INFO level and create the module-level logger
# that every ETL step in this notebook writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------
# MAGIC
# Step 1: Data Ingestion
# Default upload folder that holds the five source CSV files.
_DEFAULT_SOURCE_DIR = (
    "tfs://dataeconomy-9k42/62457/uploads/62457/"
    "29aabe9d-c354-4176-8f98-2dd7a5fd7216"
)

# File stems, in the order the DataFrames are returned by load_data().
_SOURCE_NAMES = ("policy", "claims", "demographics", "scores", "aiml_insights")


def load_data(base_path=_DEFAULT_SOURCE_DIR):
    """Read the five source CSV files into Spark DataFrames.

    Each file is read as ``<base_path>/<name>.csv`` with a header row and
    schema inference enabled.

    Args:
        base_path: Folder containing the CSV files. Defaults to the upload
            location this notebook was written against; a trailing ``/`` is
            tolerated.

    Returns:
        A 5-tuple ``(policy_df, claims_df, demographics_df, scores_df,
        aiml_insights_df)``.

    Raises:
        Exception: Any error raised while reading is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Loading CSV files into DataFrames")
        root = base_path.rstrip("/")
        return tuple(
            spark.read.csv(f"{root}/{name}.csv", header=True, inferSchema=True)
            for name in _SOURCE_NAMES
        )
    except Exception as e:
        logger.error(f"Error loading CSV files: {e}")
        raise

# Run ingestion at notebook scope; the returned DataFrames are consumed by the
# transformation and integration steps further down.
policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_data()

# COMMAND ----------
# MAGIC
# Step 2: Data Transformation
def transform_data(demographics_df, policy_df, claims_df):
    """Build one row of derived metrics per customer.

    Demographics are inner-joined to policies and policies to claims, so
    customers without a policy and policies without a claim do not appear in
    the result. Claims are first rolled up to one row per policy and then to
    one row per customer, so that a policy's premium is counted once no matter
    how many claims it has.

    NOTE(review): column ownership is inferred from the join keys and column
    names, not from a declared schema -- confirm against the CSV files:
      * demographics_df: ``Customer_ID``, ``Date_of_Birth``
      * policy_df: ``customer_id``, ``policy_id``, ``total_premium_paid``
        (assumed to be a per-policy amount)
      * claims_df: ``Policy_ID``, ``Claim_ID``, ``Claim_Amount``

    Args:
        demographics_df: Customer demographics DataFrame.
        policy_df: Policy DataFrame.
        claims_df: Claims DataFrame.

    Returns:
        DataFrame with columns ``Customer_ID``, ``Age``,
        ``Claim_To_Premium_Ratio``, ``Claims_Per_Policy``, ``Retention_Rate``,
        ``Cross_Sell_Opportunities`` and ``Upsell_Potential``.

    Raises:
        Exception: Any error raised while building the plan is logged and
            re-raised unchanged.
    """
    try:
        logger.info("Applying transformations")

        # The join keys differ only by case, and Spark resolves column names
        # case-insensitively by default, so unqualified F.col() references
        # would be ambiguous. Bind each key to the DataFrame it comes from.
        customer_key = demographics_df["Customer_ID"]
        policy_key = policy_df["policy_id"]

        # Join demographics and policy data
        demo_policy_df = demographics_df.join(
            policy_df, customer_key == policy_df["customer_id"], "inner"
        )

        # Join with claims data
        demo_policy_claims_df = demo_policy_df.join(
            claims_df, policy_key == claims_df["Policy_ID"], "inner"
        )

        # Stage 1: collapse the claim rows to one row per (customer, policy).
        # Date_of_Birth and total_premium_paid repeat on every claim row of a
        # policy, so first() just picks that repeated value.
        per_policy_df = demo_policy_claims_df.groupBy(
            customer_key.alias("Customer_ID"), policy_key.alias("policy_id")
        ).agg(
            F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
            F.first("total_premium_paid", ignorenulls=True).alias("total_premium_paid"),
            F.count("Claim_ID").alias("Claim_Count"),
            F.sum("Claim_Amount").alias("Claim_Amount"),
        )

        # Stage 2: collapse to one row per customer, carrying forward every
        # column the metric expressions below read.
        summary_df = per_policy_df.groupBy("Customer_ID").agg(
            F.sum("Claim_Count").alias("Total_Claims"),
            # One row per policy here, so a plain count is the policy count.
            F.count("policy_id").alias("Policy_Count"),
            F.sum("Claim_Amount").alias("Claim_Amount"),
            F.sum("total_premium_paid").alias("total_premium_paid"),
            F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
        )

        # Calculate additional metrics
        # Whole years elapsed since birth; months_between avoids the leap-day
        # drift of dividing a day count by 365.
        age_expr = F.floor(
            F.months_between(current_date(), F.col("Date_of_Birth")) / 12
        ).cast(IntegerType())
        # Both ratios fall back to 0 when the denominator is zero or null.
        claim_to_premium_ratio_expr = when(
            F.col("total_premium_paid") != 0,
            F.col("Claim_Amount") / F.col("total_premium_paid"),
        ).otherwise(0)
        claims_per_policy_expr = when(
            F.col("Policy_Count") != 0,
            F.col("Total_Claims") / F.col("Policy_Count"),
        ).otherwise(0)

        # Retention_Rate, Cross_Sell_Opportunities and Upsell_Potential are
        # constant literals applied to every row.
        final_df = summary_df.withColumn("Age", age_expr) \
                             .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio_expr) \
                             .withColumn("Claims_Per_Policy", claims_per_policy_expr) \
                             .withColumn("Retention_Rate", lit(0.85)) \
                             .withColumn("Cross_Sell_Opportunities", lit("Multi-Policy Discount, Home Coverage Add-on")) \
                             .withColumn("Upsell_Potential", lit("Premium Vehicle Coverage"))

        # Define schema contract
        final_df = final_df.select(
            "Customer_ID", "Age", "Claim_To_Premium_Ratio", "Claims_Per_Policy",
            "Retention_Rate", "Cross_Sell_Opportunities", "Upsell_Potential"
        )
        return final_df
    except Exception as e:
        logger.error(f"Error during data transformation: {e}")
        raise

# Build the per-customer metrics frame from the ingested DataFrames.
final_df = transform_data(demographics_df, policy_df, claims_df)

# COMMAND ----------
# MAGIC
# Step 3: Integrate AI/ML Insights and Scores
def integrate_insights(final_df, aiml_insights_df, scores_df):
    """Attach AI/ML insights and scores to the per-customer metrics.

    Both joins are inner joins on ``Customer_ID``, so only customers present
    in all three inputs are kept.

    Args:
        final_df: Per-customer metrics DataFrame.
        aiml_insights_df: AI/ML insights DataFrame with a ``Customer_ID`` column.
        scores_df: Scores DataFrame with a ``Customer_ID`` column.

    Returns:
        The joined DataFrame.

    Raises:
        Exception: Any error raised while joining is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Integrating AI/ML insights and scores")
        with_insights_df = final_df.join(aiml_insights_df, on="Customer_ID", how="inner")
        return with_insights_df.join(scores_df, on="Customer_ID", how="inner")
    except Exception as err:
        logger.error(f"Error integrating AI/ML insights and scores: {err}")
        raise

# Combine the metrics with the AI/ML insights and scores into the frame that
# is written out in the final step.
customer_360_df = integrate_insights(final_df, aiml_insights_df, scores_df)

# COMMAND ----------
# MAGIC
# Step 4: Output Data
def output_data(customer_360_df, table_name="catalog.target_db.customer_360"):
    """Overwrite the target Delta table with the customer 360 DataFrame.

    Args:
        customer_360_df: DataFrame to persist.
        table_name: Fully qualified name of the table to write. Defaults to
            ``catalog.target_db.customer_360``.

    Raises:
        Exception: Any error raised by the write is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Writing output data to Unity Catalog table")
        # Replace the table contents in a single overwrite instead of dropping
        # the table first: a DROP followed by a failed write would leave no
        # table at all and throws away the existing table's history.
        # overwriteSchema lets the overwrite succeed when the column set has
        # changed, which is what the DROP was otherwise needed for.
        (
            customer_360_df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
    except Exception as e:
        logger.error(f"Error writing output data: {e}")
        raise

# Persist the integrated frame; an exception here propagates and stops the run.
output_data(customer_360_df)

# Only reached when every preceding step ran without raising.
logger.info("ETL process completed successfully")
