In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # Insurance Data Processing
# MAGIC This notebook loads policy, claims, demographics, scores, and AI/ML insights data from CSV files, joins and summarizes it per customer, derives additional metrics, and finally writes the result to a Delta table.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The Spark session is not created here: the code below expects it to already exist as the global 'spark' (as it does in a Databricks notebook).

# COMMAND ----------
# MAGIC
# Load data from CSV files stored in cloud storage
def load_data(base_path="s3://bucket/path"):
    """Read the five source CSV files into Spark DataFrames.

    Uses the notebook-level ``spark`` session; it is not created here.

    Args:
        base_path: URI prefix of the directory holding ``policy.csv``,
            ``claims.csv``, ``demographics.csv``, ``scores.csv`` and
            ``aiml_insights.csv``. Defaults to the location that used to be
            hard-coded, so calling ``load_data()`` reads the same files as
            before. A trailing slash is tolerated.

    Returns:
        The tuple ``(policy_df, claims_df, demographics_df, scores_df,
        aiml_insights_df)``.

    Raises:
        Exception: Any error raised while reading is logged and re-raised
            unchanged.
    """
    # The order here defines the order of the returned tuple, which callers
    # unpack positionally.
    dataset_names = ("policy", "claims", "demographics", "scores", "aiml_insights")
    try:
        logger.info("Loading data from CSV files...")
        root = base_path.rstrip("/")
        # Every file is read with the same options: first row is the header and
        # column types are inferred from the data.
        frames = tuple(
            spark.read.csv(f"{root}/{name}.csv", header=True, inferSchema=True)
            for name in dataset_names
        )
        logger.info("Data loaded successfully.")
        return frames
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

# Run the load step; the five DataFrames are bound at notebook scope so the
# cells below can use them.
policy_df, claims_df, demographics_df, scores_df, aiml_insights_df = load_data()

# COMMAND ----------
# MAGIC
# Select relevant fields
def select_fields(demographics_df, claims_df, policy_df):
    """Project each source DataFrame onto the columns the pipeline uses.

    Args:
        demographics_df: Must provide ``Customer_ID``, ``Customer_Name``,
            ``Email`` and ``Date_of_Birth``.
        claims_df: Must provide ``Claim_ID``, ``Policy_ID``, ``Claim_Date``
            and ``Claim_Amount``.
        policy_df: Must provide ``policy_id``, ``customer_id``,
            ``policy_type`` and ``total_premium_paid``.

    Returns:
        The tuple ``(demographics_selected, claims_selected,
        policy_selected)``, in that order.

    Raises:
        Exception: Any error raised by a projection is logged and re-raised
            unchanged.
    """
    demographics_columns = ["Customer_ID", "Customer_Name", "Email", "Date_of_Birth"]
    claims_columns = ["Claim_ID", "Policy_ID", "Claim_Date", "Claim_Amount"]
    policy_columns = ["policy_id", "customer_id", "policy_type", "total_premium_paid"]

    try:
        logger.info("Selecting relevant fields...")
        trimmed_demographics = demographics_df.select(*demographics_columns)
        trimmed_claims = claims_df.select(*claims_columns)
        trimmed_policies = policy_df.select(*policy_columns)
        logger.info("Fields selected successfully.")
    except Exception as e:
        logger.error(f"Error selecting fields: {e}")
        raise

    return trimmed_demographics, trimmed_claims, trimmed_policies

# Run the projection step. Note the result order (demographics, claims, policy)
# differs from the order load_data() returned the inputs in.
demographics_selected, claims_selected, policy_selected = select_fields(demographics_df, claims_df, policy_df)

# COMMAND ----------
# MAGIC
# Join DataFrames
def join_dataframes(demographics_selected, policy_selected, claims_selected):
    """Inner-join demographics to policies to claims.

    The key columns of the inputs differ only by letter case
    (``Customer_ID`` vs ``customer_id`` and ``policy_id`` vs ``Policy_ID``).
    Spark resolves column names case-insensitively by default, so a condition
    built from unqualified ``F.col(...)`` references cannot tell the two sides
    apart. The right-hand key is therefore renamed to the left-hand spelling
    and each join is done by column name, which also leaves exactly one copy
    of each key column in the result.

    Args:
        demographics_selected: Customer rows keyed by ``Customer_ID``.
        policy_selected: Policy rows carrying ``policy_id`` and
            ``customer_id``.
        claims_selected: Claim rows carrying ``Policy_ID``.

    Returns:
        A DataFrame with one row per matching (customer, policy, claim)
        combination, keyed by ``Customer_ID`` and ``policy_id``. Customers
        without a policy and policies without a claim are dropped by the
        inner joins.

    Raises:
        Exception: Any error raised while building the joins is logged and
            re-raised unchanged.
    """
    try:
        logger.info("Joining DataFrames...")
        # Align key spellings with the left side of each join so the join can
        # be expressed by name instead of an ambiguous column comparison.
        policies = policy_selected.withColumnRenamed("customer_id", "Customer_ID")
        claims = claims_selected.withColumnRenamed("Policy_ID", "policy_id")

        joined_df = demographics_selected.join(policies, on="Customer_ID", how="inner")
        joined_df = joined_df.join(claims, on="policy_id", how="inner")
        logger.info("DataFrames joined successfully.")
        return joined_df
    except Exception as e:
        logger.error(f"Error joining DataFrames: {e}")
        raise

# Run the join step; joined_df is the input of the per-customer summary below.
joined_df = join_dataframes(demographics_selected, policy_selected, claims_selected)

# COMMAND ----------
# MAGIC
# Summarize Data
def summarize_data(joined_df):
    """Aggregate the joined claim-level rows to one row per customer.

    Output columns:
        * ``Customer_ID`` - grouping key.
        * ``Total_Claims`` - number of rows with a non-null ``Claim_ID``.
        * ``Policy_Count`` - number of *distinct* ``policy_id`` values. A plain
          count over claim-level rows would just repeat ``Total_Claims``.
        * ``Recent_Claim_Date`` - latest ``Claim_Date``.
        * ``Average_Claim_Amount`` - mean ``Claim_Amount``.
        * ``Date_of_Birth`` - carried through for the age calculation that
          follows; assumed to be constant per customer.
        * ``total_premium_paid`` - premium summed once per distinct policy,
          carried through for the claim-to-premium ratio that follows.

    Args:
        joined_df: Claim-level DataFrame providing ``Customer_ID``,
            ``policy_id``, ``Claim_ID``, ``Claim_Date``, ``Claim_Amount``,
            ``Date_of_Birth`` and ``total_premium_paid``.

    Returns:
        A DataFrame with one row per ``Customer_ID`` and the columns above.

    Raises:
        Exception: Any error raised while building the aggregation is logged
            and re-raised unchanged.
    """
    try:
        logger.info("Summarizing data...")
        claim_summary = joined_df.groupBy("Customer_ID").agg(
            F.count("Claim_ID").alias("Total_Claims"),
            F.countDistinct("policy_id").alias("Policy_Count"),
            F.max("Claim_Date").alias("Recent_Claim_Date"),
            F.avg("Claim_Amount").alias("Average_Claim_Amount"),
            # Customer attribute repeated on every row of the group.
            F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
        )

        # A policy's premium is repeated on each of its claim rows, so collapse
        # to one row per policy before summing to avoid counting it per claim.
        premium_summary = (
            joined_df.select("Customer_ID", "policy_id", "total_premium_paid")
            .dropDuplicates(["Customer_ID", "policy_id"])
            .groupBy("Customer_ID")
            .agg(F.sum("total_premium_paid").alias("total_premium_paid"))
        )

        # Left join so no summarized customer can be lost at this step.
        summarized_df = claim_summary.join(premium_summary, on="Customer_ID", how="left")
        logger.info("Data summarized successfully.")
        return summarized_df
    except Exception as e:
        logger.error(f"Error summarizing data: {e}")
        raise

# Run the aggregation step; summarized_df feeds the metric calculation below.
summarized_df = summarize_data(joined_df)

# COMMAND ----------
# MAGIC
# Calculate Additional Metrics
def calculate_metrics(summarized_df):
    """Add derived and constant metric columns to the per-customer summary.

    Added columns:
        * ``Age`` - completed years between ``Date_of_Birth`` and today,
          computed from calendar months so leap years do not skew the result
          (dividing a day count by 365 drifts by roughly one day every four
          years and yields a fractional age).
        * ``Claim_To_Premium_Ratio`` - ``Average_Claim_Amount`` divided by
          ``total_premium_paid``; 0 when the premium is not greater than 0.
        * ``Claims_Per_Policy`` - ``Total_Claims`` divided by
          ``Policy_Count``; 0 when the policy count is not greater than 0.
        * ``Retention_Rate``, ``Cross_Sell_Opportunities``,
          ``Upsell_Potential`` - the same literal value for every row.

    Args:
        summarized_df: Per-customer DataFrame. It must provide
            ``Date_of_Birth``, ``total_premium_paid``,
            ``Average_Claim_Amount``, ``Total_Claims`` and ``Policy_Count``.

    Returns:
        ``summarized_df`` with the columns above appended.

    Raises:
        Exception: Any error raised while adding the columns (for example a
            missing input column) is logged and re-raised unchanged.
    """
    try:
        logger.info("Calculating additional metrics...")
        # Whole years: months_between is positive when the first date is later.
        age_expr = F.floor(
            F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
        )
        # Both ratios guard the denominator; a null or non-positive denominator
        # falls through to otherwise(0).
        claim_to_premium_ratio_expr = F.when(
            F.col("total_premium_paid") > 0,
            F.col("Average_Claim_Amount") / F.col("total_premium_paid"),
        ).otherwise(0)
        claims_per_policy_expr = F.when(
            F.col("Policy_Count") > 0,
            F.col("Total_Claims") / F.col("Policy_Count"),
        ).otherwise(0)

        final_df = (
            summarized_df
            .withColumn("Age", age_expr)
            .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio_expr)
            .withColumn("Claims_Per_Policy", claims_per_policy_expr)
            # Constant columns: every customer receives the same literal.
            .withColumn("Retention_Rate", F.lit(0.85))
            .withColumn("Cross_Sell_Opportunities", F.lit("MultiPolicy Discount, Home Coverage Addon"))
            .withColumn("Upsell_Potential", F.lit("Premium Vehicle Coverage"))
        )
        logger.info("Additional metrics calculated successfully.")
        return final_df
    except Exception as e:
        logger.error(f"Error calculating additional metrics: {e}")
        raise

# Run the metric step; final_df is enriched with insights and scores below.
final_df = calculate_metrics(summarized_df)

# COMMAND ----------
# MAGIC
# Join with AIML Insights and Scores
def join_with_insights(final_df, aiml_insights_df, scores_df):
    """Enrich the per-customer metrics with AI/ML insights and scores.

    Both enrichments are inner joins on ``Customer_ID``, so only customers
    present in all three inputs remain in the result.
    NOTE(review): confirm that dropping customers without insights or scores
    is intended for the final table.

    Args:
        final_df: Per-customer metrics containing ``Customer_ID``.
        aiml_insights_df: AI/ML insights containing ``Customer_ID``.
        scores_df: Scores containing ``Customer_ID``.

    Returns:
        The combined DataFrame.

    Raises:
        Exception: Any error raised while building the joins is logged and
            re-raised unchanged.
    """
    join_key = "Customer_ID"
    try:
        logger.info("Joining with AIML insights and scores...")
        with_insights = final_df.join(aiml_insights_df, join_key, "inner")
        with_insights_and_scores = with_insights.join(scores_df, join_key, "inner")
        logger.info("Joined with AIML insights and scores successfully.")
    except Exception as e:
        logger.error(f"Error joining with AIML insights and scores: {e}")
        raise
    return with_insights_and_scores

# Run the enrichment step; comprehensive_df is what gets written out below.
comprehensive_df = join_with_insights(final_df, aiml_insights_df, scores_df)

# COMMAND ----------
# MAGIC
# Output Configuration
def write_output(comprehensive_df, table_name="catalog.target_db.Customer_360"):
    """Replace the target Delta table with the contents of ``comprehensive_df``.

    The table is replaced with a single overwrite instead of being dropped
    first. Dropping before writing leaves no table at all if the write then
    fails, and it discards the existing table's history; ``overwriteSchema``
    still lets the new data replace the old schema, which is what the
    drop-and-recreate achieved.

    Args:
        comprehensive_df: DataFrame to persist.
        table_name: Fully qualified target table. Defaults to the table that
            used to be hard-coded, so ``write_output(df)`` writes to the same
            place as before.

    Returns:
        None.

    Raises:
        Exception: Any error raised by the write is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Writing output to Unity Catalog...")
        (
            comprehensive_df.write.format("delta")
            .mode("overwrite")
            # Allow the overwrite to change column names/types as well as rows.
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
        logger.info("Output written successfully.")
    except Exception as e:
        logger.error(f"Error writing output: {e}")
        raise

# Run the final step: persist the enriched per-customer DataFrame.
write_output(comprehensive_df)
