# Databricks notebook source
# COMMAND ----------
# %md
# # Customer 360 View Data Processing
# This notebook processes customer data to create a comprehensive 360-degree view by integrating various datasets.

# COMMAND ----------
#
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, max, floor, to_date, when, current_date, datediff, expr

# Initialize logging
# Configure the root logger at INFO level and create the logger used by every
# step in this notebook.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers attached - confirm INFO messages actually surface in this runtime.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------
# %md
# ## Load Data
# Load data from CSV files into DataFrames.

# COMMAND ----------
#
def load_data():
    """Read the five source CSV files from DBFS into Spark DataFrames.

    Each file is read with a header row and schema inference enabled, using
    the ``spark`` session name that must already exist in the notebook scope.

    Returns:
        A 5-tuple ``(claims_df, demographics_df, policy_df, scores_df,
        aiml_insights_df)``, in that order.

    Raises:
        Exception: Any error raised while reading is logged and re-raised.
    """
    # Order matters: callers unpack the result positionally.
    source_paths = (
        "dbfs:/FileStore/claims.csv",
        "dbfs:/FileStore/demographics.csv",
        "dbfs:/FileStore/policy.csv",
        "dbfs:/FileStore/scores.csv",
        "dbfs:/FileStore/aiml_insights.csv",
    )
    try:
        logger.info("Loading data from CSV files...")
        frames = tuple(
            spark.read.csv(path, header=True, inferSchema=True)
            for path in source_paths
        )
        logger.info("Data loaded successfully.")
        return frames
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

# Run the load step at notebook top level; the five DataFrames become
# module-level names consumed by the later steps.
claims_df, demographics_df, policy_df, scores_df, aiml_insights_df = load_data()

# COMMAND ----------
# %md
# ## Select Relevant Fields
# Select relevant fields from each dataset.

# COMMAND ----------
#
def select_fields(demographics_df, claims_df, policy_df):
    """Narrow each input DataFrame to a fixed list of columns.

    Args:
        demographics_df: DataFrame holding the customer demographic columns.
        claims_df: DataFrame holding the claim columns.
        policy_df: DataFrame holding the policy columns.

    Returns:
        A 3-tuple ``(demographics_selected, claims_selected,
        policy_selected)`` containing only the listed columns.

    Raises:
        Exception: Any error raised by the selections (for example a missing
            column) is logged and re-raised.
    """
    demographic_columns = (
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )
    claim_columns = (
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status",
        "Claim_Amount", "Claim_Payout",
    )
    policy_columns = (
        "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status",
        "Policy_Start_Date", "Policy_End_Date", "Policy_Term",
        "Policy_Premium", "Total_Premium_Paid", "Renewal_Status",
        "Policy_Addons",
    )
    try:
        logger.info("Selecting relevant fields from datasets...")
        demographics_subset = demographics_df.select(*demographic_columns)
        claims_subset = claims_df.select(*claim_columns)
        policy_subset = policy_df.select(*policy_columns)
        logger.info("Field selection completed.")
        return demographics_subset, claims_subset, policy_subset
    except Exception as e:
        logger.error(f"Error selecting fields: {e}")
        raise

# Trim the raw DataFrames to the selected columns; scores_df and
# aiml_insights_df are not passed through this step.
demographics_selected, claims_selected, policy_selected = select_fields(demographics_df, claims_df, policy_df)

# COMMAND ----------
# %md
# ## Data Integration
# Merge datasets based on common identifiers.

# COMMAND ----------
#
def join_datasets(demographics_selected, claims_selected, policy_selected):
    """Combine demographics, policies and claims into one DataFrame.

    Demographics are inner-joined to policies on ``Customer_ID`` and the
    result is inner-joined to claims on ``Policy_ID``. Because both joins are
    inner joins, customers without a policy and policies without a claim do
    not appear in the output.

    Args:
        demographics_selected: Customer rows keyed by ``Customer_ID``.
        claims_selected: Claim rows carrying ``Policy_ID``.
        policy_selected: Policy rows carrying ``Customer_ID`` and ``Policy_ID``.

    Returns:
        The joined DataFrame.

    Raises:
        Exception: Any error raised by the joins is logged and re-raised.
    """
    try:
        logger.info("Joining datasets...")
        customers_with_policies = demographics_selected.join(
            policy_selected, on="Customer_ID", how="inner"
        )
        customers_policies_claims = customers_with_policies.join(
            claims_selected, on="Policy_ID", how="inner"
        )
        logger.info("Datasets joined successfully.")
        return customers_policies_claims
    except Exception as e:
        logger.error(f"Error joining datasets: {e}")
        raise

# Build the joined customer/policy/claim DataFrame that feeds the aggregation step.
demo_policy_claims_joined = join_datasets(demographics_selected, claims_selected, policy_selected)

# COMMAND ----------
# %md
# ## Data Aggregation and Summarization
# Aggregate and summarize the data.

# COMMAND ----------
#
def aggregate_data(demo_policy_claims_joined):
    """Summarize the joined customer/policy/claim rows to one row per customer.

    The input has one row per claim, so policy-level values repeat once per
    claim. The summary therefore:

    * counts policies with a distinct count (a plain row count would simply
      equal the number of claim rows),
    * sums ``Total_Premium_Paid`` over de-duplicated
      ``(Customer_ID, Policy_ID, Total_Premium_Paid)`` rows so a policy's
      premium is not added once per claim, and
    * carries ``Date_of_Birth``, ``Total_Premium_Paid`` and ``Claim_Amount``
      into the output, because ``perform_custom_calculations`` reads those
      columns from this function's result.

    Args:
        demo_policy_claims_joined: DataFrame with at least ``Customer_ID``,
            ``Policy_ID``, ``Claim_ID``, ``Claim_Date``, ``Claim_Amount``,
            ``Total_Premium_Paid`` and ``Date_of_Birth``.

    Returns:
        A DataFrame with one row per ``Customer_ID`` and the columns
        ``Total_Claims``, ``Policy_Count``, ``Recent_Claim_Date``,
        ``Average_Claim_Amount``, ``Claim_Amount`` (sum of claim amounts),
        ``Date_of_Birth`` and ``Total_Premium_Paid`` (sum over the customer's
        policies).

    Raises:
        Exception: Any error raised while building the aggregation is logged
            and re-raised.
    """
    # Local import: these helpers are not in the notebook's top-level import
    # list; ``sum`` is aliased so the Python builtin is not shadowed.
    from pyspark.sql.functions import countDistinct, first, sum as spark_sum

    try:
        logger.info("Aggregating data...")
        claim_summary = demo_policy_claims_joined.groupBy("Customer_ID").agg(
            count("Claim_ID").alias("Total_Claims"),
            # Distinct count: each policy appears once per claim row.
            countDistinct("Policy_ID").alias("Policy_Count"),
            max("Claim_Date").alias("Recent_Claim_Date"),
            avg("Claim_Amount").alias("Average_Claim_Amount"),
            # NOTE(review): exposed as "Claim_Amount" because that is the name
            # the downstream claim-to-premium ratio reads; confirm a
            # customer-level total is the intended numerator.
            spark_sum("Claim_Amount").alias("Claim_Amount"),
            # Presumably constant per customer; take any non-null value.
            first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
        )
        # De-duplicate to policy level before summing the premium.
        premium_totals = (
            demo_policy_claims_joined
            .select("Customer_ID", "Policy_ID", "Total_Premium_Paid")
            .distinct()
            .groupBy("Customer_ID")
            .agg(spark_sum("Total_Premium_Paid").alias("Total_Premium_Paid"))
        )
        # Left join keeps every summarized customer even if the premium side
        # were to lack a matching row.
        summarized_data = claim_summary.join(premium_totals, "Customer_ID", "left")
        logger.info("Data aggregation completed.")
        return summarized_data
    except Exception as e:
        logger.error(f"Error aggregating data: {e}")
        raise

# Collapse the joined rows to one summary row per Customer_ID.
summarized_data = aggregate_data(demo_policy_claims_joined)

# COMMAND ----------
# %md
# ## Custom Calculations
# Perform custom calculations on the summarized data.

# COMMAND ----------
#
def perform_custom_calculations(summarized_data):
    """Add derived and placeholder columns to the per-customer summary.

    Columns added:

    * ``Age`` - completed years between ``Date_of_Birth`` and today, computed
      from whole calendar months rather than ``days / 365`` so leap days do
      not advance the age before the actual birthday.
    * ``Claim_To_Premium_Ratio`` - ``Claim_Amount / Total_Premium_Paid``,
      or 0 when the premium is 0 or null.
    * ``Claims_Per_Policy`` - ``Total_Claims / Policy_Count``, or 0 when the
      policy count is 0 or null.
    * ``Retention_Rate``, ``Cross_Sell_Opportunities``, ``Upsell_Potential`` -
      hard-coded constants, identical for every row.

    Args:
        summarized_data: DataFrame that must contain ``Date_of_Birth``,
            ``Claim_Amount``, ``Total_Premium_Paid``, ``Total_Claims`` and
            ``Policy_Count``.

    Returns:
        The input DataFrame with the columns above appended.

    Raises:
        Exception: Any error raised while building the columns (for example
            an unresolved input column) is logged and re-raised.
    """
    # Local import: months_between is not in the notebook's top-level imports.
    from pyspark.sql.functions import months_between

    try:
        logger.info("Performing custom calculations...")
        birth_date = to_date(col("Date_of_Birth"), 'yyyy-MM-dd')
        final_data = summarized_data.withColumn(
            # Whole months / 12, floored, gives completed years.
            "Age", floor(months_between(current_date(), birth_date) / 12)
        ).withColumn(
            "Claim_To_Premium_Ratio",
            # A null premium makes the condition null, which falls through to 0.
            when(col("Total_Premium_Paid") != 0, col("Claim_Amount") / col("Total_Premium_Paid")).otherwise(0)
        ).withColumn(
            "Claims_Per_Policy",
            when(col("Policy_Count") != 0, col("Total_Claims") / col("Policy_Count")).otherwise(0)
        ).withColumn(
            # Constant placeholder, not derived from the data.
            "Retention_Rate", expr("0.85")
        ).withColumn(
            "Cross_Sell_Opportunities", expr("'Multi-Policy Discount, Home Coverage Add-on'")
        ).withColumn(
            "Upsell_Potential", expr("'Premium Vehicle Coverage'")
        )
        logger.info("Custom calculations completed.")
        return final_data
    except Exception as e:
        logger.error(f"Error in custom calculations: {e}")
        raise

# Append the derived and constant columns to the per-customer summary.
final_data = perform_custom_calculations(summarized_data)

# COMMAND ----------
# %md
# ## Comprehensive Data Joining
# Join with AI/ML insights and scores.

# COMMAND ----------
#
def join_comprehensive_data(final_data, scores_df, aiml_insights_df):
    """Attach score and AI/ML insight columns to the customer summary.

    Both joins are inner joins on ``Customer_ID``, so only customers present
    in all three inputs appear in the result.

    Args:
        final_data: Per-customer DataFrame keyed by ``Customer_ID``.
        scores_df: Scores DataFrame keyed by ``Customer_ID``.
        aiml_insights_df: AI/ML insights DataFrame keyed by ``Customer_ID``.

    Returns:
        The joined customer profile DataFrame.

    Raises:
        Exception: Any error raised by the joins is logged and re-raised.
    """
    try:
        logger.info("Joining with AI/ML insights and scores...")
        with_scores = final_data.join(scores_df, on="Customer_ID", how="inner")
        customer_profile = with_scores.join(
            aiml_insights_df, on="Customer_ID", how="inner"
        )
        logger.info("Comprehensive data joining completed.")
        return customer_profile
    except Exception as e:
        logger.error(f"Error joining comprehensive data: {e}")
        raise

# Enrich the per-customer data with the scores and AI/ML insight DataFrames.
final_customer_profile = join_comprehensive_data(final_data, scores_df, aiml_insights_df)

# COMMAND ----------
# %md
# ## Output Generation
# Write the final output to a Unity Catalog table.

# COMMAND ----------
#
def write_output(final_customer_profile, table_name="catalog.target_db.customer_360_view"):
    """Write the customer profile to a Delta table, replacing its contents.

    The table is replaced with a single Delta overwrite instead of
    ``DROP TABLE`` followed by a write: dropping first leaves a window in
    which the table does not exist, and a failed write would leave no table
    at all. ``overwriteSchema`` lets the overwrite also replace the table
    schema, which is what dropping the table previously allowed.

    NOTE(review): assumes any existing table at ``table_name`` is already a
    Delta table - confirm before relying on this in an environment where it
    may have been created in another format.

    Args:
        final_customer_profile: DataFrame to persist.
        table_name: Fully qualified target table. Defaults to
            ``catalog.target_db.customer_360_view``.

    Raises:
        Exception: Any error raised by the write is logged and re-raised.
    """
    try:
        logger.info("Writing final output to Unity Catalog table...")
        (
            final_customer_profile.write.format("delta")
            .mode("overwrite")
            # Allow the new DataFrame schema to replace the existing one.
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
        logger.info("Output written successfully.")
    except Exception as e:
        logger.error(f"Error writing output: {e}")
        raise

# Persist the final customer profile; this is the notebook's only write.
write_output(final_customer_profile)
