In [None]:
# Databricks notebook source
# COMMAND ----------
# %md
# # ETL Process for Customer 360 Profile
# This notebook performs an ETL process to create a comprehensive customer profile using various datasets.

# COMMAND ----------
#
import logging
from pyspark.sql import functions as F
from pyspark.sql.functions import count, avg, max, expr, datediff, current_date

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Assume the Spark session is already initialized as 'spark'

# COMMAND ----------
# %md
# ## Step 1: Data Ingestion
# Load data from CSV files into DataFrames.

# COMMAND ----------
#
def load_data(base_path="dbfs:/path/to"):
    """Read the five source CSV extracts into Spark DataFrames.

    Every file is read through the notebook-provided ``spark`` session with a
    header row and schema inference enabled.

    Args:
        base_path: Directory holding ``policy.csv``, ``claims.csv``,
            ``demographics.csv``, ``scores.csv`` and ``aiml_insights.csv``.
            Defaults to the location that used to be hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        Tuple ``(policy_df, claims_df, demographics_df, scores_df,
        aiml_insights_df)``, in that order.

    Raises:
        Exception: Any error raised while reading is logged and re-raised.
    """
    # Order matters: callers unpack the result positionally.
    dataset_names = ("policy", "claims", "demographics", "scores", "aiml_insights")
    # Tolerate a trailing slash so the joined path never contains "//".
    root = base_path.rstrip("/")
    try:
        logger.info("Loading data from CSV files into DataFrames")
        return tuple(
            spark.read.csv(f"{root}/{name}.csv", header=True, inferSchema=True)
            for name in dataset_names
        )
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise

# Materialise the five source DataFrames as notebook-level variables.
(
    policy_df,
    claims_df,
    demographics_df,
    scores_df,
    aiml_insights_df,
) = load_data()

# COMMAND ----------
# %md
# ## Step 2: Data Selection and Filtering
# Select relevant fields from each dataset.

# COMMAND ----------
#
def select_fields(demographics_df, claims_df, policy_df, scores_df, aiml_insights_df):
    """Project each source DataFrame down to the columns used downstream.

    Args:
        demographics_df: Customer demographics records.
        claims_df: Claim records.
        policy_df: Policy records.
        scores_df: Customer score records.
        aiml_insights_df: Customer AI/ML insight records.

    Returns:
        Tuple ``(demographics, claims, policy, scores, aiml_insights)`` of the
        narrowed DataFrames, in that order.

    Raises:
        Exception: Any failure while selecting is logged and re-raised.
    """
    # Column lists are kept together so the expected schema is visible at a glance.
    demographics_cols = (
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )
    claims_cols = (
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type",
        "Claim_Status", "Claim_Amount", "Claim_Payout",
    )
    policy_cols = (
        "policy_id", "customer_id", "policy_type", "policy_status",
        "policy_start_date", "policy_end_date", "policy_term",
        "policy_premium", "total_premium_paid", "renewal_status", "policy_addons",
    )
    scores_cols = (
        "Customer_ID", "Credit_Score", "Fraud_Score", "Customer_Risk_Score",
    )
    insight_cols = (
        "Customer_ID", "Churn_Probability", "Next_Best_Offer",
        "Claims_Fraud_Probability", "Revenue_Potential",
    )

    try:
        logger.info("Selecting relevant fields from each dataset")
        narrowed = (
            demographics_df.select(*demographics_cols),
            claims_df.select(*claims_cols),
            policy_df.select(*policy_cols),
            scores_df.select(*scores_cols),
            aiml_insights_df.select(*insight_cols),
        )
        return narrowed
    except Exception as e:
        logger.error(f"Error selecting fields: {e}")
        raise

# Narrow every source DataFrame to the columns the later steps read.
(
    demographics_selected,
    claims_selected,
    policy_selected,
    scores_selected,
    aiml_insights_selected,
) = select_fields(
    demographics_df=demographics_df,
    claims_df=claims_df,
    policy_df=policy_df,
    scores_df=scores_df,
    aiml_insights_df=aiml_insights_df,
)

# COMMAND ----------
# %md
# ## Step 3: Data Integration
# Join datasets based on common identifiers.

# COMMAND ----------
#
def integrate_data(demographics_selected, policy_selected, claims_selected):
    """Join demographics, policies and claims into one row per claim.

    Demographics are inner-joined to policies on the customer id, and the
    result is inner-joined to claims on the policy id. Customers without a
    policy and policies without a claim are therefore dropped.

    Args:
        demographics_selected: Demographics columns, keyed by ``Customer_ID``.
        policy_selected: Policy columns, carrying ``customer_id`` and
            ``policy_id``.
        claims_selected: Claim columns, carrying ``Policy_ID``.

    Returns:
        The joined DataFrame with a single ``Customer_ID`` column and a
        single ``policy_id`` column.

    Raises:
        Exception: Any failure while joining is logged and re-raised.
    """
    try:
        logger.info("Joining datasets based on common identifiers")
        # The key columns differ only by letter case between the inputs.
        # Spark resolves names case-insensitively by default, so unqualified
        # references are ambiguous: each key is bound to its own DataFrame
        # instead, and the redundant right-hand key is dropped so later
        # references by name stay unambiguous.
        demographics_policy_joined = demographics_selected.join(
            policy_selected,
            demographics_selected["Customer_ID"] == policy_selected["customer_id"],
            "inner",
        ).drop(policy_selected["customer_id"])
        all_data_joined = demographics_policy_joined.join(
            claims_selected,
            demographics_policy_joined["policy_id"] == claims_selected["Policy_ID"],
            "inner",
        ).drop(claims_selected["Policy_ID"])
        return all_data_joined
    except Exception as e:
        logger.error(f"Error joining datasets: {e}")
        raise

# Build the claim-level view that the aggregation step consumes.
all_data_joined = integrate_data(
    demographics_selected=demographics_selected,
    policy_selected=policy_selected,
    claims_selected=claims_selected,
)

# COMMAND ----------
# %md
# ## Step 4: Data Aggregation
# Aggregate data to compute key metrics.

# COMMAND ----------
#
def aggregate_data(all_data_joined):
    """Roll the joined claim-level rows up to one row per customer.

    Args:
        all_data_joined: Joined demographics/policy/claim rows. It must carry
            ``Customer_ID``, ``policy_id``, ``Claim_ID``, ``Claim_Date``,
            ``Claim_Amount``, ``Date_of_Birth`` and ``total_premium_paid``.

    Returns:
        DataFrame keyed by ``Customer_ID`` with ``Total_Claims``,
        ``Policy_Count``, ``Recent_Claim_Date``, ``Average_Claim_Amount``,
        plus the inputs the custom-calculation step reads:
        ``Claim_Amount`` (sum of the customer's claim amounts),
        ``Date_of_Birth`` and ``total_premium_paid`` (summed once per policy).

    Raises:
        Exception: Any failure while aggregating is logged and re-raised.
    """
    try:
        logger.info("Aggregating data to compute key metrics")
        claim_metrics = all_data_joined.groupBy("Customer_ID").agg(
            F.count("Claim_ID").alias("Total_Claims"),
            # A policy appears once per claim in the joined rows, so a plain
            # count would always equal Total_Claims; count distinct policies.
            F.countDistinct("policy_id").alias("Policy_Count"),
            F.max("Claim_Date").alias("Recent_Claim_Date"),
            F.avg("Claim_Amount").alias("Average_Claim_Amount"),
            # custom_calculations reads Claim_Amount and Date_of_Birth, which
            # a groupBy would otherwise discard.
            # NOTE(review): customer-level Claim_Amount is taken to be the
            # total claimed — confirm against the intended ratio definition.
            F.sum("Claim_Amount").alias("Claim_Amount"),
            # presumably constant per customer; verify demographics has one
            # row per Customer_ID.
            F.first("Date_of_Birth", ignorenulls=True).alias("Date_of_Birth"),
        )
        # Premiums are de-duplicated to one row per policy before summing so a
        # policy with several claims is not counted several times.
        # NOTE(review): assumes total_premium_paid is a per-policy figure.
        premium_totals = (
            all_data_joined.select("Customer_ID", "policy_id", "total_premium_paid")
            .dropDuplicates(["Customer_ID", "policy_id"])
            .groupBy("Customer_ID")
            .agg(F.sum("total_premium_paid").alias("total_premium_paid"))
        )
        aggregated_data = claim_metrics.join(premium_totals, "Customer_ID", "inner")
        return aggregated_data
    except Exception as e:
        logger.error(f"Error aggregating data: {e}")
        raise

# Collapse the claim-level rows to one row of metrics per customer.
aggregated_data = aggregate_data(all_data_joined=all_data_joined)

# COMMAND ----------
# %md
# ## Step 5: Custom Calculations
# Implement custom calculations for additional metrics.

# COMMAND ----------
#
def custom_calculations(aggregated_data):
    """Add derived and placeholder metric columns to the per-customer data.

    Args:
        aggregated_data: Per-customer DataFrame. The expressions below read
            ``Date_of_Birth``, ``Claim_Amount``, ``total_premium_paid``,
            ``Total_Claims`` and ``Policy_Count`` from it.

    Returns:
        The input DataFrame with these columns added: ``Age`` (completed
        years), ``Claim_To_Premium_Ratio``, ``Claims_Per_Policy``,
        ``Retention_Rate``, ``Cross_Sell_Opportunities`` and
        ``Upsell_Potential``.

    Raises:
        Exception: Any failure while building the columns is logged and
            re-raised.
    """
    try:
        logger.info("Implementing custom calculations for additional metrics")
        # Whole years completed as of today. Dividing a day count by 365
        # ignores leap years and produces a fractional value that can tick
        # over before the actual birthday.
        age_expr = F.floor(
            F.months_between(current_date(), F.col("Date_of_Birth")) / 12
        ).cast("int")
        # A zero denominator yields NULL instead of a divide-by-zero failure
        # when ANSI SQL mode is enabled.
        claim_to_premium_ratio_expr = F.when(
            F.col("total_premium_paid") != 0,
            F.col("Claim_Amount") / F.col("total_premium_paid"),
        )
        claims_per_policy_expr = F.when(
            F.col("Policy_Count") != 0,
            F.col("Total_Claims") / F.col("Policy_Count"),
        )
        # The remaining three columns are constants: every customer receives
        # the same literal value.
        retention_rate_expr = expr("0.85")
        cross_sell_opportunities_expr = expr("'Multi-Policy Discount, Home Coverage Add-on'")
        upsell_potential_expr = expr("'Premium Vehicle Coverage'")

        calculated_data = (
            aggregated_data.withColumn("Age", age_expr)
            .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio_expr)
            .withColumn("Claims_Per_Policy", claims_per_policy_expr)
            .withColumn("Retention_Rate", retention_rate_expr)
            .withColumn("Cross_Sell_Opportunities", cross_sell_opportunities_expr)
            .withColumn("Upsell_Potential", upsell_potential_expr)
        )
        return calculated_data
    except Exception as e:
        logger.error(f"Error in custom calculations: {e}")
        raise

# Enrich the per-customer metrics with the derived columns.
calculated_data = custom_calculations(aggregated_data=aggregated_data)

# COMMAND ----------
# %md
# ## Step 6: Comprehensive Data Consolidation
# Combine all processed data into a single customer profile.

# COMMAND ----------
#
def consolidate_data(calculated_data, aiml_insights_selected, scores_selected):
    """Attach AI/ML insights and scores to the calculated customer metrics.

    Both joins are inner joins on ``Customer_ID``, so only customers present
    in all three inputs appear in the result.

    Args:
        calculated_data: Per-customer metrics.
        aiml_insights_selected: Per-customer AI/ML insight columns.
        scores_selected: Per-customer score columns.

    Returns:
        The combined customer profile DataFrame.

    Raises:
        Exception: Any failure while joining is logged and re-raised.
    """
    try:
        logger.info("Combining all processed data into a single customer profile")
        with_insights = calculated_data.join(
            aiml_insights_selected, on="Customer_ID", how="inner"
        )
        profile = with_insights.join(scores_selected, on="Customer_ID", how="inner")
        return profile
    except Exception as e:
        logger.error(f"Error consolidating data: {e}")
        raise

# Assemble the final per-customer profile.
customer_360 = consolidate_data(
    calculated_data=calculated_data,
    aiml_insights_selected=aiml_insights_selected,
    scores_selected=scores_selected,
)

# COMMAND ----------
# %md
# ## Step 7: Output Data Source
# Write the comprehensive customer profile to a Unity Catalog table.

# COMMAND ----------
#
def write_to_catalog(customer_360, table_name="catalog.target_db.customer_360"):
    """Persist the customer profile as a Delta table, replacing prior contents.

    Args:
        customer_360: The consolidated customer profile DataFrame.
        table_name: Fully qualified target table. Defaults to the table the
            notebook originally hard-coded.

    Raises:
        Exception: Any failure while writing is logged and re-raised.
    """
    try:
        logger.info("Writing the comprehensive customer profile to Unity Catalog table")
        # The table is overwritten in place rather than dropped first: a drop
        # discards the Delta history and table grants, and would leave no
        # table at all if the subsequent write failed. "overwriteSchema" lets
        # the overwrite also replace a schema that has changed.
        (
            customer_360.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
    except Exception as e:
        logger.error(f"Error writing data to Unity Catalog: {e}")
        raise

# Persist the profile, then record that the whole run finished.
write_to_catalog(customer_360=customer_360)

logger.info("ETL process completed successfully")
