# In [None]:
# Databricks notebook source
# COMMAND ----------

# MAGIC %md
# MAGIC # Data Processing and Analysis with PySpark
# MAGIC This notebook performs data processing and analysis using PySpark, including data loading, integration, aggregation, and custom calculations.

# COMMAND ----------

# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, max, avg, datediff, current_date, lit, when

# Configure logging.
# logging.basicConfig() does nothing when the root logger already has handlers
# (documented stdlib behaviour), which would leave the effective level at the
# root's level. The level is therefore also set on this module's logger so
# INFO progress messages are not filtered out at the logger in that case.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Assume the Spark session is pre-initialized as 'spark'

# COMMAND ----------

# MAGIC
# Load data from Unity Catalog tables
def load_data():
    """Read the five source tables through the notebook's ``spark`` session.

    Returns:
        tuple: ``(demographics_df, claims_df, policy_df, scores_df, aiml_df)``,
        one DataFrame per table, in that order.

    Raises:
        Exception: anything raised while resolving a table; the error is
        logged and then re-raised unchanged.
    """
    # Order matters: callers unpack the result positionally.
    source_tables = (
        "genai_demo.jnj.demographics",
        "genai_demo.jnj.claims",
        "genai_demo.jnj.policy",
        "genai_demo.jnj.scores",
        "genai_demo.jnj.aiml_insights",
    )
    try:
        frames = tuple(spark.table(table_name) for table_name in source_tables)
        logger.info("Data loaded successfully from Unity Catalog tables.")
        return frames
    except Exception as e:
        logger.error(f"Error loading data from Unity Catalog tables: {e}")
        raise


demographics_df, claims_df, policy_df, scores_df, aiml_df = load_data()

# COMMAND ----------

# MAGIC
# Step 1: Data Selection and Filtering
def select_and_filter_data(demographics_df, claims_df, policy_df):
    """Project each input DataFrame down to a fixed list of columns.

    Despite the name, no row filter is applied here; only columns are selected.

    Args:
        demographics_df: customer demographics DataFrame.
        claims_df: claims DataFrame.
        policy_df: policy DataFrame.

    Returns:
        tuple: ``(demographics_df, claims_df, policy_df)`` restricted to the
        columns listed below, in the listed order.

    Raises:
        Exception: anything raised by ``select`` (for example a missing
        column); the error is logged and then re-raised unchanged.
    """
    demographic_fields = (
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address",
        "City", "State", "Postal_Code", "Date_of_Birth", "Gender",
        "Marital_Status", "Occupation", "Income_Level", "Customer_Segment",
    )
    claim_fields = (
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type",
        "Claim_Status", "Claim_Amount", "Claim_Payout",
    )
    policy_fields = (
        "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status",
        "Policy_Start_Date", "Policy_End_Date", "Policy_Term", "Policy_Premium",
        "Total_Premium_Paid", "Renewal_Status", "Policy_Addons",
    )
    projections = (
        (demographics_df, demographic_fields),
        (claims_df, claim_fields),
        (policy_df, policy_fields),
    )
    try:
        trimmed = tuple(
            frame.select(*[col(field) for field in fields])
            for frame, fields in projections
        )
        logger.info("Data selection and filtering completed.")
        return trimmed
    except Exception as e:
        logger.error(f"Error during data selection and filtering: {e}")
        raise


demographics_df, claims_df, policy_df = select_and_filter_data(demographics_df, claims_df, policy_df)

# COMMAND ----------

# MAGIC
# Step 2: Data Integration
def integrate_data(demographics_df, policy_df, claims_df):
    """Join demographics, policies and claims into a single DataFrame.

    Both joins are inner equi-joins expressed by column *name*. Joining on an
    expression such as ``a["Customer_ID"] == b["Customer_ID"]`` keeps both
    copies of the key, so later name-based references
    (``groupBy("Customer_ID")``, ``count("Policy_ID")``) fail as ambiguous.
    Joining by name yields exactly one ``Customer_ID`` and one ``Policy_ID``
    column in the result.

    Args:
        demographics_df: DataFrame containing a ``Customer_ID`` column.
        policy_df: DataFrame containing ``Customer_ID`` and ``Policy_ID``.
        claims_df: DataFrame containing a ``Policy_ID`` column.

    Returns:
        DataFrame with one row per matching (customer, policy, claim)
        combination.

    Raises:
        Exception: anything raised while building the joins; the error is
        logged and then re-raised unchanged.
    """
    try:
        # NOTE(review): inner joins drop customers without a policy and
        # policies without a claim - confirm that is intended for this view.
        customer_policies = demographics_df.join(policy_df, on="Customer_ID", how="inner")
        joined_df = customer_policies.join(claims_df, on="Policy_ID", how="inner")
        logger.info("Data integration completed.")
        return joined_df
    except Exception as e:
        logger.error(f"Error during data integration: {e}")
        raise


joined_df = integrate_data(demographics_df, policy_df, claims_df)

# COMMAND ----------

# MAGIC
# Step 3: Data Aggregation
def aggregate_data(joined_df):
    """Summarise claims and policies per customer.

    Groups ``joined_df`` by ``Customer_ID`` and computes:

    * ``Total_Claims`` - number of non-null ``Claim_ID`` values.
    * ``Policy_Count`` - number of *distinct* ``Policy_ID`` values.
    * ``Recent_Claim_Date`` - latest ``Claim_Date``.
    * ``Average_Claim_Amount`` - mean ``Claim_Amount``.

    Args:
        joined_df: DataFrame containing ``Customer_ID``, ``Claim_ID``,
            ``Policy_ID``, ``Claim_Date`` and ``Claim_Amount``.

    Returns:
        DataFrame with one row per ``Customer_ID`` and the four aggregates.

    Raises:
        Exception: anything raised while building the aggregation; the error
        is logged and then re-raised unchanged.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from pyspark.sql.functions import countDistinct

    try:
        summarized_df = joined_df.groupBy("Customer_ID").agg(
            count("Claim_ID").alias("Total_Claims"),
            # A policy appears once per claim row in the joined data, so a
            # plain count() would just repeat Total_Claims; count each policy
            # once instead.
            countDistinct("Policy_ID").alias("Policy_Count"),
            # `max` / `avg` are the pyspark.sql.functions versions imported at
            # the top of the notebook, not the Python builtins.
            max("Claim_Date").alias("Recent_Claim_Date"),
            avg("Claim_Amount").alias("Average_Claim_Amount"),
        )
        logger.info("Data aggregation completed.")
        return summarized_df
    except Exception as e:
        logger.error(f"Error during data aggregation: {e}")
        raise


summarized_df = aggregate_data(joined_df)

# COMMAND ----------

# MAGIC
# Step 4: Custom Calculations
def perform_custom_calculations(summarized_df, joined_df):
    """Attach per-customer aggregates and derived metrics to the joined rows.

    ``summarized_df`` is inner-joined back onto ``joined_df`` by
    ``Customer_ID``, so the result keeps the row granularity of ``joined_df``
    with the customer-level aggregates repeated on each row. The following
    columns are then added:

    * ``Age`` - fractional years between ``Date_of_Birth`` and today.
    * ``Claim_To_Premium_Ratio`` - ``Claim_Amount / Total_Premium_Paid``,
      or 0 when the premium is 0 or null.
    * ``Claims_Per_Policy`` - ``Total_Claims / Policy_Count``, or 0 when the
      policy count is 0 or null.
    * ``Retention_Rate``, ``Cross_Sell_Opportunities``, ``Upsell_Potential`` -
      hard-coded literal values, identical on every row.

    Args:
        summarized_df: per-customer aggregates containing ``Customer_ID``,
            ``Total_Claims`` and ``Policy_Count``.
        joined_df: DataFrame containing ``Customer_ID``, ``Date_of_Birth``,
            ``Claim_Amount`` and ``Total_Premium_Paid``.

    Returns:
        DataFrame with the joined columns plus the derived columns above.

    Raises:
        Exception: anything raised while building the expressions; the error
        is logged and then re-raised unchanged.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from pyspark.sql.functions import months_between

    try:
        # Calendar-based age: dividing a day count by 365 ignores leap days
        # and drifts by roughly one day every four years of age.
        age_calculation = months_between(current_date(), col("Date_of_Birth")) / 12
        # A null or zero denominator makes the condition non-true, so both
        # ratios fall through to 0 instead of dividing by zero.
        claim_to_premium_ratio = when(
            col("Total_Premium_Paid") != 0,
            col("Claim_Amount") / col("Total_Premium_Paid"),
        ).otherwise(0)
        claims_per_policy = when(
            col("Policy_Count") != 0,
            col("Total_Claims") / col("Policy_Count"),
        ).otherwise(0)

        final_df = (
            summarized_df.join(joined_df, "Customer_ID", "inner")
            .withColumn("Age", age_calculation)
            .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio)
            .withColumn("Claims_Per_Policy", claims_per_policy)
            # NOTE(review): the three columns below are constants, not
            # computed from the data - presumably placeholders; confirm.
            .withColumn("Retention_Rate", lit(0.85))
            .withColumn("Cross_Sell_Opportunities", lit("Multi-Policy Discount, Home Coverage Add-on"))
            .withColumn("Upsell_Potential", lit("Premium Vehicle Coverage"))
        )
        logger.info("Custom calculations completed.")
        return final_df
    except Exception as e:
        logger.error(f"Error during custom calculations: {e}")
        raise


final_df = perform_custom_calculations(summarized_df, joined_df)

# COMMAND ----------

# MAGIC
# Step 5: Comprehensive Data Joining
def comprehensive_data_joining(final_df, aiml_df, scores_df):
    """Enrich ``final_df`` with AI/ML insights and scores per customer.

    Both joins are inner equi-joins on the ``Customer_ID`` column *name*.
    Joining on ``a["Customer_ID"] == b["Customer_ID"]`` expressions would keep
    a separate ``Customer_ID`` column from every input, and a DataFrame with
    duplicate column names cannot be saved as a table. Joining by name keeps
    a single ``Customer_ID`` column.

    Args:
        final_df: DataFrame containing a ``Customer_ID`` column.
        aiml_df: AI/ML insights DataFrame containing ``Customer_ID``.
        scores_df: scores DataFrame containing ``Customer_ID``.

    Returns:
        DataFrame with the columns of all three inputs, restricted to
        customers present in every input.

    Raises:
        Exception: anything raised while building the joins; the error is
        logged and then re-raised unchanged.
    """
    try:
        # NOTE(review): any other column name shared between the inputs would
        # still be duplicated - their schemas are not visible here; verify.
        enriched_df = (
            final_df
            .join(aiml_df, on="Customer_ID", how="inner")
            .join(scores_df, on="Customer_ID", how="inner")
        )
        logger.info("Comprehensive data joining completed.")
        return enriched_df
    except Exception as e:
        logger.error(f"Error during comprehensive data joining: {e}")
        raise


final_df = comprehensive_data_joining(final_df, aiml_df, scores_df)

# COMMAND ----------

# MAGIC
# Output Handling
def write_output(final_df):
    """Overwrite the ``genai_demo.jnj.customer_360_view`` Delta table.

    The table is replaced in place with a single overwrite instead of being
    dropped first. Dropping before writing leaves no table at all if the
    write then fails, and it discards the existing table's history.
    ``overwriteSchema`` lets the overwrite also replace the table schema,
    which is what the earlier drop-and-recreate achieved.

    Args:
        final_df: DataFrame to persist.

    Returns:
        None.

    Raises:
        Exception: anything raised by the write; the error is logged and then
        re-raised unchanged.
    """
    try:
        (
            final_df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable("genai_demo.jnj.customer_360_view")
        )
        logger.info("Data written successfully to Unity Catalog target table.")
    except Exception as e:
        logger.error(f"Error writing data to Unity Catalog target table: {e}")
        raise


write_output(final_df)
