In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _load_table(table_name: str, count_label: str, schema_label: str):
    """Load a catalog table and log its record count and schema.

    Args:
        table_name: Fully qualified table name passed to ``spark.table``.
        count_label: Dataset name used in the "Loaded ... data" log line.
        schema_label: Dataset name used in the "... schema" log line.

    Returns:
        The DataFrame returned by ``spark.table(table_name)``.
    """
    df = spark.table(table_name)
    # NOTE: count() is a Spark action, so this log line runs a job on the table.
    logger.info(f"Loaded {count_label} data with {df.count()} records")
    logger.info(f"{schema_label} schema: {df.schema}")
    return df


# Customer 360 pipeline: load the five source tables, join and aggregate them
# per customer, derive a few ratio/flag columns, and overwrite the
# genai_demo.jnj.customer_360 Delta table with the result.
try:
    # Node 1: Load Demographics Data
    demographics_df = _load_table("genai_demo.jnj.demographics", "demographics", "Demographics")

    # Node 2: Load Claims Data
    claims_df = _load_table("genai_demo.jnj.claims", "claims", "Claims")

    # Node 3: Load Policy Data
    policy_df = _load_table("genai_demo.jnj.policy", "policy", "Policy")

    # Node 4: Load Scores Data
    scores_df = _load_table("genai_demo.jnj.scores", "scores", "Scores")

    # Node 5: Load AI/ML Insights Data
    aiml_insights_df = _load_table("genai_demo.jnj.aiml_insights", "AI/ML insights", "AI/ML Insights")

    # Node 6: Select Fields from Demographics
    demographics_selected_df = demographics_df.select("Customer_ID", "Email", "Date_of_Birth", "Gender", "City")
    logger.info("Selected fields from demographics data")

    # Node 7: Select Fields from Claims
    claims_selected_df = claims_df.select("Claim_ID", "Policy_ID", "Claim_Amount", "Claim_Date")
    logger.info("Selected fields from claims data")

    # Node 8: Select Fields from Policy
    policy_selected_df = policy_df.select("Policy_ID", "Customer_ID", "Policy_Premium", "Policy_Type")
    logger.info("Selected fields from policy data")

    # Node 9: Join Demographics and Policy Data
    demographics_policy_df = demographics_selected_df.join(policy_selected_df, "Customer_ID", "inner")
    logger.info(f"Joined demographics and policy data with {demographics_policy_df.count()} records")

    # Node 10: Join Claims and Policy Data
    claims_policy_df = claims_selected_df.join(policy_selected_df, "Policy_ID", "inner")
    logger.info(f"Joined claims and policy data with {claims_policy_df.count()} records")

    # Node 11: Summarize Data (one row per Customer_ID that has at least one claim)
    summarized_df = claims_policy_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )
    logger.info("Summarized claims data")

    # Node 12: Join Summarized Data with Combined Data
    combined_df = demographics_policy_df.join(summarized_df, "Customer_ID", "inner")
    logger.info(f"Joined summarized data with combined data, resulting in {combined_df.count()} records")

    # Node 13: Apply Formula Calculations
    premium = F.col("Policy_Premium")
    calculated_df = (
        combined_df
        # Guard the divisor: a zero premium yields NULL (the same result Spark
        # gives with ANSI mode off) instead of raising a divide-by-zero error
        # when ANSI mode is on.
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.when(premium != 0, F.col("Average_Claim_Amount") / premium),
        )
        # Policy_Count is a distinct count over the non-null join key within
        # each group, so it is at least 1 for every summarized customer.
        .withColumn("Claims_Per_Policy", F.col("Total_Claims") / F.col("Policy_Count"))
        # Constant placeholder value applied to every row.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn(
            "Cross_Sell_Opportunities",
            F.when(F.col("Policy_Type") == "Home", 1).otherwise(0).cast(IntegerType()),
        )
        .withColumn(
            "Upsell_Potential",
            F.when(premium < 500, 1).otherwise(0).cast(IntegerType()),
        )
    )
    logger.info("Applied formula calculations")

    # Node 14: Join with Scores Data
    combined_scores_df = calculated_df.join(scores_df, "Customer_ID", "inner")
    logger.info(f"Joined with scores data, resulting in {combined_scores_df.count()} records")

    # Node 15: Join with AI/ML Insights Data
    final_df = combined_scores_df.join(aiml_insights_df, "Customer_ID", "inner")
    logger.info(f"Joined with AI/ML insights data, resulting in {final_df.count()} records")

    # Node 16: Output Customer 360 Data
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "customer_360"

    # Ensure schema exists before creating table
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Write to Unity Catalog target table (overwrite mode handles table replacement)
    # Ensure the column types match the existing Delta table schema.
    # Cross_Sell_Opportunities was already cast in Node 13; this re-cast is a
    # defensive no-op kept so the written type is explicit at the write site.
    final_df = final_df.withColumn("Cross_Sell_Opportunities", F.col("Cross_Sell_Opportunities").cast(IntegerType()))
    final_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.{target_table}")
    logger.info("Customer 360 data written to Unity Catalog")

except Exception as e:
    # Log with the full traceback, then re-raise so the cell/job is marked as
    # failed instead of silently finishing without refreshing the target table.
    logger.exception(f"An error occurred: {str(e)}")
    raise
