In [None]:
import logging
from pyspark.sql import functions as F

# Module-level logger for this script; the root logger is configured via
# basicConfig to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def load_table(table_name):
    """Read a table through ``spark.table`` and log its row count and schema.

    Args:
        table_name: Table identifier passed unchanged to ``spark.table``.

    Returns:
        The DataFrame produced by ``spark.table(table_name)``.

    Raises:
        Exception: Any error raised while reading, counting or inspecting the
            table is logged at ERROR level and then re-raised unchanged.
    """
    # NOTE(review): `spark` is not defined in this file; presumably it is the
    # session object injected by the notebook runtime -- confirm.
    try:
        table_df = spark.table(table_name)
        record_count = table_df.count()
        logger.info(f"Loaded {table_name} with {record_count} records")
        table_schema = table_df.schema
        logger.info(f"{table_name} schema: {table_schema}")
    except Exception as err:
        logger.error(f"Failed to load table {table_name}: {err!s}")
        raise
    return table_df

# Customer 360 pipeline: load the source tables, join demographics, policy and
# claims data, derive per-customer metrics and write the result as CSV.
# NOTE: every `.count()` used in the log messages below is a Spark action and
# runs a job of its own purely for logging.
try:
    # Node 1: Load Demographics Data
    demographics_df = load_table("genai_demo.jnj.demographics")

    # Node 2: Load Claims Data
    claims_df = load_table("genai_demo.jnj.claims")

    # Node 3: Load Policy Data
    policy_df = load_table("genai_demo.jnj.policy")

    # Node 4: Load Scores Data
    # (not referenced again in this block; kept so a missing table still fails
    # the run and the name stays available to any later code)
    scores_df = load_table("genai_demo.jnj.scores")

    # Node 5: Load AI/ML Insights Data
    # (not referenced again in this block; kept for the same reason)
    aiml_insights_df = load_table("genai_demo.jnj.aiml_insights")

    # Node 6: Select Fields from Demographics
    demographics_selected_df = demographics_df.select("Customer_ID", "Gender", "Email")
    logger.info("Selected relevant fields from demographics")

    # Node 7: Select Fields from Claims
    claims_selected_df = claims_df.select("Claim_ID", "Policy_ID", "Claim_Amount", "Claim_Date")
    logger.info("Selected relevant fields from claims")

    # Node 8: Select Fields from Policy
    policy_selected_df = policy_df.select("Policy_ID", "Customer_ID", "Policy_Premium", "Policy_Type")
    logger.info("Selected relevant fields from policy")

    # Node 9: Join Demographics and Policy Data (inner join on Customer_ID)
    demographics_policy_df = demographics_selected_df.join(policy_selected_df, on="Customer_ID", how="inner")
    logger.info(f"Joined demographics and policy data with {demographics_policy_df.count()} records")

    # Node 10: Join Claims and Policy Data (inner join on Policy_ID)
    claims_policy_df = claims_selected_df.join(policy_selected_df, on="Policy_ID", how="inner")
    logger.info(f"Joined claims and policy data with {claims_policy_df.count()} records")

    # Node 11: Summarize Data -- one row per Customer_ID with claim statistics
    summarized_df = claims_policy_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )
    logger.info(f"Summarized data with {summarized_df.count()} records")

    # Node 12: Join Summarized Data with Combined Data
    # Inner join: customers without a row in summarized_df are dropped.
    combined_df = demographics_policy_df.join(summarized_df, on="Customer_ID", how="inner")
    logger.info(f"Joined summarized data with combined data with {combined_df.count()} records")

    # Node 13: Apply Formula Tool
    # A zero Policy_Premium would make the division raise under ANSI SQL mode
    # (and yield NULL otherwise). Replacing a zero divisor with NULL makes the
    # ratio NULL in both modes instead of failing the whole job.
    # Policy_Count needs no such guard: every group comes from the inner join
    # on Policy_ID, so it contains at least one non-null Policy_ID.
    customer_360_df = (
        combined_df
        .withColumn(
            "Claim_To_Premium_Ratio",
            F.col("Average_Claim_Amount")
            / F.when(F.col("Policy_Premium") != 0, F.col("Policy_Premium")),
        )
        .withColumn("Claims_Per_Policy", F.col("Total_Claims") / F.col("Policy_Count"))
        # Same constant value for every row.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn(
            "Cross_Sell_Opportunities",
            F.when(F.col("Policy_Count") > 1, F.lit("Yes")).otherwise(F.lit("No")),
        )
        .withColumn(
            "Upsell_Potential",
            F.when(F.col("Policy_Premium") < 500, F.lit("High")).otherwise(F.lit("Low")),
        )
    )
    logger.info("Applied formula tool to calculate new fields")

    # Node 14: Save Customer 360 Data
    # Existing output at this path is replaced (mode "overwrite"); a header
    # row is written.
    output_path = "/mnt/output/Customer_360.csv"
    customer_360_df.write.mode("overwrite").option("header", "true").csv(output_path)
    logger.info(f"Saved Customer 360 data to {output_path}")

except Exception as e:
    # Record the failure together with its traceback, then propagate it so the
    # run is still reported as failed.
    logger.exception(f"An error occurred: {str(e)}")
    raise
