In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Module-level logger for this notebook; root logging is configured at INFO
# so the pipeline's progress messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Customer 360 pipeline: load the source tables, derive per-customer claim
# metrics, join them onto the demographics/policy rows, add the calculated
# fields, and overwrite the Delta target table. Any failure is logged with
# its traceback and re-raised so the run is reported as failed.
try:
    # Step 1: Load Demographics Data
    demographics_df = spark.table("genai_demo.jnj.demographics")
    logger.info("Demographics Data Loaded: %s records", demographics_df.count())

    # Step 2: Load Claims Data
    claims_df = spark.table("genai_demo.jnj.claims")
    logger.info("Claims Data Loaded: %s records", claims_df.count())

    # Step 3: Load Policy Data
    policy_df = spark.table("genai_demo.jnj.policy")
    logger.info("Policy Data Loaded: %s records", policy_df.count())

    # Step 4: Load Scores Data
    # NOTE(review): scores_df is loaded and counted but not used further in
    # this cell -- confirm whether a later cell depends on it.
    scores_df = spark.table("genai_demo.jnj.scores")
    logger.info("Scores Data Loaded: %s records", scores_df.count())

    # Step 5: Load AI/ML Insights Data
    # NOTE(review): aiml_insights_df is likewise unused below -- confirm.
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")
    logger.info("AI/ML Insights Data Loaded: %s records", aiml_insights_df.count())

    # Step 6: Select Tool (Demographics)
    # Age in completed years. months_between / 12 follows calendar birthdays,
    # whereas dividing a day count by 365.25 under-reports the age on the
    # birthday itself in most years (365 / 365.25 < 1).
    age_in_years = F.floor(
        F.months_between(F.current_date(), F.col("Date_of_Birth")) / 12
    ).cast(IntegerType())
    demographics_selected_df = demographics_df.withColumn(
        "Age", age_in_years
    ).select("Customer_ID", "Customer_Name", "Age", "Gender")
    logger.info("Selected fields from Demographics with calculated Age")

    # Step 7: Select Tool (Claims)
    claims_selected_df = claims_df.select("Claim_ID", "Policy_ID", "Claim_Date", "Claim_Amount")
    logger.info("Selected fields from Claims")

    # Step 8: Select Tool (Policy)
    policy_selected_df = policy_df.select("Policy_ID", "Customer_ID", "Policy_Premium", "Policy_Type")
    logger.info("Selected fields from Policy")

    # Step 9: Join Demographics and Policy Data
    # Joining on the column *name* already yields a single Customer_ID column,
    # so no follow-up drop is needed (and the key must survive for Step 12).
    demographics_policy_joined_df = demographics_selected_df.join(
        policy_selected_df, on="Customer_ID", how="inner"
    )
    logger.info(
        "Demographics and Policy Data Joined: %s records",
        demographics_policy_joined_df.count(),
    )

    # Step 10: Join Claims and Policy Data
    # Retain Policy_ID for aggregation
    claims_policy_joined_df = claims_selected_df.join(
        policy_selected_df, on="Policy_ID", how="inner"
    )
    logger.info(
        "Claims and Policy Data Joined: %s records",
        claims_policy_joined_df.count(),
    )

    # Step 11: Summarize Tool
    # Rows here are claim-level, so policies must be counted distinctly;
    # a plain count would count claim rows instead of policies and pin
    # Claims_Per_Policy at 1. Because Step 10 is an inner join, only
    # policies that have at least one claim are represented.
    summarized_df = claims_policy_joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Total_Policies"),
        F.max("Claim_Date").alias("Latest_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
    )
    logger.info("Data Summarized")

    # Step 12: Join Summarized Data and Combined Data
    # Inner join: customers without any summarized claims are not carried
    # into the output.
    combined_df = demographics_policy_joined_df.join(
        summarized_df, on="Customer_ID", how="inner"
    )
    logger.info("Summarized and Combined Data Joined: %s records", combined_df.count())

    # Step 13: Formula Tool
    # Guard the premium division: a zero premium yields NULL rather than a
    # divide-by-zero error when ANSI mode is enabled (NULL is also what
    # non-ANSI division returns).
    claim_to_premium_ratio = F.when(
        F.col("Policy_Premium") != 0,
        F.col("Average_Claim_Amount") / F.col("Policy_Premium"),
    )
    final_df = (
        combined_df
        .withColumn("Claim_To_Premium_Ratio", claim_to_premium_ratio)
        .withColumn("Claims_Per_Policy", F.col("Total_Claims") / F.col("Total_Policies"))
        # The three fields below are constant placeholders, not computed values.
        .withColumn("Retention_Rate", F.lit(0.85))
        .withColumn("Cross_Sell_Opportunities", F.lit("High"))
        .withColumn("Upsell_Potential", F.lit("Medium"))
    )
    logger.info("Calculated new fields")

    # Step 14: Output Customer 360 Data
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "customer_360"

    # Ensure schema exists before creating table
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info("Schema %s.%s ensured", target_catalog, target_schema)

    # Write to Unity Catalog target table (overwrite mode handles table replacement)
    final_df.write.format("delta").mode("overwrite").saveAsTable(
        f"{target_catalog}.{target_schema}.{target_table}"
    )
    logger.info("Customer 360 Data Written Successfully")

except Exception as e:
    # logger.exception records the traceback; re-raise so the notebook/job
    # run is marked failed instead of finishing "successfully" with no output.
    logger.exception("An error occurred: %s", e)
    raise
