In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Build the genai_demo.guardian.customer_360 Delta table from the five
# genai_demo.jnj source tables: demographics x policy, enriched with
# per-customer claim aggregates, derived metrics, AI/ML insights and scores.
#
# `spark` is not defined in this cell; presumably it is the SparkSession
# injected by the notebook runtime -- confirm when running elsewhere.
try:
    # Load data from Unity Catalog tables
    policy_df = spark.table("genai_demo.jnj.policy")
    claims_df = spark.table("genai_demo.jnj.claims")
    demographics_df = spark.table("genai_demo.jnj.demographics")
    scores_df = spark.table("genai_demo.jnj.scores")
    aiml_insights_df = spark.table("genai_demo.jnj.aiml_insights")

    # Select relevant fields from each DataFrame
    demographics_selected_df = demographics_df.select(
        "Customer_ID", "Customer_Name", "Email", "Phone_Number", "Address", "City", "State", "Postal_Code",
        "Date_of_Birth", "Gender", "Marital_Status", "Occupation", "Income_Level", "Customer_Segment"
    )

    claims_selected_df = claims_df.select(
        "Claim_ID", "Policy_ID", "Claim_Date", "Claim_Type", "Claim_Status", "Claim_Amount", "Claim_Payout"
    )

    policy_selected_df = policy_df.select(
        "Policy_ID", "Customer_ID", "Policy_Type", "Policy_Status", "Policy_Start_Date", "Policy_End_Date",
        "Policy_Term", "Policy_Premium", "Total_Premium_Paid", "Renewal_Status", "Policy_Addons"
    )

    scores_selected_df = scores_df.select(
        "Customer_ID", "Credit_Score", "Fraud_Score", "Customer_Risk_Score"
    )

    aiml_insights_selected_df = aiml_insights_df.select(
        "Customer_ID", "Churn_Probability", "Next_Best_Offer", "Claims_Fraud_Probability", "Revenue_Potential"
    )

    # Join demographics and policy data on Customer_ID
    # (yields one row per customer/policy pair).
    demographics_policy_joined_df = demographics_selected_df.join(
        policy_selected_df, "Customer_ID", "inner"
    )

    # Join claims and policy data on Policy_ID
    # (yields one row per claim, carrying the owning Customer_ID).
    claims_policy_joined_df = claims_selected_df.join(
        policy_selected_df, "Policy_ID", "inner"
    )

    # Summarize claims per customer.
    # Policy_Count must be a DISTINCT count: the joined frame has one row per
    # claim, so a plain count of Policy_ID would just repeat Total_Claims and
    # force Claims_Per_Policy to 1. Because the source is the claims join,
    # this counts only policies that have at least one claim.
    summarized_df = claims_policy_joined_df.groupBy("Customer_ID").agg(
        F.count("Claim_ID").alias("Total_Claims"),
        F.countDistinct("Policy_ID").alias("Policy_Count"),
        F.max("Claim_Date").alias("Recent_Claim_Date"),
        F.avg("Claim_Amount").alias("Average_Claim_Amount"),
        F.sum("Claim_Amount").alias("Total_Claim_Amount")
    )

    # Join summarized data with demographics and policy data.
    # NOTE(review): the inner join drops customers that have no claims;
    # confirm that is intended for a customer-360 view.
    combined_df = demographics_policy_joined_df.join(
        summarized_df, "Customer_ID", "inner"
    )

    # Custom Calculations
    # - Age is a fractional number of 365-day years (leap days are ignored).
    # - The two ratios fall back to 0 when the denominator is 0 or NULL
    #   (a NULL comparison is not true, so `otherwise` applies).
    # - NOTE(review): Total_Claim_Amount is aggregated per customer while
    #   Total_Premium_Paid comes from a single policy row, so the ratio mixes
    #   granularities for customers with several policies -- confirm intent.
    # - Retention_Rate, Cross_Sell_Opportunities and Upsell_Potential are
    #   constant placeholder values applied to every row.
    combined_df = combined_df.withColumn(
        "Age", F.datediff(F.current_date(), F.to_date("Date_of_Birth")) / 365
    ).withColumn(
        "Claim_To_Premium_Ratio",
        F.when(
            F.col("Total_Premium_Paid") != 0,
            F.col("Total_Claim_Amount") / F.col("Total_Premium_Paid")
        ).otherwise(0)
    ).withColumn(
        "Claims_Per_Policy",
        F.when(
            F.col("Policy_Count") != 0,
            F.col("Total_Claims") / F.col("Policy_Count")
        ).otherwise(0)
    ).withColumn(
        "Retention_Rate", F.lit(0.85)
    ).withColumn(
        "Cross_Sell_Opportunities", F.lit("Multi-Policy Discount, Home Coverage Add-on")
    ).withColumn(
        "Upsell_Potential", F.lit("Premium Vehicle Coverage")
    )

    # Join with AI/ML insights and scores data
    final_df = combined_df.join(
        aiml_insights_selected_df, "Customer_ID", "inner"
    ).join(
        scores_selected_df, "Customer_ID", "inner"
    )

    # Keep a single ROW per customer (dropDuplicates removes rows, not columns).
    # NOTE(review): no ordering is applied first, so which policy row survives
    # for a multi-policy customer is not controlled here -- verify this is
    # acceptable for the policy-level columns.
    final_df = final_df.dropDuplicates(["Customer_ID"])

    # Publish the derived age under its target column name
    final_df = final_df.withColumnRenamed("Age", "Customer_Age")

    # Define target schema and table
    target_catalog = "genai_demo"
    target_schema = "guardian"
    target_table = "customer_360"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Enable schema auto-merging for Delta table (session-level setting)
    spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

    # Write final DataFrame to Unity Catalog table with schema merge option
    final_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable(f"{target_catalog}.{target_schema}.{target_table}")
    logger.info(f"Data written to {target_catalog}.{target_schema}.{target_table}")

except Exception as e:
    # Top-level boundary: log with the full traceback, then re-raise so the
    # notebook/job run is still marked as failed.
    logger.exception("An error occurred: %s", e)
    raise
