In [None]:
import logging
from pyspark.sql import functions as F

# Module-scoped logger for this script; basicConfig requests INFO level on the root logger
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

try:
    # Pipeline overview: load PES_prep and C19_ivl_data from Unity Catalog, trim
    # the PES code columns, join the two inputs on "PES ID", then derive / union /
    # filter / summarize and overwrite the Delta table genai_demo.jnj.c03_pjotr.
    # NOTE: every .count() below is a Spark action executed only to produce a
    # log line.

    # Load PES_prep.yxdb from Unity Catalog
    pes_prep_df = spark.table("genai_demo.jnj.pes_prep")
    logger.info("PES_prep.yxdb loaded with %s records", pes_prep_df.count())
    pes_prep_df.printSchema()

    # Apply multi-field formula transformation: strip surrounding whitespace
    # from each code column. The list is shared with the select step below so
    # the two cannot drift apart.
    pes_code_columns = [
        "Business unit code",
        "FMRC code",
        "FSID code",
        "LE code",
        "MRC code",
        "Plant code",
        "PO business unit code",
        "PO LE code",
        "PO MRC code",
        "PO site code",
        "Site code",
        "Vision sourced data",
    ]
    pes_prep_transformed_df = pes_prep_df
    for column_name in pes_code_columns:
        pes_prep_transformed_df = pes_prep_transformed_df.withColumn(
            column_name, F.trim(F.col(column_name))
        )
    logger.info(
        "Multi-field formula transformation applied with %s records",
        pes_prep_transformed_df.count(),
    )

    # Apply select transformation.
    # "PES ID" is kept in addition to the trimmed columns because the join
    # below keys on it; without it the join cannot resolve its key on this side.
    pes_prep_selected_df = pes_prep_transformed_df.select("PES ID", *pes_code_columns)
    pes_prep_selected_df.printSchema()

    # Load C19_ivl_data.yxdb from Unity Catalog
    c19_ivl_data_df = spark.table("genai_demo.jnj.c19_ivl_data")
    logger.info("C19_ivl_data.yxdb loaded with %s records", c19_ivl_data_df.count())
    c19_ivl_data_df.printSchema()

    # Apply select transformation on C19_ivl_data.yxdb
    c19_ivl_selected_df = c19_ivl_data_df.select(
        "PES ID", "Invoice source system", "IVL business unit code"
    )
    c19_ivl_selected_df.printSchema()

    # Join PES_prep.yxdb with C19_ivl_data.yxdb on PES ID.
    # Joining on a column *name* already yields a single "PES ID" column in the
    # result, so no follow-up drop of the right-hand key is needed.
    joined_df = pes_prep_selected_df.join(c19_ivl_selected_df, on="PES ID", how="inner")
    logger.info("Join operation completed with %s records", joined_df.count())
    joined_df.printSchema()

    # Apply formula transformation (custom calculations)
    # NOTE(review): "some_custom_calculation" is a placeholder expression; Spark
    # will treat it as a column reference. Replace it with the real formula
    # from the source workflow before running.
    formula_df = joined_df.withColumn("New Field", F.expr("some_custom_calculation"))
    logger.info("Formula transformation applied with %s records", formula_df.count())
    formula_df.printSchema()

    # Apply union transformation
    # NOTE(review): the second union input is not specified, so the DataFrame is
    # unioned with itself. This duplicates every row and therefore doubles the
    # sums computed below. Replace the right-hand side with the real second
    # input (or remove the union) once it is known.
    union_df = formula_df.union(formula_df)
    logger.info("Union transformation applied with %s records", union_df.count())
    union_df.printSchema()

    # Apply filter transformation
    # NOTE(review): "POrg", "PJOTR", "Spend" and "Records" are not among the
    # columns kept by either select above. Confirm which input provides them and
    # add them to the corresponding select.
    filtered_df = union_df.filter(F.col("POrg") == 'P001')
    logger.info("Filter transformation applied with %s records", filtered_df.count())

    # Apply summarize transformation: total Spend and Records per PJOTR
    summarized_df = filtered_df.groupBy("PJOTR").agg(
        F.sum("Spend").alias("sum_Spend"),
        F.sum("Records").alias("sum_Records"),
    )
    logger.info("Summarize transformation applied with %s records", summarized_df.count())
    summarized_df.printSchema()

    # Output results to C03_pjotr.yxdb
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "c03_pjotr"

    # Ensure schema exists before creating table
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info("Schema %s.%s ensured", target_catalog, target_schema)

    # Write to Unity Catalog target table (overwrite mode handles table replacement)
    summarized_df.write.format("delta").mode("overwrite").saveAsTable(
        f"{target_catalog}.{target_schema}.{target_table}"
    )
    logger.info("Data written to %s.%s.%s", target_catalog, target_schema, target_table)

except Exception as e:
    # Log with the full traceback, then re-raise so the notebook cell / job is
    # marked as failed instead of silently finishing without writing output.
    logger.exception("An error occurred: %s", e)
    raise
