In [None]:
import logging
from pyspark.sql import functions as F

# Logging setup: obtain this module's logger and request INFO-level root logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Step 1: Source Extraction
try:
    # Read each Unity Catalog source table into its own DataFrame
    pes_prep_df = spark.read.table("genai_demo.jnj.pes_prep")
    c19_ivl_data_df = spark.read.table("genai_demo.jnj.c19_ivl_data")
    c04_ekpo_df = spark.read.table("genai_demo.jnj.c04_ekpo")
    c04_bseg_df = spark.read.table("genai_demo.jnj.c04_bseg")
    pjotr_df = spark.read.table("genai_demo.jnj.pjotr_")
    pjotr_in_pes_df = spark.read.table("genai_demo.jnj.pjotr_in_pes")

    # Emit one INFO line per source, in load order, with its row count and schema
    for source_label, source_df in (
        ("PES_prep", pes_prep_df),
        ("C19_ivl_data", c19_ivl_data_df),
        ("C04_EKPO", c04_ekpo_df),
        ("C04_BSEG", c04_bseg_df),
        ("PJOTR_", pjotr_df),
        ("PJOTR_in_PES", pjotr_in_pes_df),
    ):
        logger.info(f"{source_label}: {source_df.count()} records, Schema: {source_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Source Extraction: {str(exc)}")
    raise

# Step 2: Transformation - Multi-Field Formula
try:
    # PES_prep: add a whitespace-trimmed copy of field_name, then discard irrelevant_column
    trimmed_value = F.trim(F.col("field_name"))
    pes_prep_with_trim = pes_prep_df.withColumn("trimmed_field", trimmed_value)
    pes_prep_df = pes_prep_with_trim.drop("irrelevant_column")

    # C19_ivl_data: copy field_name into cleaned_field only when it differs from
    # "unwanted_value"; when() has no otherwise() branch here
    is_wanted = F.col("field_name") != "unwanted_value"
    cleaned_value = F.when(is_wanted, F.col("field_name"))
    c19_ivl_data_df = c19_ivl_data_df.withColumn("cleaned_field", cleaned_value)

    # Report row count and schema of both transformed streams
    logger.info(f"PES_prep after transformation: {pes_prep_df.count()} records, Schema: {pes_prep_df.schema}")
    logger.info(f"C19_ivl_data after transformation: {c19_ivl_data_df.count()} records, Schema: {c19_ivl_data_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Multi-Field Formula Transformation: {str(exc)}")
    raise

# Step 3: Transformation - Select
try:
    # Narrow each stream to two named columns
    pes_prep_columns = ["field1", "field2"]
    c19_ivl_data_columns = ["field3", "field4"]
    pes_prep_selected_df = pes_prep_df.select(*pes_prep_columns)
    c19_ivl_data_selected_df = c19_ivl_data_df.select(*c19_ivl_data_columns)

    # Report row count and schema of both projections
    logger.info(f"PES_prep after selection: {pes_prep_selected_df.count()} records, Schema: {pes_prep_selected_df.schema}")
    logger.info(f"C19_ivl_data after selection: {c19_ivl_data_selected_df.count()} records, Schema: {c19_ivl_data_selected_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Select Transformation: {str(exc)}")
    raise

# Step 4: Transformation - Join
try:
    # Inner-join the two projections where PES_prep.field1 equals C19_ivl_data.field3
    right_key = c19_ivl_data_selected_df["field3"]
    join_condition = pes_prep_selected_df["field1"] == right_key
    joined_df = pes_prep_selected_df.join(
        c19_ivl_data_selected_df,
        on=join_condition,
        how="inner",
    )

    # The right-hand key repeats field1's value on every joined row, so remove it
    joined_df = joined_df.drop(right_key)

    # Report row count and schema of the join result
    logger.info(f"Joined DataFrame: {joined_df.count()} records, Schema: {joined_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Join Transformation: {str(exc)}")
    raise

# Step 5: Transformation - Formula
try:
    # Derive new_field as field1 + field2
    field_sum = F.col("field1") + F.col("field2")
    formula_df = joined_df.withColumn("new_field", field_sum)

    # Report row count and schema after adding the derived column
    logger.info(f"DataFrame after formula application: {formula_df.count()} records, Schema: {formula_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Formula Transformation: {str(exc)}")
    raise

# Step 6: Transformation - Union
try:
    # Append the raw PJOTR_ rows to the computed stream.
    # DataFrame.union() pairs columns purely by position, so if pjotr_df lists
    # the same columns in a different order the values would be silently
    # misaligned. unionByName() pairs columns by name instead and raises when
    # the two column sets do not match, so a schema mismatch fails loudly here.
    # NOTE(review): this expects pjotr_ to expose the same column names as
    # formula_df (field1, field2, field4, new_field) - confirm against the table.
    union_df = formula_df.unionByName(pjotr_df)

    # Log record count and schema
    logger.info(f"DataFrame after union: {union_df.count()} records, Schema: {union_df.schema}")
except Exception as e:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Union Transformation: {str(e)}")
    raise

# Step 7: Transformation - Filter
try:
    # Keep only rows whose new_field is strictly greater than 100
    exceeds_threshold = F.col("new_field") > 100
    filtered_df = union_df.where(exceeds_threshold)

    # Report row count and schema of the surviving rows
    logger.info(f"DataFrame after filtering: {filtered_df.count()} records, Schema: {filtered_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Filter Transformation: {str(exc)}")
    raise

# Step 8: Transformation - Summarize
try:
    # Total new_field per distinct field1 value, exposed as sum_new_field
    grouped_by_field1 = filtered_df.groupBy("field1")
    new_field_total = F.sum("new_field").alias("sum_new_field")
    summary_df = grouped_by_field1.agg(new_field_total)

    # Report row count and schema of the aggregated result
    logger.info(f"DataFrame after summarization: {summary_df.count()} records, Schema: {summary_df.schema}")
except Exception as exc:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Summarize Transformation: {str(exc)}")
    raise

# Step 9: Output Loading
try:
    # Target location; the fully qualified name is built once and reused below
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "target_table"
    target_full_name = f"{target_catalog}.{target_schema}.{target_table}"

    # Ensure schema exists before creating table
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Write to Unity Catalog target table, replacing any existing contents
    summary_df.write.format("delta").mode("overwrite").saveAsTable(target_full_name)

    # Count the rows of the table that was just written instead of calling
    # summary_df.count(): that would re-run the whole upstream pipeline a second
    # time only to produce a log line, and could report a different number than
    # what was actually loaded if the sources changed in between.
    loaded_count = spark.table(target_full_name).count()
    logger.info(f"Data loaded to {target_full_name}: {loaded_count} records, Schema: {summary_df.schema}")
except Exception as e:
    # Record which stage failed, then let the original exception propagate
    logger.error(f"Error in Output Loading: {str(e)}")
    raise
