# In [None]:
import logging
from pyspark.sql import functions as F
import psycopg2
from pyspark.sql.types import StringType

# Logging: create this module's logger, then configure the root logger at INFO
# (basicConfig is a no-op if the root logger already has handlers).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Step 1: Source Extraction
# Loads the six Unity Catalog source tables, then fetches every row of the
# external `pjotr` and `pjotr_prod` tables through psycopg2 and turns them into
# Spark DataFrames. Any failure is logged and re-raised so the run stops here.
try:
    # Load data from Unity Catalog tables
    pes_prep_df = spark.table("genai_demo.jnj.pes_prep")
    c19_ivl_data_df = spark.table("genai_demo.jnj.c19_ivl_data")
    c04_ekpo_df = spark.table("genai_demo.jnj.c04_ekpo")
    c04_bseg_df = spark.table("genai_demo.jnj.c04_bseg")
    pjotr_df = spark.table("genai_demo.jnj.pjotr_")
    pjotr_in_pes_df = spark.table("genai_demo.jnj.pjotr_in_pes")

    # Connection settings are read from the Databricks secret scope "mysql_scope".
    # NOTE(review): the scope and variable names say MySQL, but psycopg2 is a
    # PostgreSQL driver -- confirm which engine this database actually is.
    mysql_host = dbutils.secrets.get(scope="mysql_scope", key="host")
    mysql_db = dbutils.secrets.get(scope="mysql_scope", key="database")
    mysql_user = dbutils.secrets.get(scope="mysql_scope", key="username")
    mysql_password = dbutils.secrets.get(scope="mysql_scope", key="password")

    conn = psycopg2.connect(
        host=mysql_host,
        database=mysql_db,
        user=mysql_user,
        password=mysql_password
    )
    try:
        # The cursor context manager closes the cursor on exit; the connection
        # itself is closed in `finally` so a failing query cannot leak it.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM pjotr")
            pjotr_data = cursor.fetchall()
            cursor.execute("SELECT * FROM pjotr_prod")
            pjotr_prod_data = cursor.fetchall()
    finally:
        conn.close()

    # Convert fetched data to DataFrame
    # NOTE(review): the column names are placeholders and assume each table has
    # exactly three columns; `SELECT *` does not guarantee that -- confirm the
    # real schema of pjotr / pjotr_prod.
    pjotr_mysql_df = spark.createDataFrame(pjotr_data, schema=["column1", "column2", "column3"])
    pjotr_prod_mysql_df = spark.createDataFrame(pjotr_prod_data, schema=["column1", "column2", "column3"])

    logger.info("Data extraction completed successfully.")
except Exception as e:
    logger.error(f"Error during data extraction: {str(e)}")
    raise

# Step 2: Transformation
# Cleans one column of pes_prep, joins it to c19_ivl_data on field1, derives
# new_field, appends pjotr_, keeps rows with new_field > 100 and aggregates per
# field1 into final_df. Any failure is logged and re-raised.
try:
    # Node 8: Multi-Field Formula
    def clean_field(value):
        """Trim a string cell and map empty / "NULL" / "NA" values to None."""
        # SQL NULL reaches a Python UDF as None; calling .strip() on it would
        # raise and fail the whole Spark task.
        if value is None:
            return None
        # Trim before comparing so padded sentinels (" NA ", "   ") are also
        # normalised to None instead of leaking through as "NA" or "".
        stripped = value.strip()
        if stripped in ("", "NULL", "NA"):
            return None
        return stripped

    clean_udf = F.udf(clean_field, StringType())

    pes_prep_df = pes_prep_df.withColumn("cleaned_field", clean_udf(F.col("field_to_clean")))

    # Node 9: Select
    selected_df = pes_prep_df.select("field1", "field2", "cleaned_field")

    # Node 10: Join
    # Joining on the column *name* keeps a single field1 column in the result.
    # An expression join (left.field1 == right.field1) keeps both copies, which
    # makes the unqualified "field1" in the Node 11 formula ambiguous.
    # NOTE(review): if c19_ivl_data also has a field2 column, "field2" would
    # still be ambiguous below -- confirm its schema.
    joined_df = selected_df.join(c19_ivl_data_df, on="field1", how="inner")

    # Node 11: Formula
    transformed_df = joined_df.withColumn("new_field", F.expr("field1 + field2"))

    # Node 12: Union
    # union() matches columns by position, not by name.
    # NOTE(review): confirm pjotr_ has the same column layout as transformed_df.
    union_df = transformed_df.union(pjotr_df)

    # Node 13: Filter
    filtered_df = union_df.filter(F.col("new_field") > 100)

    # Node 14: Summarize
    summarized_df = filtered_df.groupBy("field1").agg(F.sum("new_field").alias("sum_new_field"))

    # Node 15: Custom Calculations
    final_df = summarized_df.withColumn("custom_field", F.expr("sum_new_field * 2"))

    logger.info("Data transformation completed successfully.")
except Exception as e:
    logger.error(f"Error during data transformation: {str(e)}")
    raise

# Step 3: Output
# Writes the pipeline results as Delta tables under genai_demo.jnj, replacing
# any existing table contents. Any failure is logged and re-raised.
try:
    target_catalog = "genai_demo"
    target_schema = "jnj"

    def _overwrite_delta_table(frame, table_name):
        """Save `frame` as Delta table <catalog>.<schema>.<table_name> in overwrite mode."""
        destination = f"{target_catalog}.{target_schema}.{table_name}"
        frame.write.format("delta").mode("overwrite").saveAsTable(destination)

    # Make sure the destination schema is present before any table is written
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Node 16: Output C03_pjotr.yxdb
    _overwrite_delta_table(final_df, "c03_pjotr")

    # Node 17: Output PJOTR_in_PES.yxdb
    _overwrite_delta_table(pjotr_in_pes_df, "pjotr_in_pes")

    # Node 18: Output C03_unmapped_to_PJOTR.yxdb -- rows whose custom_field is NULL
    unmapped_df = final_df.where(F.col("custom_field").isNull())
    _overwrite_delta_table(unmapped_df, "c03_unmapped_to_pjotr")

    # Node 19: Output C03_pjotr_midway.yxdb -- rows whose custom_field is not NULL
    midway_df = final_df.where(F.col("custom_field").isNotNull())
    _overwrite_delta_table(midway_df, "c03_pjotr_midway")

    logger.info("Data output completed successfully.")
except Exception as e:
    logger.error(f"Error during data output: {str(e)}")
    raise
