# In [None]:
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, trim, col, concat, count
import psycopg2
from pyspark.sql.utils import AnalysisException

# Logging setup: create this module's logger, then configure the root logger
# so INFO-level (and higher) messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Assume Spark session is pre-initialized as 'spark'

# Node 1: Data Source Loading
# Reads six Unity Catalog tables and the PostgreSQL table `pjotr` into
# DataFrames. Any failure is logged and re-raised so the pipeline stops here.
try:
    # Load data from Unity Catalog tables
    pes_prep_df = spark.table("genai_demo.jnj.pes_prep")
    c19_ivl_data_df = spark.table("genai_demo.jnj.c19_ivl_data")
    c04_ekpo_df = spark.table("genai_demo.jnj.c04_ekpo")
    c04_bseg_df = spark.table("genai_demo.jnj.c04_bseg")
    pjotr_df = spark.table("genai_demo.jnj.pjotr_")
    pjotr_in_pes_df = spark.table("genai_demo.jnj.pjotr_in_pes")

    # Connection settings for the external PostgreSQL database are read from
    # the secret scope, so no credentials are hard-coded in this file.
    pg_host = dbutils.secrets.get(scope="my_scope", key="pg_host")
    pg_port = dbutils.secrets.get(scope="my_scope", key="pg_port")
    pg_db = dbutils.secrets.get(scope="my_scope", key="pg_db")
    pg_user = dbutils.secrets.get(scope="my_scope", key="pg_user")
    pg_password = dbutils.secrets.get(scope="my_scope", key="pg_password")

    conn = psycopg2.connect(
        host=pg_host,
        port=pg_port,
        database=pg_db,
        user=pg_user,
        password=pg_password,
    )
    try:
        # The cursor context manager closes the cursor on exit, and the
        # surrounding finally closes the connection even when the query or
        # fetch fails. Previously both leaked on any error after connect().
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM pjotr")
            # fetchall() materialises the full result set in driver memory.
            pg_data = cursor.fetchall()
            pg_columns = [desc[0] for desc in cursor.description]
    finally:
        conn.close()

    # Build the DataFrame only after the connection has been released.
    # The variable keeps its historical name `mysql_df` although the data
    # comes from PostgreSQL, because code outside this block may reference it.
    # NOTE(review): the schema is column names only, so Spark infers types
    # from the rows; an empty result set would fail here - confirm the source
    # table is never empty.
    mysql_df = spark.createDataFrame(pg_data, schema=pg_columns)

    logger.info("Data source loading completed successfully.")
except Exception as e:
    logger.error(f"Error loading data sources: {e}")
    raise

# Node 2: Multi-Field Formula Transformation
# For each listed field, adds a cleaned copy of the source column: the value is
# trimmed, and a value that is exactly "#" after trimming becomes NULL.
try:
    # Node 3 selects all five of these derived columns, so every one of them
    # has to be produced here; previously only _Business_unit_code was created
    # and the select could not resolve the other four.
    # NOTE(review): the source column names for the four added fields are
    # inferred from the "Business unit code" -> "_Business_unit_code" pattern;
    # confirm them against genai_demo.jnj.pes_prep.
    multi_field_sources = {
        "_Business_unit_code": "Business unit code",
        "_FMRC_code": "FMRC code",
        "_FSID_code": "FSID code",
        "_LE_code": "LE code",
        "_MRC_code": "MRC code",
    }

    transformed_df = pes_prep_df
    for target_name, source_name in multi_field_sources.items():
        trimmed_value = trim(col(source_name))
        transformed_df = transformed_df.withColumn(
            target_name,
            when(trimmed_value == "#", None).otherwise(trimmed_value),
        )
    logger.info("Multi-field formula transformation completed successfully.")
except Exception as e:
    logger.error(f"Error in multi-field formula transformation: {e}")
    raise

# Node 3: Select Transformation
# Keeps only the cleaned code columns; every other column of transformed_df
# is dropped from this point on.
try:
    selected_columns = [
        "_Business_unit_code",
        "_FMRC_code",
        "_FSID_code",
        "_LE_code",
        "_MRC_code",
    ]
    selected_df = transformed_df.select(*selected_columns)
except Exception as err:
    logger.error(f"Error in select transformation: {err}")
    raise
else:
    logger.info("Select transformation completed successfully.")

# Node 4: Join Transformation
# Inner-joins the selected columns to c04_ekpo where the cleaned LE code
# equals c04_ekpo's "LE code"; rows without a match on either side are dropped.
try:
    le_code_matches = selected_df["_LE_code"] == c04_ekpo_df["LE code"]
    joined_df = selected_df.join(c04_ekpo_df, on=le_code_matches, how="inner")
except Exception as err:
    logger.error(f"Error in join transformation: {err}")
    raise
else:
    logger.info("Join transformation completed successfully.")

# Node 5: Formula Transformation
# Writes column "PJOTR_ID": the value of "PJOTR ID", except that 1144 is
# replaced by 2806. The source column "PJOTR ID" itself is left untouched.
try:
    source_id = col("PJOTR ID")
    remapped_id = when(source_id == 1144, 2806).otherwise(source_id)
    formula_df = joined_df.withColumn("PJOTR_ID", remapped_id)
except Exception as err:
    logger.error(f"Error in formula transformation: {err}")
    raise
else:
    logger.info("Formula transformation completed successfully.")

# Node 6: Union Transformation
# Appends the rows of pjotr_in_pes_df to formula_df. DataFrame.union pairs
# columns by position, not by name, and keeps duplicate rows.
# NOTE(review): confirm that both inputs have the same column order.
try:
    union_df = formula_df.union(pjotr_in_pes_df)
except Exception as err:
    logger.error(f"Error in union transformation: {err}")
    raise
else:
    logger.info("Union transformation completed successfully.")

# Node 7: Filter Transformation
# Keeps only the rows whose "map" column equals the string "yes"
# (the comparison is exact, so other casings and NULLs are excluded).
try:
    is_mapped = col("map") == "yes"
    filtered_df = union_df.where(is_mapped)
except Exception as err:
    logger.error(f"Error in filter transformation: {err}")
    raise
else:
    logger.info("Filter transformation completed successfully.")

# Node 8: Summarize Transformation
# Counts rows per group and stores the count in "record_count".
try:
    # The cleaned codes are grouping keys alongside "LE code" because Node 9
    # concatenates them: a groupBy on "LE code" alone drops every other
    # column, which made Node 9 fail with an unresolved-column error.
    # NOTE(review): record_count is now per ("LE code", _LE_code, _MRC_code)
    # combination rather than per "LE code" - confirm this grain against the
    # source workflow.
    summarized_df = filtered_df.groupBy("LE code", "_LE_code", "_MRC_code").agg(
        count("*").alias("record_count")
    )
    logger.info("Summarize transformation completed successfully.")
except Exception as e:
    logger.error(f"Error in summarize transformation: {e}")
    raise

# Node 9: Custom Calculations
# Adds "temp_LE_MRC", the cleaned LE code followed directly by the cleaned
# MRC code.
try:
    # Column names use the underscore spelling produced upstream
    # ("_LE_code" / "_MRC_code"), not "_LE code" / "_MRC code".
    # concat() yields NULL when either input is NULL, so rows whose code was
    # "#" (nulled in Node 2) get a NULL key.
    custom_df = summarized_df.withColumn(
        "temp_LE_MRC", concat(col("_LE_code"), col("_MRC_code"))
    )
    logger.info("Custom calculations completed successfully.")
except Exception as e:
    logger.error(f"Error in custom calculations: {e}")
    raise

# Node 10: Output Data Sources
# Persists custom_df as the Delta table genai_demo.jnj.c03_pjotr, creating the
# schema first if it is missing. Failures are logged and re-raised.
try:
    target_catalog = "genai_demo"
    target_schema = "jnj"
    target_table = "c03_pjotr"
    qualified_schema = f"{target_catalog}.{target_schema}"
    qualified_table = f"{qualified_schema}.{target_table}"

    # The schema has to exist before saveAsTable can create a table in it.
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {qualified_schema}")
    logger.info(f"Schema {qualified_schema} ensured")

    # Overwrite mode replaces the table's existing rows on every run.
    # NOTE(review): if the table already exists with a different schema, a
    # plain overwrite may be rejected unless overwriteSchema is enabled; also
    # confirm the target accepts column names containing spaces ("LE code").
    delta_writer = custom_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(qualified_table)
    logger.info("Output data sources written successfully.")
except AnalysisException as analysis_err:
    logger.error(f"Error in output data sources: {analysis_err}")
    raise
except Exception as err:
    logger.error(f"Unexpected error in output data sources: {err}")
    raise
