In [None]:
import logging

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, col, concat, sum, when

# Logging setup: grab a logger named after this module, then configure the
# root logger to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Assume Spark session is already initialized as 'spark'

# Function to load data from Unity Catalog tables with enhanced error handling
def load_table(table_name):
    """Fetch ``table_name`` through the module-level ``spark`` session.

    Args:
        table_name: Name handed unchanged to ``spark.table``.

    Returns:
        The object returned by ``spark.table(table_name)``.

    Raises:
        Exception: Any error raised during the lookup is logged at ERROR
            level and then re-raised unchanged.
    """
    try:
        loaded = spark.table(table_name)
        logger.info(f"Successfully loaded table: {table_name}")
    except Exception as exc:
        logger.error(f"Error loading table {table_name}: {exc}")
        raise
    return loaded

# Ensure the catalog exists and that the schema and table are present in Unity Catalog
def _quote_identifier(name):
    """Return ``name`` wrapped in backticks, doubling any embedded backtick."""
    return "`" + str(name).replace("`", "``") + "`"


def ensure_table_exists(catalog_name, schema_name, table_name):
    """Verify the catalog, then create the schema and Delta table if missing.

    The catalog must already exist. A schema that is not listed is created,
    and the table is registered with ``CREATE TABLE IF NOT EXISTS ... USING
    DELTA`` pointing at ``dbfs:/mnt/<table_name>``. Catalog, schema and table
    names are backtick-quoted wherever they are used as SQL identifiers, so
    names containing characters such as ``-`` or spaces do not break the
    generated statements.

    Args:
        catalog_name: Catalog that must already exist.
        schema_name: Schema inside the catalog; created when absent.
        table_name: Table name; also used as the last path segment of the
            table location.

    Raises:
        ValueError: If ``SHOW CATALOGS`` does not list ``catalog_name``.
        Exception: Any other error raised while running the statements.
            Every failure is logged at ERROR level before being re-raised.
    """
    try:
        catalog_sql = _quote_identifier(catalog_name)
        schema_sql = f"{catalog_sql}.{_quote_identifier(schema_name)}"
        table_sql = f"{schema_sql}.{_quote_identifier(table_name)}"

        # Verify catalog existence; the catalog is never created here.
        catalog_exists = spark.sql(f"SHOW CATALOGS LIKE '{catalog_name}'").count() > 0
        if not catalog_exists:
            raise ValueError(f"Catalog '{catalog_name}' does not exist.")

        schema_exists = spark.sql(f"SHOW SCHEMAS IN {catalog_sql} LIKE '{schema_name}'").count() > 0
        if not schema_exists:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_sql}")
            logger.info(f"Schema {schema_name} created in catalog {catalog_name}")

        # NOTE(review): the location is a DBFS mount path; confirm this is an
        # allowed location for tables in the target catalog.
        # NOTE(review): no column list is given, so this presumably relies on
        # Delta data already existing at the path -- confirm.
        table_location = f"dbfs:/mnt/{table_name}"
        spark.sql(f"CREATE TABLE IF NOT EXISTS {table_sql} USING DELTA LOCATION '{table_location}'")
        logger.info(f"Table {table_name} ensured in Unity Catalog")
    except Exception as e:
        logger.error(f"Error ensuring table {table_name}: {e}")
        raise

# Target catalog and schema (placeholder values)
catalog_name = "actual_catalog_name"  # Replace with actual catalog name
schema_name = "actual_schema_name"    # Replace with actual schema name

# Source tables that must be registered before they are read
tables = ["PES_prep", "C19_ivl_data", "C04_EKPO", "C04_BSEG", "PJOTR_", "PJOTR_in_PES"]

for table in tables:
    ensure_table_exists(catalog_name, schema_name, table)

# Read every source table by its fully qualified <catalog>.<schema>.<table> name
_qualified_prefix = f"{catalog_name}.{schema_name}"
pes_prep_df = load_table(_qualified_prefix + ".PES_prep")
c19_ivl_data_df = load_table(_qualified_prefix + ".C19_ivl_data")
c04_ekpo_df = load_table(_qualified_prefix + ".C04_EKPO")
c04_bseg_df = load_table(_qualified_prefix + ".C04_BSEG")
pjotr_df = load_table(_qualified_prefix + ".PJOTR_")
pjotr_in_pes_df = load_table(_qualified_prefix + ".PJOTR_in_PES")

# Multi-Field Formula Transformation
# Each source column gets a cleaned copy named with a leading underscore.
try:
    _staged = pes_prep_df

    # Rule 1: a value containing the marker text is replaced by NULL.
    for _source, _marker in (
        ("Business unit code", "#"),
        ("FMRC code", "UNMAPPED"),
        ("FSID code", "NULL"),
    ):
        _value = col(_source)
        _staged = _staged.withColumn(
            f"_{_source}",
            when(_value.contains(_marker), None).otherwise(_value),
        )

    # Rule 2: a value starting with "00" loses those two characters
    # (substr is 1-based: start at position 3, keep at most 100 characters).
    for _source in (
        "LE code",
        "MRC code",
        "Plant code",
        "PO business unit code",
        "PO LE code",
        "PO MRC code",
        "PO site code",
        "Site code",
        "Vision sourced data",
    ):
        _value = col(_source)
        _staged = _staged.withColumn(
            f"_{_source}",
            when(_value.startswith("00"), _value.substr(3, 100)).otherwise(_value),
        )

    # Publish the result only after every column was added successfully.
    transformed_df = _staged
    logger.info("Multi-Field Formula Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Multi-Field Formula Transformation: {exc}")
    raise

# Select Transformation
# Keep only the underscore-prefixed (cleaned) columns, in this order.
try:
    _kept_columns = [
        "_Business unit code",
        "_FMRC code",
        "_FSID code",
        "_LE code",
        "_MRC code",
        "_Plant code",
        "_PO business unit code",
        "_PO LE code",
        "_PO MRC code",
        "_PO site code",
        "_Site code",
        "_Vision sourced data",
    ]
    selected_df = transformed_df.select(*_kept_columns)
    logger.info("Select Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Select Transformation: {exc}")
    raise

# Join Transformation
# Two inner joins with broadcast hints; after each join the key column
# coming from the right-hand table is dropped.
try:
    _bseg_side = broadcast(c04_bseg_df)
    _bseg_match = selected_df["_LE code"] == c04_bseg_df["LE code"]
    _with_bseg = selected_df.join(_bseg_side, _bseg_match, "inner")
    _with_bseg = _with_bseg.drop(c04_bseg_df["LE code"])

    _ekpo_side = broadcast(c04_ekpo_df)
    _ekpo_match = selected_df["_PO LE code"] == c04_ekpo_df["PO line SLoc"]
    _with_ekpo = _with_bseg.join(_ekpo_side, _ekpo_match, "inner")

    joined_df = _with_ekpo.drop(c04_ekpo_df["PO line SLoc"])
    logger.info("Join Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Join Transformation: {exc}")
    raise

# Formula Transformation
# Adds "Calculated Field" = business unit code followed by FMRC code.
try:
    # `+` between Spark columns is arithmetic addition, so applying it to
    # these string codes does not concatenate them. concat() joins the two
    # strings and, like `+`, yields NULL when either input is NULL.
    # NOTE(review): this makes "Calculated Field" a string column; confirm
    # against any existing target table schema before the next overwrite.
    formula_df = joined_df.withColumn(
        "Calculated Field",
        concat(col("_Business unit code"), col("_FMRC code")),
    )
    logger.info("Formula Transformation applied successfully.")
except Exception as e:
    logger.error(f"Error in Formula Transformation: {e}")
    raise

# Union Transformation
# Append the PJOTR_in_PES rows to the calculated rows.
try:
    # DataFrame.union matches columns by position, not by name.
    # NOTE(review): confirm both frames share the same column order.
    _stacked = formula_df.union(pjotr_in_pes_df)
    union_df = _stacked
    logger.info("Union Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Union Transformation: {exc}")
    raise

# Filter Transformation
# Keep only rows whose "Calculated Field" is populated.
try:
    _has_calculated_value = col("Calculated Field").isNotNull()
    filtered_df = union_df.where(_has_calculated_value)
    logger.info("Filter Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Filter Transformation: {exc}")
    raise

# Summarize Transformation
# One row per PJOTR value with the summed Spend and Records.
try:
    # `sum` here is the pyspark.sql.functions aggregate imported at the top
    # of the file, not Python's builtin.
    _total_spend = sum("Spend").alias("Total Spend")
    _total_records = sum("Records").alias("Total Records")
    _by_pjotr = filtered_df.groupBy("PJOTR")
    summarized_df = _by_pjotr.agg(_total_spend, _total_records)
    logger.info("Summarize Transformation applied successfully.")
except Exception as exc:
    logger.error(f"Error in Summarize Transformation: {exc}")
    raise

# Output Data
# Persist the four result frames as Delta tables, replacing existing contents.
try:
    # Ensure schema exists before creating tables
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}")
    logger.info(f"Schema {catalog_name}.{schema_name} ensured")

    # (frame, target table) pairs, written in this order.
    # NOTE(review): PJOTR_in_PES is overwritten with the frame that was read
    # from that same table -- confirm this rewrite is intended.
    _outputs = (
        (summarized_df, "C03_pjotr"),
        (pjotr_in_pes_df, "PJOTR_in_PES"),
        (union_df, "C03_unmapped_to_PJOTR"),
        (formula_df, "C03_pjotr_midway"),
    )
    for _frame, _target in _outputs:
        _writer = _frame.write.format("delta").mode("overwrite")
        _writer.saveAsTable(f"{catalog_name}.{schema_name}.{_target}")
        logger.info(f"Data saved to {_target}")
except Exception as exc:
    logger.error(f"Error saving data to Unity Catalog tables: {exc}")
    raise
