# In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Set up INFO-level root logging and a logger named after this module
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

# Assume Spark session is already initialized as 'spark'
# Load Parquet data from DBFS paths with enhanced error handling
def load_data(path):
    """Load a Parquet dataset from ``path`` after checking the path can be listed.

    Relies on the notebook-provided globals ``dbutils`` and ``spark`` and on
    the module-level ``logger``.

    Args:
        path: Location of the Parquet data to read.

    Returns:
        The DataFrame returned by ``spark.read.format("parquet").load(path)``.

    Raises:
        Exception: If ``dbutils.fs.ls(path)`` fails. The underlying error is
            kept as ``__cause__`` so the full traceback stays available.
        Exception: Any error raised by the Parquet read is logged and
            re-raised unchanged.
    """
    # Probe the path first so access problems get their own, single log entry.
    try:
        dbutils.fs.ls(path)
    except Exception as e:
        message = f"Insufficient permissions or path does not exist: {path}. Error: {e}"
        logger.error(message)
        raise Exception(message) from e

    try:
        df = spark.read.format("parquet").load(path)
    except Exception as e:
        logger.error(f"Error loading data from {path}: {e}")
        raise

    logger.info(f"Data loaded successfully from {path}.")
    return df

# Source locations, keyed by a short dataset name
# NOTE(review): the "pjotr" entry ends in "PJOTR_", which looks like a file-name
# prefix rather than a complete path -- confirm it is intentional.
paths = dict(
    pes_prep="dbfs:/mnt/data/PES/B/workfolder/PES_prep",
    c19_ivl_data="dbfs:/mnt/data/PES/C/workfolder/C19_ivl_data",
    c04_ekpo="dbfs:/mnt/data/PES/C/workfolder/C04_EKPO",
    c04_bseg="dbfs:/mnt/data/PES/C/workfolder/C04_BSEG",
    pjotr="dbfs:/mnt/data/PES/M/C03/PJOTR/PJOTR_",
    pjotr_in_pes="dbfs:/mnt/data/PES/M/C03/PJOTR/PJOTR_in_PES",
)

# Load data with permission checks
def load_data_with_permission_check(path):
    """Load a Parquet dataset from ``path`` after verifying it is listable and non-empty.

    Relies on the notebook-provided globals ``dbutils`` and ``spark`` and on
    the module-level ``logger``.

    Args:
        path: Location of the Parquet data to read.

    Returns:
        The DataFrame returned by ``spark.read.format("parquet").load(path)``.

    Raises:
        Exception: If ``dbutils.fs.ls(path)`` fails (the underlying error is
            kept as ``__cause__``), or if the listing succeeds but is empty.
        Exception: Any error raised by the Parquet read is logged and
            re-raised unchanged.
    """
    # dbutils.fs.ls raises when the path is missing or not accessible, so the
    # access failure has to be handled here rather than via a falsy return.
    try:
        entries = dbutils.fs.ls(path)
    except Exception as e:
        message = f"Insufficient permissions or path does not exist: {path}. Error: {e}"
        logger.error(message)
        raise Exception(message) from e

    # An empty listing means the path is reachable but holds nothing to read;
    # report that accurately instead of blaming permissions.
    if not entries:
        message = f"Path exists but contains no files: {path}"
        logger.error(message)
        raise Exception(message)

    try:
        df = spark.read.format("parquet").load(path)
    except Exception as e:
        logger.error(f"Error loading data from {path}: {e}")
        raise

    logger.info(f"Data loaded successfully from {path}.")
    return df

# Read each source dataset in turn and bind it to its own DataFrame variable
(
    pes_prep_df,
    c19_ivl_data_df,
    c04_ekpo_df,
    c04_bseg_df,
    pjotr_df,
    pjotr_in_pes_df,
) = [
    load_data_with_permission_check(paths[source_key])
    for source_key in (
        "pes_prep",
        "c19_ivl_data",
        "c04_ekpo",
        "c04_bseg",
        "pjotr",
        "pjotr_in_pes",
    )
]

# Transformation logic
def clean_transform(df):
    """Add a cleaned ``_<field>`` companion column for each code field.

    For every column named in the fixed list below, a new column called
    ``_<field>`` is added. Its value is null when the source value matches the
    regular expression ``#|UNMAPPED|NULL`` anywhere in the string; otherwise it
    is the source value from the third character onward.

    Args:
        df: DataFrame containing all of the listed columns.

    Returns:
        A new DataFrame with the additional ``_<field>`` columns; the source
        columns are left in place.

    Raises:
        Exception: Any error raised while building the columns is logged and
            re-raised unchanged.
    """
    fields = [
        "Business unit code",
        "FMRC code",
        "FSID code",
        "LE code",
        "MRC code",
        "Plant code",
        "PO business unit code",
        "PO LE code",
        "PO MRC code",
        "PO site code",
        "Site code",
        "Vision sourced data",
    ]
    try:
        for field in fields:
            source = df[field]
            # Python's len() cannot be applied to a Column, and Column.substr
            # requires both arguments to be the same kind (two ints or two
            # Columns), so pass the start as a literal Column and the length
            # as F.length(...). Requesting more characters than remain just
            # yields the rest of the string.
            remainder = source.substr(F.lit(3), F.length(source))
            df = df.withColumn(
                f"_{field}",
                F.when(source.rlike("#|UNMAPPED|NULL"), None).otherwise(remainder),
            )
        logger.info("Multi-field formula transformation applied.")
        return df
    except Exception as e:
        logger.error(f"Error in clean_transform: {e}")
        raise

# Run the field clean-up, then keep only the listed columns
transformed_df = clean_transform(pes_prep_df)
selected_df = transformed_df.select(
    [
        "PES ID",
        "Addressable",
        "Ariba source system",
        "Base unit of measure",
        "Business unit",
        "Invoice amount (USD)",
    ]
)
logger.info("Select transformation applied.")

# Join Transformation: inner-join the selected columns with the three lookup datasets
try:
    joined_df = selected_df.join(c19_ivl_data_df, on="PES ID", how="inner")
    joined_df = joined_df.join(c04_ekpo_df, on="PO", how="inner")
    joined_df = joined_df.join(c04_bseg_df, on="MM Doc.", how="inner")
    # Placeholder drop; dropping a column name that is not present leaves the frame unchanged
    joined_df = joined_df.drop("duplicate_column_name")
except Exception as e:
    logger.error(f"Error in join transformation: {e}")
    raise
else:
    logger.info("Join transformation applied.")

# Formula Transformation: copy the invoice amount into "Spend" and add a constant
# "Records" column of 1 so rows can be counted by summing
try:
    formula_df = (
        joined_df
        .withColumn("Spend", joined_df["Invoice amount (USD)"])
        .withColumn("Records", F.lit(1))
    )
except Exception as e:
    logger.error(f"Error in formula transformation: {e}")
    raise
else:
    logger.info("Formula transformation applied.")

# Union Transformation: append the PJOTR_in_PES rows to the computed rows
# NOTE(review): DataFrame.union matches columns by position, not by name --
# confirm both frames share the same column order (schemas are not visible here).
try:
    union_df = formula_df.union(pjotr_in_pes_df)
except Exception as e:
    logger.error(f"Error in union transformation: {e}")
    raise
else:
    logger.info("Union transformation applied.")

# Filter Transformation: keep rows whose "Business unit" is not the literal "UNMAPPED"
try:
    is_mapped = union_df["Business unit"] != "UNMAPPED"
    filtered_df = union_df.where(is_mapped)
except Exception as e:
    logger.error(f"Error in filter transformation: {e}")
    raise
else:
    logger.info("Filter transformation applied.")

# Summarize Transformation: total spend and record count per "PJOTR" value
try:
    summarized_df = filtered_df.groupBy("PJOTR").agg(
        F.sum("Spend").alias("Total Spend"),
        F.sum("Records").alias("Total Records"),
    )
except Exception as e:
    logger.error(f"Error in summarize transformation: {e}")
    raise
else:
    logger.info("Summarize transformation applied.")

# Output Handling: persist the summary, the PJOTR_in_PES input, the unmapped rows
# and the pre-filter union as Delta tables in the target catalog/schema.
# NOTE(review): several written columns contain spaces or parentheses (e.g.
# "Total Spend", "Invoice amount (USD)"); Delta may reject such names unless
# column mapping is enabled on the target tables -- confirm.
try:
    target_catalog = "catalog_name"
    target_schema = "schema_name"
    
    # Ensure schema exists before creating table
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")
    
    # Write to Unity Catalog target table (overwrite mode handles table replacement)
    summarized_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.C03_pjotr")
    pjotr_in_pes_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.PJOTR_in_PES")
    
    # Take the unmapped rows from union_df: filtered_df has already had every
    # "UNMAPPED" row removed, so filtering it again would always yield an
    # empty table.
    unmapped_df = union_df.filter(union_df["Business unit"] == "UNMAPPED")
    unmapped_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.C03_unmapped_to_PJOTR")
    
    union_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.C03_pjotr_midway")
    logger.info("Data written to Unity Catalog target tables successfully.")
except Exception as e:
    logger.error(f"Error in output handling: {e}")
    raise

# Reminder for operators about the access rights the loads above depend on
logger.info("Ensure that the user has the necessary permissions to access the specified DBFS paths.")
