In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import psycopg2

# Setup logging: request INFO-level output from the root logger and create the
# module-level logger that the loaders and pipeline steps below use for
# progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to load data from MySQL using psycopg2
def load_mysql_data(query, secret_scope, secret_key):
    """Run ``query`` against the configured database and return a Spark DataFrame.

    Connection settings are read from the Databricks secret scope
    ``secret_scope`` under the keys ``{secret_key}_host``, ``{secret_key}_port``,
    ``{secret_key}_database``, ``{secret_key}_user`` and
    ``{secret_key}_password``.

    NOTE(review): psycopg2 is a PostgreSQL client library. Confirm that the
    target server really speaks the PostgreSQL protocol; if it is a MySQL
    server, this function needs a MySQL driver or the Spark JDBC reader.

    Args:
        query: SQL text to execute; it is sent to the server unchanged.
        secret_scope: Name of the secret scope holding the credentials.
        secret_key: Prefix of the secret keys listed above.

    Returns:
        A Spark DataFrame with one column per result column. When the query
        returns no rows, the DataFrame is empty and every column is typed as
        string, because types cannot be inferred from zero rows.

    Raises:
        Exception: Any error raised while fetching secrets, connecting,
            querying or building the DataFrame is logged and re-raised.
    """
    try:
        # Retrieve credentials securely
        host = dbutils.secrets.get(secret_scope, f"{secret_key}_host")
        port = dbutils.secrets.get(secret_scope, f"{secret_key}_port")
        database = dbutils.secrets.get(secret_scope, f"{secret_key}_database")
        user = dbutils.secrets.get(secret_scope, f"{secret_key}_user")
        password = dbutils.secrets.get(secret_scope, f"{secret_key}_password")

        conn = psycopg2.connect(
            host=host,
            port=port,
            database=database,
            user=user,
            password=password
        )
        # try/finally guarantees the cursor and connection are released even
        # when execute()/fetchall() raises. (Using the psycopg2 connection as
        # a context manager would only end the transaction, not close it.)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query)
                data = cursor.fetchall()
                columns = [desc[0] for desc in cursor.description]
            finally:
                cursor.close()
        finally:
            conn.close()

        # Convert to DataFrame
        if data:
            df = spark.createDataFrame(data, schema=columns)
        else:
            # Spark cannot infer column types from an empty row list, so fall
            # back to an explicit all-string schema that keeps the column names.
            empty_schema = ", ".join(
                "`" + name.replace("`", "``") + "` string" for name in columns
            )
            df = spark.createDataFrame([], schema=empty_schema)

        # The rows are already on the driver; len() avoids launching a Spark
        # job just to produce the log line.
        logger.info(f"Loaded data from MySQL with {len(data)} records")
        return df
    except Exception as e:
        logger.error(f"Error loading data from MySQL: {str(e)}")
        raise

# Load data from Unity Catalog tables with error handling
def load_data_from_catalog(table_path):
    """Return the table at ``table_path`` as a Spark DataFrame.

    The row count is logged on success, which runs ``count()`` on the table.

    Args:
        table_path: Table identifier passed unchanged to ``spark.table``.

    Returns:
        The DataFrame produced by ``spark.table(table_path)``.

    Raises:
        Exception: Any error from resolving or counting the table is logged
            and re-raised unchanged.
    """
    try:
        df = spark.table(table_path)
        logger.info(f"Loaded data from {table_path} with {df.count()} records")
    except Exception as e:
        logger.error(f"Error loading data from {table_path}: {str(e)}")
        raise
    else:
        return df

# Steps 1-10: Load every input data set.
#
# Each load goes through _load_input so that the target variable is always
# bound afterwards. Inputs that later steps in this script consume (bseg_df,
# pjotr_in_pes_df, pjotr_df) are loaded with required=True: a failure is
# logged and re-raised immediately instead of surfacing later as an unrelated
# NameError. The remaining inputs are not referenced by the steps visible in
# this script, so a failure is logged and the variable is set to None.
def _load_input(label, loader, *args, required=False):
    """Call ``loader(*args)`` and handle a failure at step level.

    Args:
        label: Text inserted into the "Failed to load ..." log message.
        loader: Callable that performs the load.
        *args: Positional arguments forwarded to ``loader``.
        required: When True, re-raise after logging; otherwise return None.

    Returns:
        Whatever ``loader`` returns, or None when an optional load fails.

    Raises:
        Exception: The loader's error, only when ``required`` is True.
    """
    try:
        return loader(*args)
    except Exception as e:
        logger.error(f"Failed to load {label}: {str(e)}")
        if required:
            raise
        return None


# Step 1: Load Data from MySQL (Node 99)
mysql_query_99 = "SELECT pjotr_prod.* FROM pjotr_prod"
mysql_df_99 = _load_input(
    "MySQL data for Node 99", load_mysql_data, mysql_query_99, "mysql_scope", "pjotr_prod"
)

# Step 2: Load Static Data from TextInput (Node 330)
text_input_df = _load_input(
    "data from TextInput", load_data_from_catalog, "catalog.db.TextInput"
)

# Step 3: Load Data from C04_BSEG.yxdb (Node 499) -- joined in Step 13
bseg_df = _load_input(
    "data from C04_BSEG", load_data_from_catalog, "catalog.db.C04_BSEG", required=True
)

# Step 4: Load Data from C04_EKPO.yxdb (Node 500)
ekpo_df = _load_input(
    "data from C04_EKPO", load_data_from_catalog, "catalog.db.C04_EKPO"
)

# Step 5: Load Data from PJOTR_in_PES.yxdb (Node 384) -- unioned in Step 15
pjotr_in_pes_df = _load_input(
    "data from PJOTR_in_PES", load_data_from_catalog, "catalog.db.PJOTR_in_PES", required=True
)

# Step 6: Load Data from C19_ivl_data.yxdb (Node 291)
ivl_data_df = _load_input(
    "data from C19_ivl_data", load_data_from_catalog, "catalog.db.C19_ivl_data"
)

# Step 7: Load Data from MySQL (Node 403)
mysql_query_403 = "SELECT pjotr.* FROM pjotr"
mysql_df_403 = _load_input(
    "MySQL data for Node 403", load_mysql_data, mysql_query_403, "mysql_scope", "pjotr"
)

# Step 8: Load Data from MySQL (Node 98)
# NOTE(review): same query and secrets as Step 7 -- confirm both loads are needed.
mysql_query_98 = "SELECT pjotr.* FROM pjotr"
mysql_df_98 = _load_input(
    "MySQL data for Node 98", load_mysql_data, mysql_query_98, "mysql_scope", "pjotr"
)

# Step 9: Load Data from PJOTR_.yxdb (Node 401) -- transformed in Step 11
pjotr_df = _load_input(
    "data from PJOTR_", load_data_from_catalog, "catalog.db.PJOTR_", required=True
)

# Step 10: Load Data from PES_prep.yxdb (Node 249)
pes_prep_df = _load_input(
    "data from PES_prep", load_data_from_catalog, "catalog.db.PES_prep"
)

# Step 11: Apply Multi-Field Formula Transformation
def apply_multi_field_formula(df):
    """Clean placeholder values and leading "00" prefixes in string columns.

    For every string-typed column:
      * a value containing "#", "UNMAPPED" or "NULL" becomes null;
      * otherwise a value starting with "00" loses its first two characters;
      * any other value is kept as it is.

    Columns of any other type are passed through untouched, so their data
    type is preserved rather than being coerced to string by the string
    predicates.

    Args:
        df: Input Spark DataFrame.

    Returns:
        A DataFrame with the same column names and order as ``df``.
    """
    string_columns = {
        field.name for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    }

    projected = []
    for name in df.columns:
        # Backtick-quote the name so dots or other special characters in a
        # column name are not parsed as a nested-field path.
        value = F.col("`" + name.replace("`", "``") + "`")
        if name not in string_columns:
            projected.append(value)
            continue

        is_placeholder = (
            value.contains("#") | value.contains("UNMAPPED") | value.contains("NULL")
        )
        # substr(3, length) keeps everything from the third character on;
        # using Column expressions avoids building SQL text from column names.
        without_prefix = value.substr(F.lit(3), F.length(value))
        projected.append(
            F.when(is_placeholder, F.lit(None))
            .when(value.startswith("00"), without_prefix)
            .otherwise(value)
            .alias(name)
        )

    # One select instead of a withColumn per column keeps the query plan flat.
    result = df.select(*projected)
    logger.info("Applied multi-field formula transformation")
    return result

transformed_df = apply_multi_field_formula(pjotr_df)

# Step 12: Select Specific Fields for Processing
selected_field_names = [
    "Business_unit_code",
    "FMRC_code",
    "FSID_code",
    "LE_code",
    "MRC_code",
    "Plant_code",
    "PO_business_unit_code",
    "PO_LE_code",
    "PO_MRC_code",
    "PO_site_code",
    "Site_code",
    "Vision_sourced_data",
]
selected_fields_df = transformed_df.select(*selected_field_names)
logger.info(f"Selected specific fields for processing")

# Step 13: Join Data Sources
# Inner join on LE_code; the right-hand LE_code is dropped afterwards so the
# result carries a single LE_code column in its original left-side position.
le_code_matches = selected_fields_df.LE_code == bseg_df.LE_code
joined_with_bseg = selected_fields_df.join(bseg_df, le_code_matches, "inner")
joined_df = joined_with_bseg.drop(bseg_df.LE_code)
logger.info(f"Joined data sources")

# Step 14: Apply Custom Formula Calculations
def apply_custom_calculations(df):
    """Rewrite ``PJOTR_ID`` 1144 to 2806 and keep every other value.

    Args:
        df: Spark DataFrame that must contain a ``PJOTR_ID`` column.

    Returns:
        A DataFrame identical to ``df`` except for the remapped ``PJOTR_ID``.
    """
    pjotr_id = F.col("PJOTR_ID")
    remapped_id = F.when(pjotr_id == 1144, 2806).otherwise(pjotr_id)
    result = df.withColumn("PJOTR_ID", remapped_id)
    logger.info(f"Applied custom formula calculations")
    return result

custom_calculated_df = apply_custom_calculations(joined_df)

# Step 15: Union Data Streams
# NOTE(review): union() matches columns by position, not by name -- confirm
# both inputs share the same column order, or switch to unionByName().
union_df = custom_calculated_df.union(pjotr_in_pes_df)
logger.info("Unioned data streams")

# Step 16: Filter Data
filtered_df = union_df.filter(F.col("PJOTR_ID").isNotNull())
logger.info("Filtered data")

# Step 17: Summarize Data
summarized_df = filtered_df.groupBy("PJOTR").agg(F.sum("Spend").alias("Total_Spend"), F.count("Records").alias("Record_Count"))
logger.info("Summarized data")

# Step 18: Apply Custom Calculations
# The aggregation above yields only PJOTR, Total_Spend and Record_Count, so
# remapping PJOTR_ID here would fail with an unresolved-column error. Apply it
# only when the column exists, and say so in the log when it is skipped.
if "PJOTR_ID" in summarized_df.columns:
    final_df = apply_custom_calculations(summarized_df)
else:
    logger.warning("Skipped custom formula calculations: PJOTR_ID is not present after summarizing")
    final_df = summarized_df

# Steps 19-23: Write Outputs
# Every write is attempted even if an earlier one fails. Failures are
# collected and raised at the end so the run does not look successful when
# outputs are missing.
# NOTE(review): C03_pjotr and C03_PJOTR differ only by case; if the metastore
# treats table names case-insensitively, both writes hit the same table.
# NOTE(review): all five outputs receive the same final_df -- confirm that is
# intended, e.g. for C03_unmapped_to_PJOTR.
output_tables = [
    "C03_pjotr",              # Step 19 (Node 488)
    "C03_pjotr_midway",       # Step 20 (Node 331)
    "C03_unmapped_to_PJOTR",  # Step 21 (Node 350)
    "C03_PJOTR",              # Step 22 (Node 317)
    "PJOTR_in_PES",           # Step 23 (Node 315)
]

try:
    spark.sql("CREATE SCHEMA IF NOT EXISTS catalog.output")
except Exception as e:
    # Not fatal by itself: the schema may already exist; the writes below
    # decide whether the run failed.
    logger.error(f"Failed to create schema catalog.output: {str(e)}")

failed_outputs = []
for table_name in output_tables:
    try:
        final_df.write.format("delta").mode("overwrite").saveAsTable(f"catalog.output.{table_name}")
        logger.info(f"Written output to {table_name}.yxdb")
    except Exception as e:
        logger.error(f"Failed to write output to {table_name}.yxdb: {str(e)}")
        failed_outputs.append(table_name)

if failed_outputs:
    raise RuntimeError(f"Failed to write output tables: {', '.join(failed_outputs)}")
