In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Logging: create this module's logger, then configure the root logger at
# INFO level so informational messages from the steps below are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Function to load data from MySQL using JDBC
def load_mysql_data_jdbc(query, secret_scope, secret_key):
    """Run ``query`` against MySQL over JDBC and return the result as a DataFrame.

    Connection details are read from the secret scope using the keys
    ``{secret_key}_user``, ``{secret_key}_password``, ``{secret_key}_host``
    and ``{secret_key}_database``.

    Args:
        query: SQL text handed to the JDBC reader's ``query`` option.
        secret_scope: Name of the secret scope holding the connection details.
        secret_key: Prefix shared by the four secret keys listed above.

    Returns:
        The DataFrame produced by the JDBC reader.

    Raises:
        Exception: Any error from the secret lookup, the JDBC read or the
            record count is logged and then re-raised unchanged.
    """
    try:
        # Retrieve credentials securely
        user = dbutils.secrets.get(secret_scope, f"{secret_key}_user")
        password = dbutils.secrets.get(secret_scope, f"{secret_key}_password")
        host = dbutils.secrets.get(secret_scope, f"{secret_key}_host")
        database = dbutils.secrets.get(secret_scope, f"{secret_key}_database")

        # The URL carries only host and database. Credentials go through the
        # reader's dedicated options so that special characters in the
        # password cannot corrupt the URL and the secret is not part of a
        # string that may be echoed in driver error messages.
        jdbc_url = f"jdbc:mysql://{host}/{database}"

        # Load data using JDBC
        df = (
            spark.read.format("jdbc")
            .option("url", jdbc_url)
            .option("user", user)
            .option("password", password)
            .option("query", query)
            .load()
        )
        # count() is an action, so connection and query errors surface here,
        # inside the try block, rather than at first use by the caller.
        logger.info(f"Loaded data from MySQL: {df.count()} records")
        return df
    except Exception as e:
        logger.error(f"Error loading data from MySQL: {str(e)}")
        raise

# Load data from Unity Catalog tables
def load_unity_catalog_table(table_path):
    """Read the table at ``table_path`` and return it as a DataFrame.

    Args:
        table_path: Table identifier passed to ``spark.table``.

    Returns:
        The loaded DataFrame, or ``None`` when reading or counting the table
        raises; the failure is logged instead of being propagated.
    """
    try:
        table_df = spark.table(table_path)
        record_count = table_df.count()
        logger.info(f"Loaded data from Unity Catalog table {table_path}: {record_count} records")
    except Exception as e:
        logger.error(f"Error loading data from Unity Catalog table {table_path}: {str(e)}")
        table_df = None
    return table_df

# Source loads, executed in the same order as the workflow nodes they mirror.
# Each MySQL result starts as None and is only replaced when the load
# succeeds; a failure is logged and the None placeholder is kept.

# Node 99: MySQL pjotr_prod
mysql_query_99 = "SELECT pjotr_prod.* FROM pjotr_prod"
mysql_df_99 = None
try:
    mysql_df_99 = load_mysql_data_jdbc(mysql_query_99, "mysql_scope", "mysql_key_99")
except Exception as e:
    logger.error(f"Failed to load MySQL data for Node 99: {str(e)}")

# Node 330: static TextInput data
text_input_df = load_unity_catalog_table("catalog.db.text_input")
if text_input_df is None:
    logger.warning("TextInput table not found, proceeding with alternative logic or skipping this step.")

# Nodes 499, 500, 384 and 291: C04_BSEG, C04_EKPO, PJOTR_in_PES and
# C19_ivl_data, loaded one after another in that order.
bseg_df, ekpo_df, pjotr_in_pes_df, ivl_data_df = map(
    load_unity_catalog_table,
    (
        "catalog.db.C04_BSEG",
        "catalog.db.C04_EKPO",
        "catalog.db.PJOTR_in_PES",
        "catalog.db.C19_ivl_data",
    ),
)

# Node 403: mysql_editable (pjotr_prod)
mysql_query_403 = "SELECT pjotr_prod.* FROM pjotr_prod"
mysql_df_403 = None
try:
    mysql_df_403 = load_mysql_data_jdbc(mysql_query_403, "mysql_scope", "mysql_key_403")
except Exception as e:
    logger.error(f"Failed to load MySQL data for Node 403: {str(e)}")

# Node 98: mysql_editable (pjotr)
mysql_query_98 = "SELECT pjotr.* FROM pjotr"
mysql_df_98 = None
try:
    mysql_df_98 = load_mysql_data_jdbc(mysql_query_98, "mysql_scope", "mysql_key_98")
except Exception as e:
    logger.error(f"Failed to load MySQL data for Node 98: {str(e)}")

# Nodes 401 and 249: PJOTR_ and PES_prep
pjotr_df, pes_prep_df = map(
    load_unity_catalog_table,
    (
        "catalog.db.PJOTR_",
        "catalog.db.PES_prep",
    ),
)

# Apply Multi-Field Formula Transformation
def apply_transformations(df):
    """Add the cleaned columns ``_field1`` and ``_field2`` to ``df``.

    ``_field1`` is ``field1`` with the values "#", "UNMAPPED" and "NULL"
    replaced by null. ``_field2`` is ``field2`` with its first two characters
    removed when it starts with "00", and ``field2`` unchanged otherwise.

    Args:
        df: DataFrame providing the ``field1`` and ``field2`` columns.

    Returns:
        ``df`` with the two extra columns appended.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        field1 = F.col("field1")
        field2 = F.col("field2")

        # Placeholder values are treated as missing.
        cleaned_field1 = F.when(field1.isin("#", "UNMAPPED", "NULL"), None).otherwise(field1)

        # Keep everything from the third character on when the value starts with "00".
        trimmed_field2 = F.when(
            field2.startswith("00"),
            F.expr("substring(field2, 3, length(field2))"),
        ).otherwise(field2)

        transformed_df = df.withColumn("_field1", cleaned_field1)
        transformed_df = transformed_df.withColumn("_field2", trimmed_field2)

        record_count = transformed_df.count()
        logger.info(f"Applied transformations: {record_count} records")
        return transformed_df
    except Exception as e:
        logger.error(f"Error applying transformations: {str(e)}")
        raise

# Main pipeline: runs only when the Node 99 MySQL load succeeded. It cleans the
# data, joins it with TextInput, derives new_field, unions it with the BSEG and
# EKPO tables, filters and aggregates, then writes the summary to the output
# tables.
if mysql_df_99 is not None:
    transformed_df = apply_transformations(mysql_df_99)
    # Select Relevant Fields
    selected_df = transformed_df.select("_field1", "_field2")

    # Perform Join Operations
    def perform_joins(df1, df2, join_condition):
        """Inner-join ``df1`` with ``df2`` on ``join_condition``.

        Returns ``df1`` unchanged, with a warning, when ``df2`` is None.
        Any failure is logged and re-raised.
        """
        try:
            if df2 is not None:
                joined_df = df1.join(df2, join_condition, "inner")
                logger.info(f"Performed join: {joined_df.count()} records")
                return joined_df
            else:
                logger.warning("Skipping join operation due to missing DataFrame.")
                return df1
        except Exception as e:
            logger.error(f"Error performing join: {str(e)}")
            raise

    joined_df = perform_joins(selected_df, text_input_df, "_field1")

    # Apply Custom Formula Calculations
    def apply_custom_calculations(df):
        """Add ``new_field`` computed from the SQL expression ``field1 + field2``.

        Any failure is logged and re-raised.
        """
        try:
            # NOTE(review): selected_df carries only _field1/_field2, so this
            # expression resolves only if the joined table supplies field1 and
            # field2 -- confirm whether _field1 + _field2 was intended.
            calculated_df = df.withColumn("new_field", F.expr("field1 + field2"))
            logger.info(f"Applied custom calculations: {calculated_df.count()} records")
            return calculated_df
        except Exception as e:
            logger.error(f"Error applying custom calculations: {str(e)}")
            raise

    calculated_df = apply_custom_calculations(joined_df)

    # Union Data Streams
    def union_dataframes(df_list):
        """Union the DataFrames in ``df_list`` in order, skipping None entries.

        Entries are None when an upstream table load failed; they are dropped
        with a warning, mirroring how perform_joins treats a missing input.

        Raises:
            ValueError: If no DataFrame is left to union.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            available = [df for df in df_list if df is not None]
            missing_count = len(df_list) - len(available)
            if missing_count:
                logger.warning(f"Skipping {missing_count} missing DataFrame(s) in union.")
            if not available:
                raise ValueError("No DataFrames available to union.")

            # DataFrame.union matches columns by position, so the result keeps
            # the column names of the first available DataFrame.
            union_df = available[0]
            for df in available[1:]:
                union_df = union_df.union(df)
            logger.info(f"Unioned dataframes: {union_df.count()} records")
            return union_df
        except Exception as e:
            logger.error(f"Error unioning dataframes: {str(e)}")
            raise

    union_df = union_dataframes([calculated_df, bseg_df, ekpo_df])

    # Filter Data
    filtered_df = union_df.filter(F.col("new_field") > 100)

    # Summarize Data
    summary_df = filtered_df.groupBy("_field1").agg(F.sum("new_field").alias("sum_new_field"))

    # Write Output to Unity Catalog tables
    def write_to_unity_catalog(df, catalog, schema, table):
        """Overwrite ``catalog.schema.table`` with ``df`` as a Delta table.

        The schema is created first if it does not exist. The record count is
        logged after the write. Any failure is logged and re-raised.
        """
        try:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
            df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.{table}")
            logger.info(f"Written data to {catalog}.{schema}.{table}: {df.count()} records")
        except Exception as e:
            logger.error(f"Error writing data to {catalog}.{schema}.{table}: {str(e)}")
            raise

    # The same summary is written to every output table.
    # NOTE(review): "C03_pjotr" and "C03_PJOTR" differ only by case; presumably
    # they resolve to the same table if identifiers are case-insensitive --
    # confirm whether both writes are needed.
    write_to_unity_catalog(summary_df, "catalog_name", "schema_name", "C03_pjotr")
    write_to_unity_catalog(summary_df, "catalog_name", "schema_name", "C03_pjotr_midway")
    write_to_unity_catalog(summary_df, "catalog_name", "schema_name", "C03_unmapped_to_PJOTR")
    write_to_unity_catalog(summary_df, "catalog_name", "schema_name", "C03_PJOTR")
    write_to_unity_catalog(summary_df, "catalog_name", "schema_name", "PJOTR_in_PES")
else:
    logger.error("MySQL data for Node 99 is not available, skipping transformations and subsequent steps.")
