# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg
import logging

# Databricks notebooks provide a pre-initialized `spark` session, so none is
# created here. To run outside Databricks, uncomment the following line:
# spark = SparkSession.builder.getOrCreate()

# Configure the notebook logger.
# logging.getLogger returns the same logger object for a given name, so
# re-running this cell would otherwise stack another StreamHandler on it and
# print every message once per run. An already-attached plain StreamHandler is
# reused instead; `handler` and `formatter` stay defined either way.
logger = logging.getLogger("DatabricksLogger")
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = next(
    # Exact type match on purpose: FileHandler and similar classes subclass
    # StreamHandler and must not be mistaken for the console handler.
    (h for h in logger.handlers if type(h) is logging.StreamHandler),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    logger.addHandler(handler)
# Always (re)apply the formatter so a reused handler matches this cell's format.
handler.setFormatter(formatter)

# Read the source table from Unity Catalog.
# Any exception is logged and source_df is set to None; the join step below
# checks for None and skips itself in that case.
try:
    source_df = spark.table("source_catalog.source_db.source_table")
    logger.info("Successfully loaded source table from Unity Catalog")
except Exception as exc:
    source_df = None
    logger.error(f"Failed to load source table from Unity Catalog: {str(exc)}")

# Read the legacy table from the external PostgreSQL database over JDBC.
# Credentials come from the Databricks secret scope "my_scope". Any exception
# (secret lookup or read) is logged and external_df is set to None so the join
# step below skips itself.
try:
    jdbc_url = "jdbc:postgresql://legacy-db-host:5432/legacy_db"
    connection_properties = dict(
        user=dbutils.secrets.get(scope="my_scope", key="legacy_db_user"),
        password=dbutils.secrets.get(scope="my_scope", key="legacy_db_password"),
    )
    external_df = spark.read.jdbc(
        jdbc_url,
        "(SELECT * FROM legacy_table) AS legacy_table",
        properties=connection_properties,
    )
    logger.info("Successfully loaded data from external PostgreSQL system")
except Exception as exc:
    external_df = None
    logger.error(f"Failed to load data from external PostgreSQL system: {str(exc)}")

# Inner-join the two inputs on their `id` columns.
# The join is attempted only when both loads succeeded; otherwise (or if
# building the join raises) joined_df is None and the error is logged.
if source_df is None or external_df is None:
    logger.error("Join operation skipped due to missing DataFrames")
    joined_df = None
else:
    try:
        joined_df = source_df.alias("left_table").join(
            external_df.alias("right_table"),
            on=col("left_table.id") == col("right_table.id"),
            how="inner",
        )
        logger.info("Successfully performed join operation")
    except Exception as exc:
        joined_df = None
        logger.error(f"Join operation failed: {str(exc)}")

# Keep only the joined rows whose `status` column equals "active".
# Skipped (filtered_df = None) when there is no joined DataFrame; a failure
# while building the filter is logged and also yields None.
if joined_df is None:
    logger.error("Filter operation skipped due to missing joined DataFrame")
    filtered_df = None
else:
    try:
        filtered_df = joined_df.where(col("status") == "active")
        logger.info("Successfully filtered joined data")
    except Exception as exc:
        filtered_df = None
        logger.error(f"Filter operation failed: {str(exc)}")

# Aggregate `amount` per `category`: total and average.
# NOTE: `sum` and `avg` here are the pyspark.sql.functions versions imported
# at the top of the file, not the Python builtins.
# Skipped (aggregated_df = None) when there is no filtered DataFrame; a
# failure while building the aggregation is logged and also yields None.
if filtered_df is None:
    logger.error("Aggregation operation skipped due to missing filtered DataFrame")
    aggregated_df = None
else:
    try:
        aggregated_df = (
            filtered_df
            .groupBy("category")
            .agg(
                sum("amount").alias("total_amount"),
                avg("amount").alias("average_amount"),
            )
        )
        logger.info("Successfully performed aggregation")
    except Exception as exc:
        aggregated_df = None
        logger.error(f"Aggregation operation failed: {str(exc)}")

# Persist the aggregated result as a Delta table in Unity Catalog.
# The target schema is created first if it does not exist, then the table is
# written in overwrite mode. Failures are logged; nothing is written when
# there is no aggregated DataFrame.
if aggregated_df is None:
    logger.error("Write operation skipped due to missing aggregated DataFrame")
else:
    try:
        spark.sql("CREATE SCHEMA IF NOT EXISTS target_catalog.target_db")
        logger.info("Schema ensured before table creation")
        (
            aggregated_df.write
            .format("delta")
            .mode("overwrite")
            .saveAsTable("target_catalog.target_db.target_table")
        )
        logger.info("Successfully wrote aggregated data to Unity Catalog")
    except Exception as exc:
        logger.error(f"Failed to write data to Unity Catalog: {str(exc)}")
