# In [None]:
import logging
import psycopg2
from pyspark.sql import functions as F

# Module-level logger for this notebook/script, plus root logging at INFO.
# (basicConfig is a no-op when the root logger already has handlers.)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# ETL pipeline:
#   1. read the source table from Unity Catalog,
#   2. pull every row of `legacy_table` from an external PostgreSQL database,
#   3. join, filter and aggregate the two data sets,
#   4. overwrite the Unity Catalog target Delta table with the result.
# Each step logs its own failure and re-raises; the outer handler logs a final
# "ETL process failed" message and re-raises so the notebook/job run fails.
# `spark` and `dbutils` are expected to be provided by the runtime.
try:
    # Step 1: Load data from Unity Catalog source tables
    logger.info("Loading data from Unity Catalog source tables")
    try:
        source_df = spark.table("catalog.source_db.source_table")
        logger.info("Data loaded successfully from Unity Catalog source tables")
    except Exception as e:
        logger.error(f"Failed to load data from Unity Catalog source tables: {e}")
        raise

    # Step 2: Connect to external PostgreSQL database and fetch data
    try:
        logger.info("Connecting to external PostgreSQL database")
        secret_keys = ["pg_host", "pg_port", "pg_dbname", "pg_user", "pg_password"]
        secrets = {}
        missing_secrets = []

        # Try every key before failing so that a single run reports all of the
        # secrets that could not be retrieved, not just the first one.
        for key in secret_keys:
            try:
                secrets[key] = dbutils.secrets.get(scope="my_scope", key=key)
            except Exception as e:
                logger.error(f"Failed to retrieve secret for key '{key}': {e}")
                missing_secrets.append(key)

        if missing_secrets:
            raise ValueError(f"Missing secrets for keys: {', '.join(missing_secrets)}")

        conn = psycopg2.connect(
            host=secrets["pg_host"],
            port=int(secrets["pg_port"]),  # secrets are strings; cast the port to int
            dbname=secrets["pg_dbname"],
            user=secrets["pg_user"],
            password=secrets["pg_password"],
        )
        try:
            # The cursor context manager closes the cursor on exit, even if the
            # query raises.
            with conn.cursor() as cursor:
                cursor.execute("SELECT * FROM legacy_table")
                # fetchall() materialises the whole result set in driver memory.
                legacy_data = cursor.fetchall()
        finally:
            # Always release the database session; previously the connection
            # was left open on both the success and the failure path.
            conn.close()
        logger.info("Data fetched successfully from PostgreSQL database")
    except Exception as e:
        logger.error(f"Failed to connect to PostgreSQL database: {e}")
        raise

    # Step 3: Transform data
    try:
        logger.info("Transforming data")
        # Column names are assigned positionally to the fetched rows, so this
        # relies on legacy_table having exactly these four columns in this order.
        # NOTE(review): with only column names given, Spark has to infer types
        # from the rows; an empty result set presumably fails here - confirm.
        legacy_df = spark.createDataFrame(legacy_data, schema=["key", "column1", "column2", "column3"])
        transformed_df = source_df.join(legacy_df, source_df["key"] == legacy_df["key"], "inner")

        # Keep rows with column1 > 100, then sum column3 per column2 value.
        # NOTE(review): these names are resolved against the joined frame; if
        # the source table also has column1/column2/column3 the references
        # would be ambiguous - verify against the source schema.
        transformed_df = transformed_df.filter(F.col("column1") > 100)
        transformed_df = transformed_df.groupBy("column2").agg(F.sum("column3").alias("total_column3"))
        logger.info("Data transformation completed successfully")
    except Exception as e:
        logger.error(f"Data transformation failed: {e}")
        raise

    # Step 4: Write transformed data to Unity Catalog target tables
    try:
        logger.info("Writing transformed data to Unity Catalog target tables")
        target_catalog = "catalog_name"
        target_schema = "schema_name"
        target_table = "table_name"

        # Ensure schema exists before creating table
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
        logger.info(f"Schema {target_catalog}.{target_schema} ensured")

        # Overwrite mode replaces the contents of the target Delta table.
        transformed_df.write.format("delta").mode("overwrite").saveAsTable(f"{target_catalog}.{target_schema}.{target_table}")
        logger.info("Data written successfully to Unity Catalog target tables")
    except Exception as e:
        logger.error(f"Failed to write data to Unity Catalog target tables: {e}")
        raise

except Exception as e:
    # Final catch-all: record the overall failure, then propagate the original
    # exception so the caller/job scheduler sees the run as failed.
    logger.error(f"ETL process failed: {e}")
    raise
