# In [None]:
import logging
import psycopg2
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

# Logging: create this module's logger, then configure the root logger at INFO.
# (logging.basicConfig does nothing if the root logger already has handlers.)
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Function to retrieve secrets
def get_secret(scope, key):
    """Look up a secret via ``dbutils.secrets.get`` without raising.

    Args:
        scope: Secret scope passed through to ``dbutils.secrets.get``.
        key: Secret key passed through to ``dbutils.secrets.get``.

    Returns:
        The value returned by ``dbutils.secrets.get(scope, key)``, or ``None``
        if the lookup raised any exception. The failure is logged at ERROR
        level and is not re-raised, so callers must check for ``None``.
    """
    try:
        value = dbutils.secrets.get(scope, key)
    except Exception as e:
        logger.error(f"Failed to retrieve secret {key} from scope {scope}: {str(e)}")
        return None
    else:
        logger.info(f"Successfully retrieved secret {key} from scope {scope}")
        return value

# Step 1: Read the source table from Unity Catalog into source_df
try:
    source_df = spark.table("catalog.source_db.source_table")
except AnalysisException as e:
    # Log the analysis failure, then re-raise so the workflow stops here.
    logger.error(f"Failed to load source data: {str(e)}")
    raise
else:
    logger.info("Loaded source data from Unity Catalog")

# Step 2: Connect to external PostgreSQL database and fetch data
#
# Connection settings are read from the "my_scope" secret scope via get_secret.
# On success pg_data holds the rows returned by cursor.fetchall() for
# legacy_table. Any failure is logged and re-raised; the cursor and connection
# are closed in all cases.
# NOTE(review): fetchall() loads the complete result set into this Python
# process — confirm legacy_table is small enough for that.
conn = None
cursor = None
try:
    secret_keys = ["pg_host", "pg_port", "pg_dbname", "pg_user", "pg_password"]
    secrets = {key: get_secret("my_scope", key) for key in secret_keys}

    # get_secret returns None when a lookup fails. Any falsy value is treated
    # as missing (same rule as before), but the offending keys are now named
    # so the error is actionable on its own.
    missing_keys = [key for key, value in secrets.items() if not value]
    if missing_keys:
        raise ValueError(
            "One or more secrets could not be retrieved: " + ", ".join(missing_keys)
        )

    conn = psycopg2.connect(
        host=secrets["pg_host"],
        port=secrets["pg_port"],
        dbname=secrets["pg_dbname"],
        user=secrets["pg_user"],
        password=secrets["pg_password"],
    )
    cursor = conn.cursor()
    # NOTE(review): SELECT * makes the column count/order depend on the table
    # definition, while the later createDataFrame call names exactly three
    # columns — confirm they match, or list the columns explicitly.
    cursor.execute("SELECT * FROM legacy_table")
    pg_data = cursor.fetchall()
    logger.info("Fetched data from PostgreSQL")
except Exception as e:
    logger.error(f"Failed to connect to PostgreSQL: {str(e)}")
    raise
finally:
    # Close the cursor and the connection independently: an error raised while
    # closing the cursor must not leave the connection open.
    try:
        if cursor is not None:
            cursor.close()
    finally:
        if conn is not None:
            conn.close()

# Step 3: Transform data
try:
    # Turn the PostgreSQL rows into a DataFrame. Only column names are given.
    # NOTE(review): presumably column types are inferred from the rows, in
    # which case an empty pg_data would fail here — confirm.
    pg_df = spark.createDataFrame(
        pg_data,
        schema=["column1", "column2", "column3"],
    )

    # Inner join: source rows whose id equals column1 of the PostgreSQL rows.
    transformed_df = source_df.join(
        pg_df,
        on=source_df["id"] == pg_df["column1"],
        how="inner",
    )

    # Derived column new_column = column2 * 2.
    transformed_df = transformed_df.withColumn(
        "new_column",
        F.expr("column2 * 2"),
    )
except Exception as e:
    logger.error(f"Data transformation failed: {str(e)}")
    raise
else:
    logger.info("Data transformation completed")

# Step 4: Write transformed data to Unity Catalog target table
#
# Creates the target schema if it does not exist, then overwrites the target
# Delta table with transformed_df. Any failure is logged and re-raised.
try:
    target_catalog = "catalog_name"
    target_schema = "schema_name"
    target_table = "table_name"

    # Ensure schema exists
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Write to Unity Catalog target table (replaces existing contents)
    transformed_df.write.format("delta").mode("overwrite").saveAsTable(
        f"{target_catalog}.{target_schema}.{target_table}"
    )
    logger.info(f"Data written to {target_catalog}.{target_schema}.{target_table}")
except Exception as e:
    # Catch broadly, like the PostgreSQL and transform steps: the write can
    # fail with errors that are not AnalysisException, and those previously
    # skipped this log entry. The exception is still re-raised unchanged.
    logger.error(f"Failed to write data to target table: {str(e)}")
    raise

# Step 5: Performance optimizations
# NOTE(review): this section runs after the Step 4 write, and nothing visible
# here consumes the cached/joined DataFrame afterwards — confirm that a later
# step uses transformed_df, otherwise these calls have no effect on the output.
try:
    # Mark the transformed DataFrame for caching
    transformed_df.cache()
    logger.info("Cached transformed DataFrame")

    # Join against the small table, with a broadcast hint, on the "key" column
    small_df = spark.table("catalog.db.small_table")
    transformed_df = transformed_df.join(
        F.broadcast(small_df),
        on="key",
    )
    logger.info("Applied broadcast join")
except Exception as e:
    logger.error(f"Performance optimization failed: {str(e)}")
    raise

logger.info("ETL workflow completed successfully")
