In [None]:
import logging
import psycopg2
from pyspark.sql import functions as F

# Logging setup: create this module's logger, then configure root logging at INFO
logger = logging.getLogger(__name__)
logging.basicConfig(level="INFO")

# Function to connect to PostgreSQL and fetch data
def fetch_data_from_postgres(query, host, dbname, user, password):
    """Run ``query`` against a PostgreSQL database and return all result rows.

    A new connection is opened for the call and is always closed again,
    whether the query succeeds or fails.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        host: Host of the PostgreSQL server.
        dbname: Name of the database to connect to.
        user: User name used for authentication.
        password: Password used for authentication.

    Returns:
        The result of ``cursor.fetchall()`` for the executed query.

    Raises:
        Exception: Any error raised while connecting, executing or fetching
            is logged and then re-raised unchanged.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import section being edited.
    from contextlib import closing

    try:
        # closing() guarantees close() on every exit path. psycopg2's own
        # ``with connection`` only scopes a transaction and leaves the
        # connection open, so it is not used here.
        with closing(
            psycopg2.connect(host=host, dbname=dbname, user=user, password=password)
        ) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(query)
                return cursor.fetchall()
    except Exception as e:
        logger.error(f"Error fetching data from PostgreSQL: {e}")
        raise

# Read the PostgreSQL connection settings from the "my_scope" secret scope
def _postgres_secret(secret_key):
    """Return the secret stored under ``secret_key`` in scope "my_scope"."""
    return dbutils.secrets.get(scope="my_scope", key=secret_key)


postgres_host = _postgres_secret("postgres_host")
postgres_dbname = _postgres_secret("postgres_dbname")
postgres_user = _postgres_secret("postgres_user")
postgres_password = _postgres_secret("postgres_password")

# Pull every row of the legacy table from PostgreSQL
postgres_query = "SELECT * FROM legacy_table"
postgres_data = fetch_data_from_postgres(
    query=postgres_query,
    host=postgres_host,
    dbname=postgres_dbname,
    user=postgres_user,
    password=postgres_password,
)

# Read the source table from Unity Catalog; log and re-raise on failure
try:
    source_df = spark.table("catalog.source_db.source_table")
except Exception as load_error:
    logger.error(f"Error loading source data from Unity Catalog: {load_error}")
    raise
else:
    logger.info("Source data loaded successfully from Unity Catalog")

# Transformation logic: join -> aggregate -> filter -> derive discount column
try:
    # Inner-join the source rows to the dimension table on matching "key" values
    join_df = spark.table("catalog.source_db.dimension_table")
    transformed_df = source_df.join(
        join_df,
        on=(source_df.key == join_df.key),
        how="inner",
    )

    # Sum "amount" per category
    aggregated_df = (
        transformed_df
        .groupBy("category")
        .agg(F.sum("amount").alias("total_amount"))
    )

    # Keep only categories whose total exceeds 1000
    filtered_df = aggregated_df.where(F.col("total_amount") > 1000)

    # Add the total scaled by 0.9 as "discounted_amount"
    final_df = filtered_df.withColumn(
        "discounted_amount",
        F.col("total_amount") * 0.9,
    )
except Exception as transform_error:
    logger.error(f"Error during data transformation: {transform_error}")
    raise
else:
    logger.info("Data transformation completed successfully")

# Output handling: make sure the target schema exists, then overwrite the target table
try:
    target_catalog = "catalog_name"
    target_schema = "schema_name"
    target_table = "table_name"

    # Build the fully qualified names once and reuse them below
    qualified_schema = f"{target_catalog}.{target_schema}"
    qualified_table = f"{qualified_schema}.{target_table}"

    # Create the schema when it is missing
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {qualified_schema}")
    logger.info(f"Schema {qualified_schema} ensured")

    # Persist the result as a Delta table, replacing any existing contents
    (
        final_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(qualified_table)
    )
    logger.info(f"Data written successfully to {qualified_table}")
except Exception as write_error:
    logger.error(f"Error writing data to Unity Catalog: {write_error}")
    raise

# Performance optimizations
try:
    # cache() is lazy: it only marks transformed_df for caching, so it can
    # benefit actions run after this point, not the write performed earlier.
    transformed_df.cache()

    # Use broadcast joins for small dimension tables
    broadcast_df = spark.table("catalog.source_db.small_dimension_table")

    # final_df is the aggregated result (category, total_amount,
    # discounted_amount). Without a "key" column, `final_df.key` raises
    # AttributeError and fails the job after the target table has already
    # been written, so the join is only attempted when the column exists.
    if "key" in final_df.columns:
        final_df = final_df.join(
            F.broadcast(broadcast_df),
            final_df.key == broadcast_df.key,
            "inner",
        )
    else:
        logger.warning(
            "Skipping broadcast join: final_df has no 'key' column "
            f"(columns: {final_df.columns})"
        )

    logger.info("Performance optimizations applied successfully")
except Exception as e:
    logger.error(f"Error during performance optimizations: {e}")
    raise

logger.info("ETL workflow completed successfully")
