In [None]:
import logging
import psycopg2
from pyspark.sql import functions as F

# Logging setup: create this module's logger, then configure the root logger
# at INFO level (basicConfig does nothing if the root logger already has
# handlers installed).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Function to connect to PostgreSQL and fetch data
def fetch_data_from_postgresql(query, host, port, database, user, password):
    """Run ``query`` against a PostgreSQL database and return all rows.

    A new connection is opened for the call and is always closed again,
    whether the query succeeds or fails.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        host: Database server host name or address.
        port: Server port; converted with ``int()`` so a string is accepted.
        database: Name of the database to connect to.
        user: Login user name.
        password: Login password.

    Returns:
        The rows produced by ``cursor.fetchall()``.

    Raises:
        Exception: Any error raised while connecting, executing, fetching or
            closing is logged and then re-raised unchanged.
    """
    from contextlib import closing

    try:
        # closing() guarantees close() runs on both objects even when
        # execute()/fetchall() raises, so no cursor or connection is leaked.
        with closing(
            psycopg2.connect(
                host=host,
                port=int(port),  # Ensure port is an integer
                database=database,
                user=user,
                password=password,
            )
        ) as connection, closing(connection.cursor()) as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    except Exception as e:
        logger.error(f"Error fetching data from PostgreSQL: {e}")
        raise

# Retrieve PostgreSQL credentials securely
# All five connection settings are read from the "my_scope" secret scope, in
# the order host, port, database, user, password.
try:
    pg_host, pg_port, pg_database, pg_user, pg_password = (
        dbutils.secrets.get(scope="my_scope", key=secret_key)
        for secret_key in ("pg_host", "pg_port", "pg_database", "pg_user", "pg_password")
    )
except Exception as e:
    logger.error(f"Error retrieving secrets: {e}")
    # Any failure above replaces ALL five settings with the fallback values.
    # NOTE(review): these look like placeholders hard-coded in source --
    # confirm that falling back to them (rather than failing) is intended.
    pg_host, pg_port, pg_database, pg_user, pg_password = (
        "valid_default_host",  # Ensure this is a valid host
        5432,  # Use a valid default integer port
        "default_database",
        "default_user",
        "default_password",
    )
    logger.warning("Using default PostgreSQL credentials due to missing secrets")

# Validate host name resolution
# Fail early, before attempting a database connection, if pg_host cannot be
# resolved; the resolution error is logged and re-raised.
import socket
try:
    socket.gethostbyname(pg_host)
except OSError as e:  # socket.error is an alias of OSError
    logger.error(f"Host name resolution failed for {pg_host}: {e}")
    raise
else:
    logger.info(f"Host {pg_host} resolved successfully")

# Fetch data from PostgreSQL
# Pulls every row of legacy_table using the credentials resolved above; a
# failure is logged here and re-raised so the run stops.
query = "SELECT * FROM legacy_table"
try:
    postgresql_data = fetch_data_from_postgresql(
        query=query,
        host=pg_host,
        port=pg_port,
        database=pg_database,
        user=pg_user,
        password=pg_password,
    )
except Exception as e:
    logger.error(f"Failed to fetch data from PostgreSQL: {e}")
    raise

# Convert PostgreSQL data to Spark DataFrame
# The fetched rows are given three fixed column names.
# NOTE(review): the rows come from a "SELECT *" query, so this presumably
# relies on legacy_table having exactly three columns in this order, and on at
# least one row being present for type inference -- confirm.
postgresql_df = spark.createDataFrame(
    postgresql_data,
    schema=["column1", "column2", "column3"],
)

# Load data from Unity Catalog source table
# A failure to reference the table is logged and re-raised.
try:
    source_df = spark.table("catalog.source_db.source_table")
except Exception as e:
    logger.error(f"Error loading source data from Unity Catalog: {e}")
    raise
else:
    logger.info("Source data loaded successfully from Unity Catalog")

# Perform transformations
# Pipeline: left-outer join of the source table with the PostgreSQL rows on
# "key", keep rows with column1 > 100, then sum column3 per column2 value.
# NOTE(review): postgresql_df is created with the column names column1,
# column2 and column3 only -- confirm it really exposes a "key" column,
# otherwise the join condition below cannot be resolved.
try:
    transformed_df = (
        source_df.join(
            postgresql_df,
            source_df["key"] == postgresql_df["key"],
            "left_outer",
        )
        .filter(F.col("column1") > 100)
        .groupBy("column2")
        .agg(F.sum("column3").alias("total_column3"))
    )

    # Mark the aggregated result for caching before it is written out.
    transformed_df.cache()
    logger.info("Data transformations completed successfully")
except Exception as e:
    logger.error(f"Error during data transformations: {e}")
    raise

# Write transformed data to Unity Catalog target table
# Three-part destination name: <catalog>.<schema>.<table>.
target_catalog = "catalog_name"
target_schema = "schema_name"
target_table = "table_name"

try:
    # The schema must exist before a table can be saved into it.
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Save as a Delta table in overwrite mode.
    (
        transformed_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(f"{target_catalog}.{target_schema}.{target_table}")
    )
    logger.info(f"Data written successfully to {target_catalog}.{target_schema}.{target_table}")
except Exception as e:
    logger.error(f"Error writing data to Unity Catalog target table: {e}")
    raise
