In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process with PySpark
# MAGIC This notebook performs an ETL process using PySpark, loading data from a flat file and SQL Server, transforming it, and saving it to a Unity Catalog table.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F

# Initialize logging
# basicConfig() is a no-op when the root logger already has handlers attached
# (which a hosting runtime may have done before this cell runs), so the level
# is also set on this logger directly to guarantee INFO messages are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------
# MAGIC
# Fetch the SQL Server login from the "jdbc_scope" secret scope so that no
# credential is hard-coded in the notebook (username first, then password).
jdbc_username, jdbc_password = (
    dbutils.secrets.get(scope="jdbc_scope", key=secret_key)
    for secret_key in ("username", "password")
)

# Connection target and the properties handed to every JDBC read below.
jdbc_url = (
    "jdbc:sqlserver://vsco-sqlserver.ctiklfxgg9js.us-east-2.rds.amazonaws.com"
    ";databaseName=vsco"
)
connection_properties = dict(
    user=jdbc_username,
    password=jdbc_password,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

# COMMAND ----------
# MAGIC
# Step 1: Data Source Configuration
# Read the currency sample file as CSV, taking the column names from its
# header row. Any failure is logged and then re-raised to stop the run.
flat_file_path = "dbfs:/mnt/sample_data/SampleCurrencyData.txt"
try:
    flat_file_df = spark.read.load(flat_file_path, format="csv", header="true")
    logger.info("Flat file data loaded successfully.")
except Exception as load_error:
    logger.error(f"Error loading flat file data: {load_error}")
    raise

def _read_dimension(table_name):
    """Return the named SQL Server table as a DataFrame read over JDBC."""
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=connection_properties,
    )


# Pull both dimension tables used by the key lookups. Any failure is logged
# and then re-raised to stop the run.
try:
    dim_currency_df = _read_dimension("AdventureWorksDW2012.DimCurrency")
    dim_date_df = _read_dimension("AdventureWorksDW2012.DimDate")
    logger.info("SQL Server data loaded successfully.")
except Exception as jdbc_error:
    logger.error(f"Error loading SQL Server data: {jdbc_error}")
    raise

# COMMAND ----------
# MAGIC
# Step 2: Data Transformation
# Each lookup is an inner join that keeps the incoming rows' columns and
# appends exactly one key column from the dimension table. Selecting "*" plus
# the key would emit the key twice (it is already part of "*"), and a result
# with duplicate column names cannot be saved as a Delta table.
try:
    # Bind every join column to its own DataFrame so resolution never depends
    # on column names being unique across both sides of the join.
    currency_join_condition = (
        flat_file_df["CurrencyID"] == dim_currency_df["CurrencyAlternateKey"]
    )

    # Currency lookup: flat-file columns + CurrencyKey.
    enriched_currency_df = flat_file_df.join(
        dim_currency_df,
        currency_join_condition,
        "inner",
    ).select(flat_file_df["*"], dim_currency_df["CurrencyKey"])
    logger.info("Currency key lookup completed successfully.")

    # NOTE(review): the flat file is read without a schema, so CurrencyDate is
    # presumably a string; confirm it compares correctly against
    # FullDateAlternateKey, otherwise this inner join silently drops rows.
    date_join_condition = (
        enriched_currency_df["CurrencyDate"] == dim_date_df["FullDateAlternateKey"]
    )

    # Date lookup: everything carried so far + DateKey.
    final_df = enriched_currency_df.join(
        dim_date_df,
        date_join_condition,
        "inner",
    ).select(enriched_currency_df["*"], dim_date_df["DateKey"])
    logger.info("Date key lookup completed successfully.")
except Exception as e:
    logger.error(f"Error during data transformation: {e}")
    raise

# COMMAND ----------
# MAGIC
# Step 3: Data Loading
# Fully qualified Unity Catalog name (catalog.schema.table) of the destination.
target_table = "catalog_name.schema_name.Sample_OLE_DB_Destination"
try:
    # Replace the table with a single overwrite rather than dropping it first:
    # a drop followed by a write leaves a window with no table, loses the
    # table entirely if the write fails, and discards its history.
    # overwriteSchema lets the same overwrite also replace an outdated schema.
    (
        final_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table)
    )
    logger.info("Data successfully written to Unity Catalog table.")
except Exception as e:
    logger.error(f"Error writing data to Unity Catalog: {e}")
    raise

# COMMAND ----------
# MAGIC
# Performance Optimizations
# Optional tuning, intentionally left commented out.
# Caching only pays off when a DataFrame is reused by a later action; no
# statement after this point reads either DataFrame, so enabling these two
# lines here would not speed up the steps above.
# enriched_currency_df.cache()
# final_df.cache()

# NOTE(review): setting spark.sql.autoBroadcastJoinThreshold to -1 turns
# automatic broadcast joins OFF rather than on. To broadcast a small dimension
# table, wrap it with F.broadcast(...) in the join instead. Confirm the
# intent before uncommenting.
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# When the notebook runs top to bottom this line is reached only if no earlier
# step raised, because every step re-raises after logging its error.
logger.info("ETL process completed successfully.")
