In [None]:
# Databricks notebook source
# COMMAND ----------

# MAGIC %md
# MAGIC # ETL Migration Process
# MAGIC This notebook performs an ETL migration process using data from Unity Catalog tables. It includes data loading, integration, calculations, filtering, and output to a Unity Catalog table.

# COMMAND ----------

# MAGIC
# MAGIC import logging
# MAGIC from pyspark.sql import functions as F
# MAGIC from pyspark.sql import SparkSession

# COMMAND ----------

# MAGIC
# MAGIC # Configure logging: request INFO-level output on the root logger and create
# MAGIC # the module-level logger that every step of this notebook writes to.
# MAGIC # NOTE(review): logging.basicConfig is a no-op when the root logger already has
# MAGIC # handlers attached - confirm INFO messages actually appear on the target cluster.
# MAGIC logging.basicConfig(level=logging.INFO)
# MAGIC logger = logging.getLogger(__name__)

# COMMAND ----------

# MAGIC
# MAGIC # Obtain the Spark session used by every function below. getOrCreate() reuses
# MAGIC # an already-running session when there is one and only builds a new session
# MAGIC # (named "ETL Migration") otherwise.
# MAGIC spark = SparkSession.builder.appName("ETL Migration").getOrCreate()

# COMMAND ----------

# MAGIC
# MAGIC def load_data():
# MAGIC     """Read the Unity Catalog source tables consumed by this pipeline.
# MAGIC
# MAGIC     Only the six tables that are actually returned are read. Earlier versions
# MAGIC     also read ``third_party_trends``, ``historical_sales_trending`` and
# MAGIC     ``company_goals_1`` into locals that were never used or returned; those
# MAGIC     reads were removed so a problem with an unused table cannot fail the run.
# MAGIC
# MAGIC     Returns:
# MAGIC         tuple: six DataFrames, in this order: hospitals_stats,
# MAGIC         associates_employment, compensation_guidelines, hospital_assignments,
# MAGIC         logistics_channels, growth_opportunities.
# MAGIC
# MAGIC     Raises:
# MAGIC         Exception: any error raised while reading a table is logged and
# MAGIC         re-raised unchanged.
# MAGIC     """
# MAGIC     source_schema = "genai_demo.cardinal_health"
# MAGIC     # Order matters: callers unpack the returned tuple positionally.
# MAGIC     source_tables = (
# MAGIC         "hospitals_stats",
# MAGIC         "associates_employment",
# MAGIC         "compensation_guidelines",
# MAGIC         "hospital_assignments",
# MAGIC         "logistics_channels",
# MAGIC         "growth_opportunities",
# MAGIC     )
# MAGIC     try:
# MAGIC         logger.info("Loading data from Unity Catalog tables...")
# MAGIC         # tuple() consumes the generator here, so every read happens inside the try.
# MAGIC         return tuple(
# MAGIC             spark.table(f"{source_schema}.{table_name}")
# MAGIC             for table_name in source_tables
# MAGIC         )
# MAGIC     except Exception as e:
# MAGIC         logger.error(f"An error occurred while loading data: {e}")
# MAGIC         raise

# COMMAND ----------

# MAGIC
# MAGIC def integrate_data(hospitals_stats_df, associates_employment_df, compensation_guidelines_df, hospital_assignments_df):
# MAGIC     """Join hospital and associate data and derive each row's total compensation.
# MAGIC
# MAGIC     Steps:
# MAGIC       1. Inner-join hospital stats to hospital assignments on ``Hospital_ID``
# MAGIC          and ``Hospital_Name``.
# MAGIC       2. Inner-join associates' employment to compensation guidelines on
# MAGIC          ``Associate_ID``.
# MAGIC       3. Inner-join the two results on ``Associate_ID``.
# MAGIC       4. Add ``Compensation`` =
# MAGIC          ``Base_Salary + Commission_Percentage * Base_Salary + Bonus``.
# MAGIC
# MAGIC     Args:
# MAGIC         hospitals_stats_df: DataFrame with ``Hospital_ID`` and ``Hospital_Name``.
# MAGIC         associates_employment_df: DataFrame with ``Associate_ID``.
# MAGIC         compensation_guidelines_df: DataFrame with ``Associate_ID``.
# MAGIC         hospital_assignments_df: DataFrame with ``Hospital_ID`` and
# MAGIC             ``Hospital_Name``.
# MAGIC
# MAGIC     Returns:
# MAGIC         The integrated DataFrame with the extra ``Compensation`` column.
# MAGIC
# MAGIC     Raises:
# MAGIC         Exception: any error raised while building the joins is logged and
# MAGIC         re-raised unchanged.
# MAGIC     """
# MAGIC     try:
# MAGIC         logger.info("Performing data integration...")
# MAGIC         # Join on column *names*. The previous condition compared
# MAGIC         # F.col('Hospital_ID') with itself, which does not relate the left
# MAGIC         # frame to the right one (ambiguous or trivially true). Joining by
# MAGIC         # name is an equi-join on both keys and keeps one copy of each key.
# MAGIC         hospital_sales_df = hospitals_stats_df.join(
# MAGIC             hospital_assignments_df,
# MAGIC             ["Hospital_ID", "Hospital_Name"],
# MAGIC             "inner"
# MAGIC         )
# MAGIC
# MAGIC         employment_compensation_df = associates_employment_df.join(
# MAGIC             compensation_guidelines_df,
# MAGIC             "Associate_ID",
# MAGIC             "inner"
# MAGIC         )
# MAGIC
# MAGIC         # NOTE(review): presumably Associate_ID reaches hospital_sales_df via
# MAGIC         # hospital_assignments_df - confirm against the table schema.
# MAGIC         integrated_df = employment_compensation_df.join(
# MAGIC             hospital_sales_df,
# MAGIC             "Associate_ID",
# MAGIC             "inner"
# MAGIC         )
# MAGIC
# MAGIC         integrated_df = integrated_df.withColumn(
# MAGIC             "Compensation",
# MAGIC             F.col("Base_Salary") + (F.col("Commission_Percentage") * F.col("Base_Salary")) + F.col("Bonus")
# MAGIC         )
# MAGIC         return integrated_df
# MAGIC     except Exception as e:
# MAGIC         logger.error(f"An error occurred during data integration: {e}")
# MAGIC         raise

# COMMAND ----------

# MAGIC
# MAGIC def calculate_growth(logistics_channels_df, growth_opportunities_df, base_year=2023):
# MAGIC     """Join logistics channels to growth opportunities and project revenue.
# MAGIC
# MAGIC     Adds two columns to the inner join of the inputs on ``Channel_ID``:
# MAGIC       * ``Projected_Sales_Growth_Rate``: ``Projected_Growth_Rate`` when
# MAGIC         ``Target_Year`` is greater than ``base_year``, otherwise 0 (this
# MAGIC         includes rows whose ``Target_Year`` is null).
# MAGIC       * ``Projected_Revenue``:
# MAGIC         ``Sales_Revenue * (1 + Projected_Sales_Growth_Rate)``.
# MAGIC
# MAGIC     Args:
# MAGIC         logistics_channels_df: DataFrame with ``Channel_ID``.
# MAGIC         growth_opportunities_df: DataFrame with ``Channel_ID``.
# MAGIC         base_year: Target years strictly greater than this value receive the
# MAGIC             projected growth rate. Defaults to 2023, the value that was
# MAGIC             previously hard-coded.
# MAGIC
# MAGIC     Returns:
# MAGIC         The joined DataFrame with the two projection columns added.
# MAGIC
# MAGIC     Raises:
# MAGIC         Exception: any error raised while building the result is logged and
# MAGIC         re-raised unchanged.
# MAGIC     """
# MAGIC     try:
# MAGIC         logger.info("Joining logistics and growth opportunities...")
# MAGIC         logistics_growth_df = logistics_channels_df.join(
# MAGIC             growth_opportunities_df,
# MAGIC             "Channel_ID",
# MAGIC             "inner"
# MAGIC         )
# MAGIC
# MAGIC         logger.info("Calculating projected sales growth rate and revenue...")
# MAGIC         # Column-API equivalent of
# MAGIC         # CASE WHEN Target_Year > <base_year> THEN Projected_Growth_Rate ELSE 0 END;
# MAGIC         # it avoids formatting the year into a SQL string.
# MAGIC         projected_sales_growth_rate_expr = F.when(
# MAGIC             F.col("Target_Year") > base_year,
# MAGIC             F.col("Projected_Growth_Rate")
# MAGIC         ).otherwise(0)
# MAGIC         logistics_growth_df = logistics_growth_df.withColumn(
# MAGIC             "Projected_Sales_Growth_Rate",
# MAGIC             projected_sales_growth_rate_expr
# MAGIC         ).withColumn(
# MAGIC             "Projected_Revenue",
# MAGIC             F.col("Sales_Revenue") * (1 + F.col("Projected_Sales_Growth_Rate"))
# MAGIC         )
# MAGIC         return logistics_growth_df
# MAGIC     except Exception as e:
# MAGIC         logger.error(f"An error occurred during growth calculation: {e}")
# MAGIC         raise

# COMMAND ----------

# MAGIC
# MAGIC def filter_and_select(logistics_growth_df, base_year=2023):
# MAGIC     """Keep future target years and project the output columns.
# MAGIC
# MAGIC     Args:
# MAGIC         logistics_growth_df: DataFrame that contains ``Target_Year`` and every
# MAGIC             column listed in the final selection below.
# MAGIC         base_year: Only rows whose ``Target_Year`` is strictly greater than
# MAGIC             this value are kept. Defaults to 2023, the value that was
# MAGIC             previously hard-coded.
# MAGIC
# MAGIC     Returns:
# MAGIC         A DataFrame restricted to the ten output columns.
# MAGIC
# MAGIC     Raises:
# MAGIC         Exception: any error raised while filtering or selecting is logged
# MAGIC         and re-raised unchanged.
# MAGIC     """
# MAGIC     # Columns written to the target table, in output order.
# MAGIC     output_columns = (
# MAGIC         "Hospital_ID", "Channel_Type", "Investment_Planned", "Sales_Revenue",
# MAGIC         "Market_Trend", "Political_Impact", "Economic_Impact", "Target_Year",
# MAGIC         "Projected_Sales_Growth_Rate", "Projected_Revenue",
# MAGIC     )
# MAGIC     try:
# MAGIC         logger.info(f"Filtering data for target years greater than {base_year}...")
# MAGIC         filtered_df = logistics_growth_df.filter(F.col("Target_Year") > base_year)
# MAGIC
# MAGIC         logger.info("Selecting relevant fields for output...")
# MAGIC         return filtered_df.select(*output_columns)
# MAGIC     except Exception as e:
# MAGIC         logger.error(f"An error occurred during data filtering and selection: {e}")
# MAGIC         raise

# COMMAND ----------

# MAGIC
# MAGIC def sort_and_output(final_df):
# MAGIC     """Sort the result by target year and overwrite the Unity Catalog table.
# MAGIC
# MAGIC     The target ``genai_demo.cardinal_health.target_sales`` is replaced with a
# MAGIC     single Delta overwrite rather than a ``DROP TABLE`` followed by a
# MAGIC     re-create. With the drop, a failed write left no target table at all,
# MAGIC     and every run discarded the existing table object. ``overwriteSchema``
# MAGIC     keeps the old ability to write a result whose schema differs from the
# MAGIC     existing table.
# MAGIC
# MAGIC     Args:
# MAGIC         final_df: DataFrame to persist; must contain ``Target_Year``.
# MAGIC
# MAGIC     Raises:
# MAGIC         Exception: any error raised while sorting or writing is logged and
# MAGIC         re-raised unchanged.
# MAGIC     """
# MAGIC     target_table = "genai_demo.cardinal_health.target_sales"
# MAGIC     try:
# MAGIC         logger.info("Sorting data by Target Year...")
# MAGIC         sorted_df = final_df.orderBy("Target_Year")
# MAGIC
# MAGIC         logger.info("Writing the final dataset to Unity Catalog table...")
# MAGIC         (
# MAGIC             sorted_df.write
# MAGIC             .format("delta")
# MAGIC             .mode("overwrite")
# MAGIC             .option("overwriteSchema", "true")
# MAGIC             .saveAsTable(target_table)
# MAGIC         )
# MAGIC
# MAGIC         logger.info("ETL process completed successfully.")
# MAGIC     except Exception as e:
# MAGIC         logger.error(f"An error occurred during data sorting and output: {e}")
# MAGIC         raise

# COMMAND ----------

# MAGIC
# MAGIC # Main execution: load the source tables, build the integrated and growth
# MAGIC # DataFrames, then filter, sort and write the result.
# MAGIC try:
# MAGIC     hospitals_stats_df, associates_employment_df, compensation_guidelines_df, hospital_assignments_df, logistics_channels_df, growth_opportunities_df = load_data()
# MAGIC     # NOTE(review): integrated_df is not consumed by the steps below; it is kept
# MAGIC     # because later cells or other notebooks may rely on it - confirm.
# MAGIC     integrated_df = integrate_data(hospitals_stats_df, associates_employment_df, compensation_guidelines_df, hospital_assignments_df)
# MAGIC     logistics_growth_df = calculate_growth(logistics_channels_df, growth_opportunities_df)
# MAGIC     final_df = filter_and_select(logistics_growth_df)
# MAGIC     sort_and_output(final_df)
# MAGIC except Exception as e:
# MAGIC     logger.error(f"An error occurred during the ETL process: {e}")
# MAGIC     # Re-raise so the notebook/job run is reported as failed instead of
# MAGIC     # finishing "successfully" after a logged error.
# MAGIC     raise
