# Databricks notebook source
# COMMAND ----------

# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on Cardinal Health data, integrating various datasets, performing calculations, and generating projections.

# COMMAND ----------

# Import necessary libraries
import logging
from pyspark.sql import functions as F

# COMMAND ----------

# Logging setup: create the notebook-scoped logger, then configure the root
# logger so INFO-level progress messages from the ETL steps are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------

# ETL pipeline: load -> integrate -> compensation -> de-duplicate -> project -> write.
#
# The whole pipeline deliberately lives in ONE cell. Each `# COMMAND` cell is
# executed as its own unit, so a `try:` opened in one cell cannot be closed by
# an `except` in a later cell. The statements are also plain Python now: the
# `# MAGIC` prefix is reserved for magic-command cells (e.g. `%md`) and had
# turned every line of the pipeline into inert comment text.
#
# Relies on names provided outside this cell: `spark` (the notebook's Spark
# session), `F` (pyspark.sql.functions) and `logger`.
try:
    # ------------------------------------------------------------------
    # Load data from Unity Catalog tables
    # ------------------------------------------------------------------
    logger.info("Loading data from Unity Catalog tables...")
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospitals_stats")
    sales_assignments_df = spark.table("genai_demo.cardinal_health.hospital_assignments")
    employment_details_df = spark.table("genai_demo.cardinal_health.associates_employment")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    # NOTE(review): the four tables below are loaded but never referenced by any
    # later step in this cell. They are kept so the same names stay defined.
    logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    company_goals_df = spark.table("genai_demo.cardinal_health.company_goals_1")
    historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales_trending")
    third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_trends")

    # ------------------------------------------------------------------
    # Step 1: Data Integration
    # ------------------------------------------------------------------
    logger.info("Integrating hospital statistics with sales assignments...")
    # Join on column NAMES: this performs an equi-join and keeps a single copy
    # of each key column. The previous list of bare `F.col(...)` objects is
    # interpreted as join *conditions*, which are neither boolean nor
    # unambiguous between the two sides.
    integrated_df = hospital_stats_df.join(
        sales_assignments_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner",
    )

    # ------------------------------------------------------------------
    # Step 2: Compensation and Growth Calculations
    # ------------------------------------------------------------------
    logger.info("Joining employment details with compensation guidelines...")
    compensation_df = employment_details_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner",
    )
    logger.info("Calculating total compensation...")
    # Compensation = base + commission share of base + bonus.
    # NOTE(review): assumes Commission_Percentage is stored as a fraction
    # (0.05 for 5%), not as a whole-number percent -- TODO confirm.
    compensation_df = compensation_df.withColumn(
        "Compensation",
        F.col("Base_Salary")
        + (F.col("Commission_Percentage") * F.col("Base_Salary"))
        + F.col("Bonus"),
    )

    # ------------------------------------------------------------------
    # Step 3: Data Cleansing and Standardization
    # ------------------------------------------------------------------
    logger.info("Ensuring unique records...")
    # NOTE(review): Channel_ID / Channel_Type must already be present in
    # integrated_df (hospital stats + assignments); logistics_channels_df is
    # never joined in -- verify against the table schemas.
    unique_df = integrated_df.dropDuplicates(["Channel_ID", "Channel_Type", "Hospital_ID"])
    # NOTE(review): compensation_df and unique_df do not feed the final output
    # below; it is built from growth_opportunities_df only.

    # ------------------------------------------------------------------
    # Step 4: Predictive Modeling
    # ------------------------------------------------------------------
    logger.info("Generating future year projections...")
    # range() excludes the upper bound, so this yields 2023, 2024, 2025, 2026.
    future_years_df = spark.range(2023, 2027).toDF("Target_Year")
    logger.info("Calculating projected sales growth rate...")
    # One row per (growth opportunity, target year). The cartesian product is
    # intended, so request it explicitly instead of joining with no condition.
    projections_df = growth_opportunities_df.crossJoin(future_years_df)
    # 2023 keeps the base rate; every later year gets the same flat 5% uplift
    # (not compounded per year). NOTE(review): confirm the flat uplift is intended.
    projections_df = projections_df.withColumn(
        "projected_sales_growth_rate",
        F.when(F.col("Target_Year") == 2023, F.col("Projected_Growth_Rate"))
        .otherwise(F.col("Projected_Growth_Rate") * 1.05),
    )

    # ------------------------------------------------------------------
    # Step 5: Final Output
    # ------------------------------------------------------------------
    logger.info("Selecting and sorting final output...")
    # NOTE(review): all of these columns except Target_Year and
    # projected_sales_growth_rate must come from growth_opportunities_df --
    # verify they exist there (including the exact casing of the names).
    final_output_df = projections_df.select(
        "Channel_Type",
        "Hospital_ID",
        "Market_Trend",
        "Political_Impact",
        "Economic_Impact",
        "Target_Year",
        "projected_sales_growth_rate",
        "projected_investments",
        "Projected_Revenue",
    )
    final_output_df = final_output_df.orderBy("Target_Year")

    # ------------------------------------------------------------------
    # Write the processed data to a Unity Catalog table in Delta format
    # ------------------------------------------------------------------
    logger.info("Writing final output to Unity Catalog table...")
    # Overwrite in place instead of DROP TABLE + recreate: the table never
    # disappears for readers and its history/grants are kept. `overwriteSchema`
    # covers the schema-change case that the DROP used to handle.
    (
        final_output_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.cardinal_health.sales_prediction_output")
    )
except Exception:
    # Log with the traceback, then re-raise so the notebook/job run is marked
    # as failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process")
    raise
else:
    logger.info("ETL process completed successfully.")
