In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Sales Prediction
# MAGIC This notebook performs an ETL process to load, transform, and save sales prediction data using PySpark and Databricks Unity Catalog.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F

# COMMAND ----------
# MAGIC
# Logging setup: a module-scoped logger for this notebook, with the root
# logger configured to emit INFO-level records and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# MAGIC
try:
    # End-to-end ETL: read the source tables, join them, keep projections for
    # years after 2023, and write the result to a Delta table.
    #
    # The whole pipeline is deliberately kept in ONE notebook cell: a
    # try/except statement cannot be split across "# COMMAND ----------" cell
    # boundaries, because each notebook cell is compiled on its own.

    # ---- Extract ----------------------------------------------------------
    # Read every source table from the genai_demo.cardinal_health schema.
    logger.info("Loading data from Unity Catalog tables...")
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospital_stats_north_america")
    sales_assignments_df = spark.table("genai_demo.cardinal_health.hospital_sales_assignments")
    employment_details_df = spark.table("genai_demo.cardinal_health.sales_associates_employment_details")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    company_goals_df = spark.table("genai_demo.cardinal_health.company_goals")
    # NOTE(review): the next two DataFrames are not referenced again in this
    # cell; they are kept in case later cells rely on them -- confirm.
    historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales")
    third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_sales_trends")

    # ---- Integrate --------------------------------------------------------
    # Join keys are passed as column NAMES, not Column objects. A Column (or a
    # list of Columns, which PySpark ANDs together) is treated as a boolean
    # join expression, so F.col("Hospital_ID") would be both ambiguous between
    # the two sides and not an equality test. Name-based keys perform an
    # equi-join and keep a single copy of each key column in the result.
    logger.info("Performing data integration...")
    hospital_sales_df = hospital_stats_df.join(
        sales_assignments_df, ["Hospital_ID", "Hospital_Name"], "inner"
    )
    employment_compensation_df = employment_details_df.join(
        compensation_guidelines_df, "Associate_ID", "inner"
    )
    # NOTE(review): assumes Associate_Name appears in only one of the two
    # employment tables joined above; otherwise this key is ambiguous -- verify.
    consolidated_df = hospital_sales_df.join(
        employment_compensation_df, ["Associate_ID", "Associate_Name"], "inner"
    )

    # ---- Derive -----------------------------------------------------------
    # Compensation = base salary + commission (a fraction of base) + bonus.
    # NOTE(review): this column is not part of the select() that follows the
    # logistics join below, so it does not reach the output table -- confirm
    # whether that is intended.
    logger.info("Applying custom calculations...")
    base_salary = F.col("Base_Salary")
    commission = F.col("Commission_Percentage") * base_salary
    bonus = F.col("Bonus")
    consolidated_df = consolidated_df.withColumn("Compensation", base_salary + commission + bonus)

    # Logistics channels enriched with their growth opportunities.
    logistics_growth_df = logistics_channels_df.join(
        growth_opportunities_df, ["Channel_ID", "Channel_Type", "Hospital_ID"], "inner"
    )

    # Attach logistics/growth data to the hospital-level data and keep only the
    # columns needed downstream.
    final_df = consolidated_df.join(
        logistics_growth_df, "Hospital_ID", "inner"
    ).select(
        "Hospital_ID", "Channel_Type", "Growth_Opportunities", "Projected_Growth_Rate"
    )

    # Attach company goals. Channel_ID was dropped by the select above, so the
    # Channel_ID selected here can only come from company_goals_df.
    final_df = final_df.join(
        company_goals_df, ["Hospital_ID", "Channel_Type"], "inner"
    ).select(
        "Hospital_ID", "Channel_Type", "Growth_Opportunities", "Projected_Growth_Rate",
        "Year", "Channel_ID", "Growth_Target", "Investment_Planned"
    )

    # ---- Filter / sort ----------------------------------------------------
    # Keep rows with Year strictly greater than 2023, ordered by Year ascending.
    logger.info("Filtering and sorting data...")
    filtered_df = final_df.filter(F.col("Year") > 2023)
    sorted_df = filtered_df.orderBy("Year")

    # ---- Load -------------------------------------------------------------
    # NOTE(review): the table is dropped and then re-created by the overwrite
    # below; the drop looks redundant with mode("overwrite") unless the schema
    # or table format is expected to change between runs -- confirm.
    logger.info("Writing output to Unity Catalog table...")
    spark.sql("DROP TABLE IF EXISTS genai_demo.cardinal_health.sales_prediction_output")
    sorted_df.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.sales_prediction_output")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: record the failure, then re-raise so it still
    # propagates to whatever is running this cell.
    logger.error("An error occurred during the ETL process: %s", e)
    raise
