In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on data from Unity Catalog tables related to Cardinal Health. The process includes data integration, compensation calculation, and revenue projection.

# COMMAND ----------
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Module-level logger for this notebook, plus root logging at INFO so the
# ETL progress messages are emitted (basicConfig does nothing if the root
# logger already has handlers configured).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Assume the Spark session is already initialized as 'spark'

# COMMAND ----------
# Define the main ETL function
def main():
    """Run the Cardinal Health sales-projection ETL end to end.

    Steps, in order:

    1. Load the source tables from the ``genai_demo.cardinal_health`` schema.
    2. Join employment details with compensation guidelines and derive a
       ``Compensation`` column (base salary + commission + bonus).
    3. Join logistics channels with growth opportunities, then with the
       compensation data and the company goals.
    4. Join historical sales with third-party trends and combine the result
       with the goals data.
    5. Produce one row per target year (2024-2026) with a
       ``Projected_Revenue`` column and write the result as CSV.

    The function relies on the notebook-provided global ``spark`` session and
    on the module-level ``F`` (``pyspark.sql.functions``) and ``logger``.

    Raises:
        Exception: any failure is logged with its traceback and re-raised
            unchanged so the notebook/job run is marked as failed.
    """
    try:
        # Load data from Unity Catalog tables
        logger.info("Loading data from Unity Catalog tables...")
        hospital_stats_df = spark.table("genai_demo.cardinal_health.hospital_stats_north_america")
        hospital_sales_assignments_df = spark.table("genai_demo.cardinal_health.hospital_sales_assignments")
        employment_details_df = spark.table("genai_demo.cardinal_health.associates_employment")
        compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
        logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
        growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
        company_goals_df = spark.table("genai_demo.cardinal_health.company_goals")
        historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales")
        third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_sales_trends")

        # Data integration
        logger.info("Performing data integration...")
        # NOTE(review): joined_hospital_df is built but never consumed by any
        # later step. It is kept so the join keys are still validated against
        # the table schemas; confirm whether it should feed the pipeline.
        joined_hospital_df = hospital_stats_df.join(
            hospital_sales_assignments_df, ["Hospital_ID", "Hospital_Name"], "inner"
        )

        joined_employment_df = employment_details_df.join(
            compensation_guidelines_df, "Associate_ID", "inner"
        )

        # Calculate total compensation
        logger.info("Calculating total compensation...")
        base_salary = F.col("Base_Salary")
        # Presumably Commission_Percentage is stored as a fraction of base
        # salary (e.g. 0.05), not as a whole percent -- TODO confirm.
        commission = F.col("Commission_Percentage") * base_salary
        bonus = F.col("Bonus")
        compensation_df = joined_employment_df.withColumn(
            "Compensation", base_salary + commission + bonus
        )

        # Join logistics and growth opportunities
        logger.info("Joining logistics and growth opportunities...")
        logistics_growth_df = logistics_channels_df.join(
            growth_opportunities_df, ["Channel_ID", "Channel_Type"], "inner"
        ).dropDuplicates(["Channel_ID", "Channel_Type", "Hospital_ID"])

        # Join with compensation data
        logger.info("Joining with compensation data...")
        consolidated_df = compensation_df.join(
            logistics_growth_df, "Hospital_ID", "inner"
        )

        # Join with company goals
        logger.info("Joining with company goals...")
        consolidated_goals_df = consolidated_df.join(
            company_goals_df, ["Hospital_ID", "Channel_Type"], "inner"
        ).dropDuplicates(["Hospital_ID", "Channel_Type", "Projected_Growth_Rate"])

        # Join historical sales with third-party trends
        logger.info("Joining historical sales with third-party trends...")
        sales_trends_df = historical_sales_df.join(
            third_party_trends_df, "Channel_Type", "inner"
        ).dropDuplicates(["Year", "Channel_Type", "Sales_Revenue"])

        # Join sales trends with growth data
        logger.info("Joining sales trends with growth data...")
        # Both inputs carry a Channel_Type column. Joining on Hospital_ID
        # alone would leave two Channel_Type columns in the result and make
        # the later F.col("Channel_Type") an ambiguous reference, so
        # Channel_Type is part of the join key and appears only once.
        final_join_df = sales_trends_df.join(
            consolidated_goals_df, ["Hospital_ID", "Channel_Type"], "inner"
        )

        # Generate rows for target years and calculate projections
        logger.info("Generating rows for target years and calculating projections...")
        # sequence() yields an array column; explode it so every input row is
        # repeated once per target year and Target_Year is a scalar integer.
        generate_rows_df = final_join_df.withColumn(
            "Target_Year", F.explode(F.expr("sequence(2024, 2026)"))
        )
        # NOTE(review): the projection does not depend on Target_Year, so the
        # three rows generated per input row carry the same value -- confirm
        # whether growth should compound per year.
        projected_df = generate_rows_df.withColumn(
            "Projected_Revenue",
            F.col("Sales_Revenue") * F.col("Projected_Growth_Rate")
        )

        # Filter and select final output
        logger.info("Filtering and selecting final output...")
        # Sort while Target_Year is still present, then project the output
        # columns; the projection keeps the row order.
        final_output_df = (
            projected_df
            .filter(F.col("Target_Year") > 2023)
            .orderBy("Target_Year")
            .select(
                F.col("Channel_Type"),
                F.col("Hospital_ID"),
                F.col("Market_Trend"),
                F.col("Projected_Revenue"),
            )
        )

        # Write the final output to CSV (Spark writes a directory of part
        # files at this path, not a single file).
        logger.info("Writing final output to CSV...")
        final_output_df.write.format("csv").option("header", "true").mode("overwrite").save("dbfs:/genai_demo/cardinal_health/Sales_Prediction_Output.csv")

        logger.info("ETL process completed successfully.")

    except Exception as e:
        # Top-level boundary: record the message and full traceback, then
        # propagate so the caller / job scheduler sees the failure.
        logger.exception("An error occurred during the ETL process: %s", e)
        raise

# COMMAND ----------
# Execute the ETL when this file is run as the entry point (module name is
# "__main__"); importing it from another module does not trigger the run.
if __name__ == "__main__":
    main()
