# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook runs an ETL process over several Unity Catalog datasets, integrating and transforming them to produce a final output table.

# COMMAND ----------
#
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# COMMAND ----------
#
# Create this notebook's logger and set root logging to INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
#
# Run the whole pipeline inside a single try block so that any failure is logged
# and then re-raised. The steps are separated by plain section comments rather
# than "# COMMAND ----------" markers: Databricks splits cells on that marker,
# and a try block cannot span several cells.
try:
    # --- Load data from Unity Catalog tables ---
    logger.info("Loading data from Unity Catalog tables...")
    df_hospital_stats = spark.table("genai_demo.cardinal_health.hospitals_stats")
    df_sales_associates = spark.table("genai_demo.cardinal_health.associates_employment")
    df_compensation_guidelines = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    df_hospital_assignments = spark.table("genai_demo.cardinal_health.hospital_assignments")
    df_logistics_channels = spark.table("genai_demo.cardinal_health.logistics_channels")
    df_growth_opportunities = spark.table("genai_demo.cardinal_health.growth_opportunities")
    # NOTE(review): the next two tables are loaded but not used by any step
    # below - confirm whether they should feed the revenue calculation.
    df_third_party_trends = spark.table("genai_demo.cardinal_health.third_party_trends")
    df_historical_sales = spark.table("genai_demo.cardinal_health.historical_sales_trending")
    df_company_goals = spark.table("genai_demo.cardinal_health.company_goals_1")

    # --- Data Integration: hospital statistics with assignments ---
    # Joins are expressed by column name. Comparing F.col("X") with itself does
    # not say which side each column comes from; joining on the name does, and
    # it also keeps a single copy of the key column in the result.
    logger.info("Performing data integration...")
    df_joined_hospital = df_hospital_stats.join(
        df_hospital_assignments, on="Hospital_ID", how="inner"
    )

    # --- Employment details with compensation guidelines ---
    df_joined_employment = df_sales_associates.join(
        df_compensation_guidelines, on="Associate_ID", how="inner"
    )

    # --- Combine hospital and employment views ---
    df_joined_all = df_joined_hospital.join(
        df_joined_employment, on="Associate_ID", how="inner"
    )

    # --- Custom Calculations: total compensation ---
    # Compensation = base salary + (commission percentage * base salary) + bonus.
    logger.info("Calculating total compensation...")
    base_salary = F.col("Base_Salary").cast(DoubleType())
    commission = F.col("Commission_Percentage").cast(DoubleType()) * base_salary
    bonus = F.col("Bonus").cast(DoubleType())
    df_compensation = df_joined_all.withColumn("Compensation", base_salary + commission + bonus)

    # --- Data Filtering and Selection ---
    logger.info("Filtering and selecting relevant records...")
    df_filtered = df_compensation.select(
        "Hospital_ID",
        "Director_Name",
        "Manager_Name",
        "Associate_ID",
        "Associate_Name",
        "Compensation",
    )

    # --- Join with Logistics and Growth Opportunities ---
    logger.info("Joining with logistics and growth opportunities data...")
    df_logistics_growth = df_logistics_channels.join(
        df_growth_opportunities,
        on=["Channel_ID", "Channel_Type", "Hospital_ID"],
        how="inner",
    )

    # Channel_ID is carried through because the de-duplication step keys on it.
    df_final = df_filtered.join(
        df_logistics_growth, on="Hospital_ID", how="inner"
    ).select(
        "Hospital_ID",
        "Channel_ID",
        "Channel_Type",
        "Growth_Opportunities",
        "Projected_Growth_Rate",
        "Market_Potential",
        "Expected_ROI",
    )

    # --- Join with Company Goals ---
    logger.info("Joining with company goals...")
    df_final_goals = df_final.join(
        df_company_goals, on=["Hospital_ID", "Channel_Type"], how="inner"
    ).select(
        "Hospital_ID",
        # Taken explicitly from df_final so it stays unambiguous even if the
        # goals table has a column of the same name.
        df_final["Channel_ID"],
        "Channel_Type",
        "Growth_Target",
        "Investment_Planned",
        # Inputs of the Projected_Revenue calculation below; they must survive
        # this select. NOTE(review): presumably provided by company_goals_1 -
        # confirm against the table schema.
        "Target_Year",
        "Sales_Revenue",
        "Projected_Sales_Growth_Rate",
    )

    # --- Unique records per channel / channel type / hospital ---
    logger.info("Ensuring unique records...")
    df_unique = df_final_goals.dropDuplicates(["Channel_ID", "Channel_Type", "Hospital_ID"])

    # --- Projected Revenue Calculation ---
    # 2024: revenue * rate/100; 2025-2026: revenue * (1 + rate/100);
    # any other year: revenue unchanged.
    # NOTE(review): the 2024 branch yields only the growth increment, unlike
    # the 2025/2026 branch - confirm this is intended.
    logger.info("Calculating projected revenue...")
    growth_fraction = F.col("Projected_Sales_Growth_Rate") / 100
    df_projected_revenue = df_unique.withColumn(
        "Projected_Revenue",
        F.when(F.col("Target_Year") == 2024, F.col("Sales_Revenue") * growth_fraction)
        .when(F.col("Target_Year").isin(2025, 2026), F.col("Sales_Revenue") * (1 + growth_fraction))
        .otherwise(F.col("Sales_Revenue")),
    )

    # --- Write to Unity Catalog target table ---
    # NOTE(review): the table is dropped before the overwrite, presumably so a
    # schema change cannot block the write; this also discards the previous
    # table if the write then fails - confirm this is acceptable.
    logger.info("Writing the final output to Unity Catalog target table...")
    spark.sql("DROP TABLE IF EXISTS genai_demo.cardinal_health.target_sales")
    df_projected_revenue.write.format("delta").mode("overwrite").saveAsTable("genai_demo.cardinal_health.target_sales")

    logger.info("ETL process completed successfully.")

except Exception:
    # Log with traceback, then re-raise so the notebook/job run is marked as
    # failed instead of silently finishing without writing the target table.
    logger.exception("An error occurred during the ETL process")
    raise
