# Databricks notebook source
# COMMAND ----------

# MAGIC %md
# MAGIC # ETL Process for Cardinal Health Data
# MAGIC This notebook performs an ETL process on various datasets from Unity Catalog tables, integrating and transforming data for analysis.

# COMMAND ----------

# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------

# MAGIC
# Load data from Unity Catalog tables
def load_data():
    """Read the nine Unity Catalog source tables used by this notebook.

    Each table is read through the notebook's ambient ``spark`` session.

    Returns:
        A 9-tuple of DataFrames in this fixed order: hospital stats, sales
        associates, compensation guidelines, hospital sales assignments,
        logistics channels, growth opportunities, third-party sales trends,
        historical sales, company goals.
    """
    logger.info("Loading data from Unity Catalog tables...")
    # Order matters: the caller unpacks the result positionally.
    source_tables = (
        "genai_demo.cardinal_health.hospital_stats_north_america",
        "genai_demo.cardinal_health.SalesAssociates_EmploymentDetails",
        "genai_demo.cardinal_health.Compensation_Guidelines",
        "genai_demo.cardinal_health.HospitalSales_Assignments",
        "genai_demo.cardinal_health.Logistics_Channels",
        "genai_demo.cardinal_health.Growth_Opportunities",
        "genai_demo.cardinal_health.ThirdParty_SalesTrends",
        "genai_demo.cardinal_health.Historical_Sales",
        "genai_demo.cardinal_health.Company_Goals",
    )
    return tuple(spark.table(table_name) for table_name in source_tables)


(
    hospital_stats_df,
    sales_associates_df,
    compensation_guidelines_df,
    hospital_sales_assignments_df,
    logistics_channels_df,
    growth_opportunities_df,
    third_party_sales_trends_df,
    historical_sales_df,
    company_goals_df,
) = load_data()

# COMMAND ----------

# MAGIC
# Data Integration: Join SalesAssociates_EmploymentDetails with Compensation_Guidelines
def integrate_compensation_data(sales_associates_df, compensation_guidelines_df):
    """Inner-join associate employment details with compensation guidelines.

    Args:
        sales_associates_df: DataFrame of sales-associate employment details.
        compensation_guidelines_df: DataFrame of compensation guidelines.

    Returns:
        The inner join of both inputs on ``Associate_ID``; associates without
        a matching row on the other side are dropped.
    """
    logger.info("Joining SalesAssociates_EmploymentDetails with Compensation_Guidelines...")
    return sales_associates_df.join(
        compensation_guidelines_df, on="Associate_ID", how="inner"
    )


compensation_df = integrate_compensation_data(
    sales_associates_df, compensation_guidelines_df
)

# COMMAND ----------

# MAGIC
# Calculate total compensation to ensure accurate financial forecasting for associates
def calculate_total_compensation(compensation_df):
    """Add a ``Compensation`` column: base salary + commission + bonus.

    The value is ``Base_Salary + Commission_Percentage * Base_Salary + Bonus``.
    Inputs are cast to double rather than 32-bit float: a single-precision
    float keeps only about seven significant digits, which drops cents on
    six-figure amounts.

    Args:
        compensation_df: DataFrame with ``Base_Salary``,
            ``Commission_Percentage`` and ``Bonus`` columns.

    Returns:
        The input DataFrame with a double-typed ``Compensation`` column added
        (or replaced if it already exists). A null in any of the three inputs
        makes ``Compensation`` null for that row.
    """
    logger.info("Calculating total compensation...")
    base_salary = F.col("Base_Salary").cast("double")
    # NOTE(review): the rate is multiplied as-is, so it is presumably stored
    # as a fraction (0.05) rather than a percentage (5) — confirm with the data.
    commission_rate = F.col("Commission_Percentage").cast("double")
    bonus = F.col("Bonus").cast("double")

    total_compensation = base_salary + (commission_rate * base_salary) + bonus
    return compensation_df.withColumn("Compensation", total_compensation)


compensation_df = calculate_total_compensation(compensation_df)

# COMMAND ----------

# MAGIC
# Join hospital_stats_north_america with HospitalSales_Assignments
def join_hospital_sales(hospital_stats_df, hospital_sales_assignments_df):
    """Inner-join hospital statistics with hospital sales assignments.

    Args:
        hospital_stats_df: DataFrame of hospital statistics.
        hospital_sales_assignments_df: DataFrame of hospital sales assignments.

    Returns:
        Rows present in both inputs, matched on ``Hospital_ID`` and
        ``Hospital_Name``.
    """
    logger.info("Joining hospital_stats_north_america with HospitalSales_Assignments...")
    hospital_keys = ["Hospital_ID", "Hospital_Name"]
    return hospital_stats_df.join(
        hospital_sales_assignments_df, on=hospital_keys, how="inner"
    )


hospital_sales_df = join_hospital_sales(
    hospital_stats_df, hospital_sales_assignments_df
)

# COMMAND ----------

# MAGIC
# Join the output of previous joins on Associate_ID and Associate_Name
def combine_data(compensation_df, hospital_sales_df):
    """Inner-join the compensation data with the hospital/sales data.

    Args:
        compensation_df: Associate rows carrying compensation columns.
        hospital_sales_df: Hospital statistics joined with sales assignments.

    Returns:
        Rows present in both inputs, matched on ``Associate_ID`` and
        ``Associate_Name``.
    """
    logger.info("Joining previous outputs on Associate_ID and Associate_Name...")
    associate_keys = ["Associate_ID", "Associate_Name"]
    return compensation_df.join(hospital_sales_df, on=associate_keys, how="inner")


combined_df = combine_data(compensation_df, hospital_sales_df)

# COMMAND ----------

# MAGIC
# Select specific fields for further processing
def select_fields(combined_df):
    """Project the combined data down to six hospital/associate columns.

    Args:
        combined_df: Output of the associate/hospital join.

    Returns:
        A DataFrame with exactly ``Hospital_ID``, ``Director_Name``,
        ``Manager_Name``, ``Associate_ID``, ``Associate_Name`` and
        ``Compensation``, in that order.
    """
    logger.info("Selecting specific fields for further processing...")
    wanted_columns = [
        "Hospital_ID",
        "Director_Name",
        "Manager_Name",
        "Associate_ID",
        "Associate_Name",
        "Compensation",
    ]
    return combined_df.select(*wanted_columns)


selected_df = select_fields(combined_df)

# COMMAND ----------

# MAGIC
# Join de-duplicated Logistics_Channels records with Growth_Opportunities
def join_growth_opportunities(logistics_channels_df, growth_opportunities_df):
    """Join de-duplicated logistics channels with growth opportunities.

    Only the logistics-channel side is de-duplicated before the join; the
    growth-opportunities side is used as-is.

    Args:
        logistics_channels_df: DataFrame of logistics channels.
        growth_opportunities_df: DataFrame of growth opportunities.

    Returns:
        The inner join of both inputs on ``Channel_ID``, ``Channel_Type`` and
        ``Hospital_ID``.
    """
    logger.info("Joining with unique records from Logistics_Channels and Growth_Opportunities...")
    # The same three columns serve as the de-duplication key and the join key.
    channel_keys = ["Channel_ID", "Channel_Type", "Hospital_ID"]
    distinct_channels = logistics_channels_df.dropDuplicates(channel_keys)
    return distinct_channels.join(
        growth_opportunities_df, on=channel_keys, how="inner"
    )


growth_df = join_growth_opportunities(logistics_channels_df, growth_opportunities_df)

# COMMAND ----------

# MAGIC
# Join with Historical_Sales and ThirdParty_SalesTrends
def join_sales_trends(historical_sales_df, third_party_sales_trends_df):
    """Inner-join historical sales with third-party sales trends.

    Args:
        historical_sales_df: DataFrame of historical sales.
        third_party_sales_trends_df: DataFrame of third-party sales trends.

    Returns:
        Rows present in both inputs, matched on ``Channel_Type`` only.
    """
    logger.info("Joining Historical_Sales with ThirdParty_SalesTrends...")
    return historical_sales_df.join(
        third_party_sales_trends_df, on="Channel_Type", how="inner"
    )


sales_trends_df = join_sales_trends(historical_sales_df, third_party_sales_trends_df)

# COMMAND ----------

# MAGIC
# Calculate projected revenue: apply the projected growth rate to rows whose Year is after 2023
def calculate_projected_revenue(sales_trends_df, base_year=2023):
    """Add a ``Projected_Revenue`` column to the sales-trend data.

    For rows whose ``Year`` is greater than ``base_year`` the value is
    ``Sales_Revenue * (1 + Projected_Growth_Rate)``; for every other row
    (including rows where ``Year`` is null) it is ``Sales_Revenue`` unchanged.

    Revenue and growth rate are cast to double rather than 32-bit float: a
    single-precision float keeps only about seven significant digits, which
    rounds revenue figures in the tens of millions to whole units or worse.

    Args:
        sales_trends_df: DataFrame with ``Sales_Revenue``,
            ``Projected_Growth_Rate`` and ``Year`` columns.
        base_year: Growth is applied only to rows with ``Year`` strictly
            greater than this value. Defaults to 2023.

    Returns:
        The input DataFrame with a double-typed ``Projected_Revenue`` column
        added (or replaced if it already exists).
    """
    logger.info("Calculating projected revenue based on target year...")
    growth_rate = F.col("Projected_Growth_Rate").cast("double")
    revenue = F.col("Sales_Revenue").cast("double")
    # NOTE(review): this tests the row's own Year column, not a target year —
    # confirm that is the intended driver for applying growth.
    after_base_year = F.col("Year").cast(IntegerType()) > base_year

    projected_revenue = F.when(
        after_base_year, revenue * (1 + growth_rate)
    ).otherwise(revenue)
    return sales_trends_df.withColumn("Projected_Revenue", projected_revenue)


sales_trends_df = calculate_projected_revenue(sales_trends_df)

# COMMAND ----------

# MAGIC
# Generate rows for target years
def generate_target_years(sales_trends_df, target_years=(2023, 2024, 2025, 2026)):
    """Fan each input row out into one row per target year.

    Args:
        sales_trends_df: DataFrame to expand.
        target_years: Years to generate. Defaults to 2023 through 2026. An
            empty sequence produces an empty result, because exploding an
            empty array emits no rows.

    Returns:
        A DataFrame holding every input row once per entry of
        ``target_years``, with the year in a new ``Target_Year`` column.
    """
    logger.info("Generating rows for target years...")
    # F.array() takes columns or column names, not raw Python values, so each
    # year must be wrapped in F.lit() to become a literal column.
    year_literals = [F.lit(year) for year in target_years]
    return sales_trends_df.withColumn(
        "Target_Year", F.explode(F.array(*year_literals))
    )


target_years_df = generate_target_years(sales_trends_df)

# COMMAND ----------

# MAGIC
# Filter records where Target Year is greater than 2023
def filter_target_years(target_years_df):
    """Keep only the rows whose ``Target_Year`` is later than 2023.

    Args:
        target_years_df: DataFrame carrying a ``Target_Year`` column.

    Returns:
        The subset of rows with ``Target_Year > 2023``.
    """
    logger.info("Filtering records where Target Year is greater than 2023...")
    is_after_2023 = target_years_df["Target_Year"] > 2023
    return target_years_df.where(is_after_2023)


filtered_df = filter_target_years(target_years_df)

# COMMAND ----------

# MAGIC
# Join with Company_Goals on Hospital_ID and Channel_Type
def join_company_goals(filtered_df, company_goals_df):
    """Inner-join the filtered target-year rows with company goals.

    Args:
        filtered_df: Target-year rows that survived the year filter.
        company_goals_df: DataFrame of company goals.

    Returns:
        Rows present in both inputs, matched on ``Hospital_ID`` and
        ``Channel_Type``.
    """
    logger.info("Joining with Company_Goals on Hospital_ID and Channel_Type...")
    goal_keys = ["Hospital_ID", "Channel_Type"]
    return filtered_df.join(company_goals_df, on=goal_keys, how="inner")


final_df = join_company_goals(filtered_df, company_goals_df)

# COMMAND ----------

# MAGIC
# Sort records by Target Year in ascending order
def sort_records(final_df):
    """Order the rows by ``Target_Year``, ascending.

    Args:
        final_df: DataFrame carrying a ``Target_Year`` column.

    Returns:
        The same rows sorted by ``Target_Year`` in ascending order.
    """
    logger.info("Sorting records by Target Year in ascending order...")
    return final_df.orderBy("Target_Year")


sorted_df = sort_records(final_df)

# COMMAND ----------

# MAGIC
# Define schema contract for final output
def define_schema_contract(sorted_df):
    """Project the data down to the ten columns of the output table.

    Args:
        sorted_df: The joined and sorted pipeline output.

    Returns:
        A DataFrame holding only the output columns, in the order listed in
        ``output_columns`` below.
    """
    logger.info("Defining schema contract for final output...")
    output_columns = [
        "Hospital_ID",
        "Channel_Type",
        "Investment_Planned",
        "Sales_Revenue",
        "Market_Trend",
        "Political_Impact",
        "Economic_Impact",
        "Target_Year",
        "Projected_Sales_Growth_Rate",
        "Projected_Revenue",
    ]
    return sorted_df.select(*output_columns)


final_df = define_schema_contract(sorted_df)

# COMMAND ----------

# MAGIC
# Write the final output to Unity Catalog table
def write_final_output(final_df):
    """Overwrite the ``Target_sales`` Delta table with the final output.

    The table is replaced by a single Delta overwrite instead of being
    dropped first. Dropping and then writing is not atomic: a failure between
    the two steps leaves no table at all, and the drop discards the table's
    history and grants. ``overwriteSchema`` keeps the ability to change the
    table's columns between runs, which the drop used to provide.

    Args:
        final_df: DataFrame to persist as
            ``genai_demo.cardinal_health.Target_sales``.
    """
    logger.info("Writing the final output to Unity Catalog table...")
    (
        final_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.cardinal_health.Target_sales")
    )
    logger.info("ETL process completed successfully.")


write_final_output(final_df)

# COMMAND ----------

# MAGIC
# Run the full pipeline end to end, logging any failure before re-raising it
def handle_exceptions():
    """Run every ETL step in order, logging any failure before re-raising it.

    The steps are the same functions the notebook's earlier cells call at top
    level, so invoking this function runs the whole pipeline, including the
    table write, once more.

    Raises:
        Exception: Whatever a step raised, re-raised unchanged after it has
            been logged with its traceback.
    """
    try:
        # Execute all functions
        (
            hospital_stats_df,
            sales_associates_df,
            compensation_guidelines_df,
            hospital_sales_assignments_df,
            logistics_channels_df,
            growth_opportunities_df,
            third_party_sales_trends_df,
            historical_sales_df,
            company_goals_df,
        ) = load_data()
        compensation_df = integrate_compensation_data(sales_associates_df, compensation_guidelines_df)
        compensation_df = calculate_total_compensation(compensation_df)
        hospital_sales_df = join_hospital_sales(hospital_stats_df, hospital_sales_assignments_df)
        combined_df = combine_data(compensation_df, hospital_sales_df)
        # NOTE(review): selected_df and growth_df are not consumed by any later
        # step here; the calls are kept so the steps still run and log.
        selected_df = select_fields(combined_df)
        growth_df = join_growth_opportunities(logistics_channels_df, growth_opportunities_df)
        sales_trends_df = join_sales_trends(historical_sales_df, third_party_sales_trends_df)
        sales_trends_df = calculate_projected_revenue(sales_trends_df)
        target_years_df = generate_target_years(sales_trends_df)
        filtered_df = filter_target_years(target_years_df)
        final_df = join_company_goals(filtered_df, company_goals_df)
        sorted_df = sort_records(final_df)
        final_df = define_schema_contract(sorted_df)
        write_final_output(final_df)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which a plain logger.error call with the message alone would drop.
        logger.exception("An error occurred during the ETL process: %s", e)
        raise


handle_exceptions()
