In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Years for which projection rows are generated (one output row per year).
TARGET_YEARS = (2024, 2025, 2026)


def _apply_growth(amount, target_year, growth_rate_pct):
    """Return a Column that scales *amount* by the projected growth rate.

    Args:
        amount: Column holding the base value (investment or revenue).
        target_year: Column holding the projection year.
        growth_rate_pct: Column holding the growth rate as a percentage.

    Returns:
        ``amount * rate/100`` for 2024, ``amount * (1 + rate/100)`` for
        2025 and 2026, and ``amount`` unchanged for any other year.
        NULL inputs yield NULL instead of raising.
    """
    growth_fraction = growth_rate_pct / 100
    # NOTE(review): 2024 multiplies by the bare fraction while 2025/2026 use
    # (1 + fraction). This mirrors the original logic exactly; confirm the
    # asymmetry is intended.
    return (
        F.when(target_year == 2024, amount * growth_fraction)
        .when(target_year.isin(2025, 2026), amount * (1 + growth_fraction))
        .otherwise(amount)
    )


try:
    # Load data from Unity Catalog tables
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospital_stats_north_america")
    hospital_assignments_df = spark.table("genai_demo.cardinal_health.hospital_sales_assignments")
    employment_details_df = spark.table("genai_demo.cardinal_health.sales_associates_employment_details")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    company_goals_df = spark.table("genai_demo.cardinal_health.company_goals")
    historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales")
    third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_sales_trends")

    # Step 1: Join hospital statistics with sales assignments
    hospital_sales_df = hospital_stats_df.join(
        hospital_assignments_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner"
    )

    # Step 2: Join employment details with compensation guidelines
    employment_compensation_df = employment_details_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner"
    )

    # Step 3: Join the results of previous joins
    combined_df = hospital_sales_df.join(
        employment_compensation_df,
        on=["Associate_ID", "Associate_Name"],
        how="inner"
    )

    # Step 4: Calculate total compensation
    # (base salary + commission as a percentage of base salary + bonus)
    combined_df = combined_df.withColumn(
        "Compensation",
        F.col("Base_Salary") + (F.col("Commission_Percentage") / 100 * F.col("Base_Salary")) + F.col("Bonus")
    )

    # Step 5: Select relevant fields for further processing
    selected_df = combined_df.select(
        "Associate_ID", "Associate_Name", "Compensation", "Director_Name", "Hospital_ID", "Manager_Name"
    )

    # Step 6: Join logistics channels with growth opportunities
    logistics_growth_df = logistics_channels_df.join(
        growth_opportunities_df,
        on=["Channel_ID", "Channel_Type", "Hospital_ID"],
        how="inner"
    )

    # Step 7: Join compensation data with logistics and growth data
    compensation_logistics_growth_df = selected_df.join(
        logistics_growth_df,
        on="Hospital_ID",
        how="inner"
    )

    # Step 8: Select fields for further processing.
    # The associate/compensation columns from Steps 4-5 are not carried past
    # this point; only these four columns continue downstream.
    selected_logistics_growth_df = compensation_logistics_growth_df.select(
        "Hospital_ID", "Channel_Type", "Growth_Opportunities", "Projected_Growth_Rate"
    )

    # Step 9: Join with company goals data
    goals_df = selected_logistics_growth_df.join(
        company_goals_df,
        on=["Hospital_ID", "Channel_Type"],
        how="inner"
    )

    # Step 10: Ensure unique records based on specific fields
    unique_goals_df = goals_df.dropDuplicates(
        ["Hospital_ID", "Channel_Type", "Projected_Growth_Rate", "Investment_Planned"]
    )

    # Step 11: Join historical sales with third-party sales trends
    sales_trends_df = historical_sales_df.join(
        third_party_trends_df,
        on="Channel_Type",
        how="inner"
    )

    # Step 12: Ensure unique sales records
    unique_sales_trends_df = sales_trends_df.dropDuplicates(["Year", "Channel_Type", "Sales_Revenue"])

    # Step 13: Join sales data with growth and investment data
    # NOTE(review): Step 8 does not carry Channel_ID forward, so this join only
    # resolves if company_goals supplies Channel_ID (and the sales side has
    # both Hospital_ID and Channel_ID) -- confirm against the table schemas.
    sales_growth_investment_df = unique_sales_trends_df.join(
        unique_goals_df,
        on=["Hospital_ID", "Channel_ID", "Channel_Type"],
        how="inner"
    )

    # Step 14: Generate one row per target year.
    # F.array() only accepts Columns or column names, so every year literal
    # has to be wrapped in F.lit() before being packed into the array.
    target_years_df = sales_growth_investment_df.withColumn(
        "TargetYear",
        F.explode(F.array(*[F.lit(year) for year in TARGET_YEARS]))
    )

    target_year = F.col("TargetYear")

    # Step 15: Calculate projected sales growth rate.
    # Built-in column expressions replace the former Python UDF: they are
    # NULL-safe (a NULL rate gives NULL rather than a TypeError in the worker)
    # and avoid per-row Python serialization. Each year adds one more
    # rate/100 increment on top of the previous year's value.
    base_rate = F.col("Projected_Growth_Rate").cast("double")
    rate_increment = base_rate / 100
    rate_2024 = base_rate + rate_increment
    rate_2025 = rate_2024 + rate_increment
    rate_2026 = rate_2025 + rate_increment
    target_years_df = target_years_df.withColumn(
        "projected_sales_growth_rate",
        F.when(target_year == 2024, rate_2024)
        .when(target_year == 2025, rate_2025)
        .when(target_year == 2026, rate_2026)
        .otherwise(base_rate)
        .cast("double")
    )

    growth_rate_pct = F.col("projected_sales_growth_rate")

    # Step 16: Calculate projected investments
    target_years_df = target_years_df.withColumn(
        "projected_investments",
        _apply_growth(
            F.col("Investment_Planned").cast("double"), target_year, growth_rate_pct
        ).cast("double")
    )

    # Step 17: Calculate projected revenue
    target_years_df = target_years_df.withColumn(
        "Projected_Revenue",
        _apply_growth(
            F.col("Sales_Revenue").cast("double"), target_year, growth_rate_pct
        ).cast("double")
    )

    # Step 18: Filter records based on target year
    # (every generated year is already > 2023; kept as a safeguard)
    filtered_df = target_years_df.filter(target_year > 2023)

    # Step 19: Select final fields for output
    final_df = filtered_df.select(
        "Channel_Type", "Hospital_ID", "Market_Trend", "Political_Impact", "Economic_Impact",
        "TargetYear", "projected_sales_growth_rate", "projected_investments", "Projected_Revenue"
    )

    # Step 20: Sort records by TargetYear
    sorted_final_df = final_df.orderBy("TargetYear")

    # Write the processed data to the Unity Catalog target table.
    # A Delta overwrite replaces the contents atomically, so the table is never
    # missing if the write fails and its history is retained; overwriteSchema
    # keeps the ability to change the output schema that the previous
    # drop-and-recreate approach provided.
    (
        sorted_final_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.cardinal_health.sales_prediction_output")
    )

    logger.info("ETL process completed successfully and data written to Unity Catalog.")

except Exception as e:
    # Log with the full traceback, then re-raise so the notebook/job run is
    # marked as failed instead of silently reporting success.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
