In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # ETL Process for Sales Prediction
# MAGIC This notebook performs an ETL process to predict sales using data from Unity Catalog tables.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F

# COMMAND ----------
# MAGIC
# Logging setup: create this notebook's named logger, then ask the root logger
# for INFO-level output (basicConfig is a no-op if the root logger already has
# handlers attached).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# COMMAND ----------
# MAGIC
# ETL pipeline: load the Unity Catalog source tables, join and derive the
# projection columns, then publish the result as a Delta table.
# The whole try/except lives in ONE notebook cell: in Databricks source format
# every "# COMMAND ----------" line starts a new cell, and a `try:` separated
# from its `except` by a cell boundary cannot run.
# On any failure the error is logged with its traceback and re-raised so the
# notebook run is marked as failed.
base_year = 2023  # reference year the projections are measured from

try:
    # Step 1: Data Loading
    logger.info("Loading data from Unity Catalog tables...")
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospitals_stats")
    hospital_assignments_df = spark.table("genai_demo.cardinal_health.hospital_assignments")
    employment_details_df = spark.table("genai_demo.cardinal_health.associates_employment")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.compensation_guidelines")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.logistics_channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.growth_opportunities")
    company_goals_df = spark.table("genai_demo.cardinal_health.company_goals_1")
    historical_sales_df = spark.table("genai_demo.cardinal_health.historical_sales_trending")
    third_party_trends_df = spark.table("genai_demo.cardinal_health.third_party_trends")

    # Step 2: Data Integration
    logger.info("Performing data integration...")
    # Hospitals joined to their assignments on the shared id/name pair.
    hospital_sales_df = hospital_stats_df.join(
        hospital_assignments_df,
        ["Hospital_ID", "Hospital_Name"],
        "inner"
    )

    # Associate employment records joined to their compensation guidelines.
    employment_compensation_df = employment_details_df.join(
        compensation_guidelines_df,
        "Associate_ID",
        "inner"
    )

    # Associates joined to the hospitals they are assigned to.
    combined_df = employment_compensation_df.join(
        hospital_sales_df,
        ["Associate_ID", "Associate_Name"],
        "inner"
    )

    # Step 3: Custom Calculations
    logger.info("Performing custom calculations...")
    # NOTE(review): compensation_df is not consumed later in this cell.
    compensation_df = combined_df.withColumn(
        "Compensation",
        F.expr("Base_Salary + (Commission_Percentage * Base_Salary) + Bonus")
    )

    projected_growth_df = growth_opportunities_df.withColumn(
        "projected_sales_growth_rate",
        F.expr(f"Projected_Growth_Rate * (Target_Year - {base_year})")
    )

    # NOTE(review): projected_sales_growth_rate is derived on
    # growth_opportunities_df above. The two expressions below only resolve if
    # company_goals_df / historical_sales_df already carry a column with that
    # name -- confirm the schemas, or join with projected_growth_df first.
    projected_investments_df = company_goals_df.withColumn(
        "projected_investments",
        F.expr("Investment_Planned * projected_sales_growth_rate")
    )

    projected_revenue_df = historical_sales_df.withColumn(
        "Projected_Revenue",
        F.expr("Sales_Revenue * projected_sales_growth_rate")
    )

    # Step 4: Data Cleansing and Standardization
    logger.info("Ensuring data integrity and relevance...")
    # dropDuplicates takes column *names*; passing Column objects is rejected.
    unique_df = projected_revenue_df.dropDuplicates(
        ["Year", "Channel_Type", "Sales_Revenue"]
    )

    # Step 5: Predictive Modeling
    logger.info("Generating future year projections...")
    # One row per year from base_year through base_year + 3.
    # NOTE(review): target_years_df is built but not consumed in this cell.
    target_years_df = spark.createDataFrame(
        [(year,) for year in range(base_year, base_year + 4)],
        ["Target_Year"]
    )

    # Step 6: Final Output
    logger.info("Filtering, sorting, and selecting fields for final output...")
    # NOTE(review): unique_df descends from historical_sales_df only; the
    # filter and select below require Target_Year, Hospital_ID, Market_Trend,
    # Political_Impact, Economic_Impact, projected_sales_growth_rate and
    # projected_investments to exist on that frame -- none of the joined or
    # derived frames above are merged into it. Confirm against the schema.
    filtered_df = unique_df.filter(F.col("Target_Year") > base_year)
    sorted_df = filtered_df.orderBy("Target_Year")
    final_output_df = sorted_df.select(
        "Channel_Type", "Hospital_ID", "Market_Trend", "Political_Impact", "Economic_Impact",
        "Target_Year", "projected_sales_growth_rate", "projected_investments", "Projected_Revenue"
    )

    # Step 7: Save Final Output
    logger.info("Saving final output to Unity Catalog table...")
    target_table = "genai_demo.cardinal_health.sales_prediction_output"
    # Overwrite in place (data and schema) instead of DROP + recreate: the
    # table never disappears if the write fails, and Delta history is retained.
    (
        final_output_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table)
    )

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: record the message plus traceback, then propagate.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
