# In [None]:
# Databricks notebook source
# COMMAND ----------
# %md
# # ETL Process for Sales Prediction
# This notebook performs an ETL process to load, integrate, and transform data for sales prediction using PySpark.

# COMMAND ----------
#
# Import necessary libraries
import logging
from pyspark.sql import functions as F

# COMMAND ----------
#
# Initialize logging
# Configure the root logger at INFO level. basicConfig() does nothing when the
# root logger already has handlers attached -- TODO confirm INFO messages
# actually show up in the environment this notebook runs in.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every ETL step defined below.
logger = logging.getLogger(__name__)

# COMMAND ----------
#
# Function to load data from Unity Catalog tables
def load_data():
    """Read the nine source tables from Unity Catalog.

    Every table is fetched with ``spark.table`` using the global ``spark``
    session (not defined in this file).

    Returns:
        A 9-tuple of DataFrames in this fixed order: hospital stats, sales
        assignments, employment details, compensation guidelines, logistics
        channels, growth opportunities, company goals, historical sales,
        third-party sales trends.
    """
    logger.info("Loading data from Unity Catalog tables...")
    # Order is significant: the result is consumed positionally by callers.
    table_names = (
        "genai_demo.cardinal_health.hospital_stats_north_america",
        "genai_demo.cardinal_health.hospital_sales_assignments",
        "genai_demo.cardinal_health.sales_associates_employment_details",
        "genai_demo.cardinal_health.compensation_guidelines",
        "genai_demo.cardinal_health.logistics_channels",
        "genai_demo.cardinal_health.growth_opportunities",
        "genai_demo.cardinal_health.company_goals",
        "genai_demo.cardinal_health.historical_sales",
        "genai_demo.cardinal_health.third_party_sales_trends",
    )
    return tuple(spark.table(table_name) for table_name in table_names)

# COMMAND ----------
#
# Function to perform data integration
def integrate_data(hospital_stats_df, sales_assignments_df, employment_details_df, compensation_guidelines_df):
    """Inner-join the four input DataFrames into one consolidated DataFrame.

    Steps:
        1. hospital stats x sales assignments on Hospital_ID and Hospital_Name
        2. employment details x compensation guidelines on Associate_ID
        3. result of (1) x result of (2) on Associate_ID and Hospital_ID

    Args:
        hospital_stats_df: DataFrame joined on Hospital_ID / Hospital_Name.
        sales_assignments_df: DataFrame joined on Hospital_ID / Hospital_Name.
        employment_details_df: DataFrame joined on Associate_ID.
        compensation_guidelines_df: DataFrame joined on Associate_ID.

    Returns:
        The DataFrame produced by the final inner join.
    """
    logger.info("Performing data integration...")

    # Step 1: hospital-level view.
    hospital_side = hospital_stats_df.join(
        sales_assignments_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner",
    )

    # Step 2: associate-level view.
    associate_side = employment_details_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner",
    )

    # Step 3: tie associates to the hospitals they are matched with.
    return hospital_side.join(
        associate_side,
        on=["Associate_ID", "Hospital_ID"],
        how="inner",
    )

# COMMAND ----------
#
# Function to calculate total compensation
def calculate_compensation(consolidated_df):
    """Add a ``Compensation`` column to the consolidated DataFrame.

    The column is computed as
    ``Base_Salary + Commission_Percentage * Base_Salary + Bonus``.

    Args:
        consolidated_df: DataFrame containing the Base_Salary,
            Commission_Percentage and Bonus columns.

    Returns:
        A new DataFrame with the additional ``Compensation`` column.
    """
    logger.info("Calculating total compensation for associates...")
    base_salary = F.col("Base_Salary")
    # NOTE(review): Commission_Percentage is used as a plain multiplier here;
    # confirm it is stored as a fraction (0.05) rather than a percent (5).
    commission = F.col("Commission_Percentage") * base_salary
    total_compensation = base_salary + commission + F.col("Bonus")
    return consolidated_df.withColumn("Compensation", total_compensation)

# COMMAND ----------
#
# Function to filter and sort data
def filter_and_sort_data(consolidated_df, after_year=2023):
    """Keep rows whose "Target Year" is after a cutoff and sort by that year.

    Args:
        consolidated_df: DataFrame containing a "Target Year" column.
        after_year: Exclusive lower bound for "Target Year". Defaults to
            2023, which was previously hard-coded, so existing callers see
            no change.

    Returns:
        The filtered DataFrame ordered ascending by "Target Year".
    """
    logger.info("Filtering and sorting data...")
    # Strictly greater-than: rows for `after_year` itself are excluded.
    target_year_condition = F.col("Target Year") > after_year
    filtered_df = consolidated_df.filter(target_year_condition).orderBy("Target Year")
    return filtered_df

# COMMAND ----------
#
# Function to generate final output
def generate_output(filtered_df):
    """Project the filtered data onto the sales-prediction output columns.

    Args:
        filtered_df: DataFrame containing every source column listed in the
            mapping below.

    Returns:
        A DataFrame with exactly nine columns, named by the alias side of
        the mapping, in the order listed.
    """
    logger.info("Generating final output for sales predictions...")
    # (source column, output alias) pairs, in output order. The alias is
    # applied even when both names are equal so the output column name is
    # always exactly the alias text.
    column_aliases = (
        ("Channel_Type", "Channel_Type"),
        ("Hospital_ID", "Hospital_ID"),
        ("Market_Trend", "Market_Trend"),
        ("Political_Impact", "Political_Impact"),
        ("Economic_Impact", "Economic_Impact"),
        ("Target Year", "Target_Year"),
        ("projected_sales_growth_rate", "Projected_Sales_Growth_Rate"),
        ("projected_investments", "Projected_Investments"),
        ("Projected Revenue", "Projected_Revenue"),
    )
    projected_columns = [
        F.col(source_name).alias(output_name)
        for source_name, output_name in column_aliases
    ]
    return filtered_df.select(*projected_columns)

# COMMAND ----------
#
# Function to write output to Unity Catalog
def write_output(sales_prediction_output_df):
    """Replace the Unity Catalog target table with the given DataFrame.

    The table is written in Delta format with ``mode("overwrite")``. No
    explicit ``DROP TABLE`` is issued first: dropping and then writing is not
    atomic, so a failed write would leave no table at all, and the drop would
    also discard the existing table's history. ``overwriteSchema`` is enabled
    so that a changed output schema still replaces the old one, which the
    previous drop-and-recreate approach implicitly allowed.

    Args:
        sales_prediction_output_df: DataFrame to persist as
            ``genai_demo.cardinal_health.sales_prediction_output``.
    """
    logger.info("Writing output to Unity Catalog target table...")
    # NOTE(review): assumes any existing target table is already a Delta
    # table -- confirm before relying on in-place overwrite.
    (
        sales_prediction_output_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.cardinal_health.sales_prediction_output")
    )

# COMMAND ----------
#
# Main ETL process
try:
    # Load all nine source tables as a positional tuple.
    data = load_data()

    # Only the first four tables (hospital stats, sales assignments,
    # employment details, compensation guidelines) are integrated.
    # NOTE(review): the remaining five loaded tables are never used below;
    # confirm that every column filtered on and selected downstream
    # (e.g. "Target Year", Channel_Type, Market_Trend) exists in these four.
    consolidated_df = integrate_data(*data[:4])

    # Add the Compensation column.
    consolidated_df = calculate_compensation(consolidated_df)

    # Keep future target years only, ordered by year.
    filtered_df = filter_and_sort_data(consolidated_df)

    # Project onto the output schema.
    sales_prediction_output_df = generate_output(filtered_df)

    # Persist to the Unity Catalog target table.
    write_output(sales_prediction_output_df)

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: record the failure together with its traceback
    # (logger.exception logs at ERROR level), then re-raise so the notebook
    # run is still marked as failed.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
