In [None]:
# Databricks notebook source
# COMMAND ----------
# MAGIC %md
# MAGIC # Data Processing and Analysis with PySpark
# MAGIC This notebook performs data integration, custom calculations, data cleansing, and output generation using PySpark.

# COMMAND ----------
# MAGIC
# Import necessary libraries
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# COMMAND ----------
# MAGIC
# Configure logging
# basicConfig() does nothing when the root logger already has handlers, so the
# module logger's level is also set explicitly; otherwise the INFO progress
# messages emitted by this script could be filtered out by an inherited level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# COMMAND ----------
# MAGIC
# End-to-end pipeline: load the source tables, join them, derive compensation
# and projection columns, keep rows with Target_Year > 2023, and write the
# result to the `sales_prediction_output` Delta table.
#
# NOTE: the whole pipeline is kept in a single notebook cell on purpose. A
# "# COMMAND ----------" marker between `try:` and `except` splits the
# statement across cells when this file is imported as a Databricks notebook,
# so the steps below are delimited with plain comments instead.
try:
    # ------------------------------------------------------------------
    # Step 1: Load data from Unity Catalog tables
    # ------------------------------------------------------------------
    hospital_stats_df = spark.table("genai_demo.cardinal_health.hospital_stats_north_america")
    sales_assignments_df = spark.table("genai_demo.cardinal_health.HospitalSales_Assignments")
    employment_details_df = spark.table("genai_demo.cardinal_health.SalesAssociates_EmploymentDetails")
    compensation_guidelines_df = spark.table("genai_demo.cardinal_health.Compensation_Guidelines")
    logistics_channels_df = spark.table("genai_demo.cardinal_health.Logistics_Channels")
    growth_opportunities_df = spark.table("genai_demo.cardinal_health.Growth_Opportunities")
    company_goals_df = spark.table("genai_demo.cardinal_health.Company_Goals")
    # NOTE(review): the two DataFrames below are not referenced anywhere else
    # in this cell. They are still loaded so that any later code relying on
    # them keeps working; drop them if nothing else uses them.
    historical_sales_df = spark.table("genai_demo.cardinal_health.Historical_Sales")
    third_party_sales_trends_df = spark.table("genai_demo.cardinal_health.ThirdParty_SalesTrends")

    logger.info("Data loaded successfully from Unity Catalog tables.")

    # ------------------------------------------------------------------
    # Step 2: Data Integration
    # ------------------------------------------------------------------
    # Hospital statistics joined to their sales assignments.
    hospital_sales_df = hospital_stats_df.join(
        sales_assignments_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner",
    )

    # Associate employment details joined to their compensation guidelines.
    employment_compensation_df = employment_details_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner",
    )

    # Combine both intermediate results on the associate keys.
    combined_df = hospital_sales_df.join(
        employment_compensation_df,
        on=["Associate_ID", "Associate_Name"],
        how="inner",
    )

    logger.info("Data integration completed.")

    # ------------------------------------------------------------------
    # Step 3: Custom Calculations
    # ------------------------------------------------------------------
    # Compensation = Base_Salary + Commission_Percentage * Base_Salary + Bonus
    combined_df = combined_df.withColumn(
        "Compensation",
        F.col("Base_Salary")
        + (F.col("Commission_Percentage") * F.col("Base_Salary"))
        + F.col("Bonus"),
    )

    # Logistics channels joined to growth opportunities.
    logistics_growth_df = logistics_channels_df.join(
        growth_opportunities_df,
        on=["Channel_ID", "Channel_Type", "Hospital_ID"],
        how="inner",
    )

    # Attach logistics/growth data to the compensation result per hospital.
    final_df = combined_df.join(
        logistics_growth_df,
        on="Hospital_ID",
        how="inner",
    )

    # Keep one row per (Hospital_ID, Channel_Type, Projected_Growth_Rate).
    # NOTE(review): which of several duplicate rows survives is presumably
    # not deterministic - confirm that the remaining columns do not matter.
    unique_df = final_df.dropDuplicates(["Hospital_ID", "Channel_Type", "Projected_Growth_Rate"])

    # Attach company goals to the de-duplicated rows.
    final_df = unique_df.join(
        company_goals_df,
        on=["Hospital_ID", "Channel_Type"],
        how="inner",
    )

    # Rows qualify for projections only when Target_Year is after 2023.
    target_year_condition = F.col("Target_Year") > 2023

    # Each projection column takes its source value when the target-year
    # condition holds and 0 otherwise.
    final_df = final_df.withColumn(
        "projected_sales_growth_rate",
        F.when(target_year_condition, F.col("Projected_Growth_Rate")).otherwise(0),
    )

    final_df = final_df.withColumn(
        "projected_investments",
        F.when(target_year_condition, F.col("Investment_Planned")).otherwise(0),
    )

    # NOTE(review): revenue is computed as Sales_Revenue * growth rate, not
    # Sales_Revenue * (1 + growth rate) - confirm this is the intended metric.
    final_df = final_df.withColumn(
        "Projected_Revenue",
        F.when(
            target_year_condition,
            F.col("Sales_Revenue") * F.col("projected_sales_growth_rate"),
        ).otherwise(0),
    )

    logger.info("Custom calculations completed.")

    # ------------------------------------------------------------------
    # Step 4: Data Cleansing and Standardization
    # ------------------------------------------------------------------
    # Only rows satisfying the target-year condition are kept.
    filtered_df = final_df.filter(target_year_condition)

    # Columns written to the output table, in this order.
    output_df = filtered_df.select(
        F.col("Channel_Type"),
        F.col("Hospital_ID"),
        F.col("Market_Trend"),
        F.col("Political_Impact"),
        F.col("Economic_Impact"),
        F.col("Target_Year"),
        F.col("projected_sales_growth_rate"),
        F.col("projected_investments"),
        F.col("Projected_Revenue"),
    )

    # ------------------------------------------------------------------
    # Step 5: Aggregation and Sorting
    # ------------------------------------------------------------------
    # Sort records by Target Year.
    sorted_df = output_df.orderBy("Target_Year")

    logger.info("Data cleansing, standardization, and sorting completed.")

    # ------------------------------------------------------------------
    # Step 6: Output Generation
    # ------------------------------------------------------------------
    # Replace the table contents (and schema, if it changed) in one overwrite
    # instead of DROP TABLE + write: dropping first leaves a window in which
    # the table does not exist and throws away the existing table object.
    (
        sorted_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("genai_demo.cardinal_health.sales_prediction_output")
    )

    logger.info("Output generated and saved to Unity Catalog table.")

except Exception as exc:
    # Top-level boundary: log with the full traceback, then re-raise so the
    # notebook/job run is still marked as failed.
    logger.exception("An error occurred: %s", exc)
    raise
