In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
import psycopg2
from psycopg2 import sql

# Logging setup: INFO-level root configuration plus a module-scoped logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pull the external database connection settings from the "my_scope" secret
# scope in one pass; the keys are read in the same order as the names they
# are bound to.
db_host, db_port, db_name, db_user, db_password = [
    dbutils.secrets.get(scope="my_scope", key=secret_key)
    for secret_key in ("db_host", "db_port", "db_name", "db_user", "db_password")
]

# Read every source table from Unity Catalog. The tables are resolved in the
# order listed and bound to the DataFrame names used by the transformation
# step; any failure is logged and re-raised.
try:
    (
        hospital_stats_df,
        sales_assignments_df,
        employment_details_df,
        compensation_guidelines_df,
        logistics_channels_df,
        growth_opportunities_df,
        company_goals_df,
        historical_sales_df,
        third_party_sales_trends_df,
    ) = [
        spark.table(table_name)
        for table_name in (
            "catalog.db.hospital_stats_north_america",
            "catalog.db.hospital_sales_assignments",
            "catalog.db.sales_associates_employment_details",
            "catalog.db.compensation_guidelines",
            "catalog.db.logistics_channels",
            "catalog.db.growth_opportunities",
            "catalog.db.company_goals",
            "catalog.db.historical_sales",
            "catalog.db.third_party_sales_trends",
        )
    ]
    logger.info("Data loaded successfully from Unity Catalog tables.")
except Exception as load_error:
    logger.error(f"Error loading data from Unity Catalog tables: {load_error}")
    raise

# Transformation logic
#
# Joins the source tables into one row per sales/goal combination, fans each
# row out over the target years, and derives the projected growth rate,
# investment and revenue columns. The result is `sorted_output_df`.
# Any failure is logged and re-raised.
try:
    # Join hospital statistics with sales assignments
    hospital_sales_df = hospital_stats_df.join(
        sales_assignments_df,
        on=["Hospital_ID", "Hospital_Name"],
        how="inner"
    )

    # Join employment details with compensation guidelines
    employment_compensation_df = employment_details_df.join(
        compensation_guidelines_df,
        on="Associate_ID",
        how="inner"
    )

    # Combine results from previous joins
    combined_df = hospital_sales_df.join(
        employment_compensation_df,
        on=["Associate_ID", "Associate_Name"],
        how="inner"
    )

    # Calculate total compensation: base salary + commission (a percentage of
    # base salary) + bonus
    combined_df = combined_df.withColumn(
        "Compensation",
        F.col("Base_Salary") + (F.col("Commission_Percentage") / 100 * F.col("Base_Salary")) + F.col("Bonus")
    )

    # Select relevant fields.
    # NOTE(review): only Hospital_ID from this selection survives the select
    # into final_selected_df below, so Compensation and the associate/manager
    # columns never reach the output; this chain only influences which
    # Hospital_ID rows (and how many of them) flow on. Confirm that is intended.
    selected_df = combined_df.select(
        "Associate_ID", "Associate_Name", "Compensation", "Director_Name", "Hospital_ID", "Manager_Name"
    )

    # Join logistics channels with growth opportunities
    logistics_growth_df = logistics_channels_df.join(
        growth_opportunities_df,
        on=["Channel_ID", "Channel_Type", "Hospital_ID"],
        how="inner"
    )

    # Combine compensation data with logistics and growth data
    final_df = selected_df.join(
        logistics_growth_df,
        on="Hospital_ID",
        how="inner"
    )

    # Select fields for further processing
    final_selected_df = final_df.select(
        "Hospital_ID", "Channel_Type", "Growth_Opportunities", "Projected_Growth_Rate"
    )

    # Join with company goals
    goals_df = final_selected_df.join(
        company_goals_df,
        on=["Hospital_ID", "Channel_Type"],
        how="inner"
    )

    # Ensure unique records
    unique_goals_df = goals_df.dropDuplicates(["Hospital_ID", "Channel_Type", "Projected_Growth_Rate", "Investment_Planned"])

    # Join historical sales with third-party sales trends
    sales_trends_df = historical_sales_df.join(
        third_party_sales_trends_df,
        on="Channel_Type",
        how="inner"
    )

    # Ensure unique sales records
    unique_sales_df = sales_trends_df.dropDuplicates(["Year", "Channel_Type", "Sales_Revenue"])

    # Combine sales data with growth and investment data
    combined_sales_df = unique_sales_df.join(
        unique_goals_df,
        on=["Hospital_ID", "Channel_ID", "Channel_Type"],
        how="inner"
    )

    # Generate one row per target year (2023 through 2026) for every input row.
    # The exploded value is written straight into "Target Year". Creating an
    # array column first and then selecting explode(...) next to "*" would
    # leave two columns named "Target Year", and every later reference to that
    # name would fail as ambiguous.
    target_years_df = combined_sales_df.withColumn(
        "Target Year",
        F.explode(F.expr("sequence(2023, 2026)"))
    )

    # Column expressions reused by the projections below
    target_year = F.col("Target Year")
    growth_rate = F.col("Projected_Growth_Rate")
    growth_step = growth_rate / 100  # one hundredth of the base rate, added once per year after 2023

    # Calculate projected sales growth rate: the base rate plus one
    # growth_step for each year past 2023 (2023 keeps the base rate)
    target_years_df = target_years_df.withColumn(
        "projected_sales_growth_rate",
        F.when(target_year == 2024, growth_rate + growth_step)
        .when(target_year == 2025, (growth_rate + growth_step) + growth_step)
        .when(target_year == 2026, ((growth_rate + growth_step) + growth_step) + growth_step)
        .otherwise(growth_rate)
    )

    projected_rate_fraction = F.col("projected_sales_growth_rate") / 100

    # Calculate projected investments.
    # NOTE(review): the 2024 branch multiplies by the rate fraction alone,
    # while 2025 and 2026 multiply by (1 + rate fraction). Kept as-is; confirm
    # with the business owner whether 2024 should also include the "1 +".
    target_years_df = target_years_df.withColumn(
        "projected_investments",
        F.when(target_year == 2024, F.col("Investment_Planned") * projected_rate_fraction)
        .when(target_year == 2025, F.col("Investment_Planned") * (1 + projected_rate_fraction))
        .when(target_year == 2026, F.col("Investment_Planned") * (1 + projected_rate_fraction))
        .otherwise(F.col("Investment_Planned"))
    )

    # Calculate projected revenue (same branch structure, and the same 2024
    # review note, as projected investments)
    target_years_df = target_years_df.withColumn(
        "Projected Revenue",
        F.when(target_year == 2024, F.col("Sales_Revenue") * projected_rate_fraction)
        .when(target_year == 2025, F.col("Sales_Revenue") * (1 + projected_rate_fraction))
        .when(target_year == 2026, F.col("Sales_Revenue") * (1 + projected_rate_fraction))
        .otherwise(F.col("Sales_Revenue"))
    )

    # Filter records based on target year: drop the 2023 rows
    filtered_df = target_years_df.filter(target_year > 2023)

    # Select final fields for output
    output_df = filtered_df.select(
        "Channel_Type", "Hospital_ID", "Market_Trend", "Political_Impact", "Economic_Impact", "Target Year",
        "projected_sales_growth_rate", "projected_investments", "Projected Revenue"
    )

    # Sort records by target year
    sorted_output_df = output_df.orderBy("Target Year")

    logger.info("Data transformation completed successfully.")
except Exception as e:
    logger.error(f"Error during data transformation: {e}")
    raise

# Output handling
#
# Replaces the contents of catalog.db.sales_prediction_output with the
# projection result. Any failure is logged and re-raised.
try:
    # Overwrite in a single Delta write instead of DROP TABLE + write.
    # Dropping first leaves no table at all if the write then fails, and it
    # discards the existing table's history and grants. "overwriteSchema"
    # keeps the previous ability to replace the table even when the output
    # columns have changed.
    (
        sorted_output_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("catalog.db.sales_prediction_output")
    )
    logger.info("Data written successfully to Unity Catalog table.")
except Exception as e:
    logger.error(f"Error writing data to Unity Catalog table: {e}")
    raise

# Optionally, connect to external systems using psycopg2
#
# Opens a connection with the credentials read from the secret scope, fetches
# all rows of external_table into `external_data`, and always closes the
# connection. Any failure is logged and re-raised.
conn = None  # bound up front so the finally clause is safe when connect() itself fails
try:
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        dbname=db_name,
        user=db_user,
        password=db_password
    )
    # The cursor context manager closes the cursor even if the query raises
    with conn.cursor() as cursor:
        # Example query to fetch data
        cursor.execute(sql.SQL("SELECT * FROM external_table"))
        external_data = cursor.fetchall()
    logger.info("Data fetched successfully from external system.")
except Exception as e:
    logger.error(f"Error connecting to external system: {e}")
    raise
finally:
    if conn is not None:
        conn.close()
