In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import psycopg2
from datetime import datetime

# Setup logging: configure the root logger at INFO level and create the
# module-level logger that the helper function and every step below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to connect to PostgreSQL and fetch data
def fetch_data_from_postgres(query):
    """Run *query* against PostgreSQL and return every resulting row.

    Host, database name, user and password are read from the Databricks
    secret scope ``my_scope``.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the query.

    Raises:
        Exception: any failure (secret lookup, connect, execute, fetch) is
            logged and then re-raised unchanged.
    """
    conn = None
    try:
        # Retrieve credentials securely
        host = dbutils.secrets.get(scope="my_scope", key="postgres_host")
        dbname = dbutils.secrets.get(scope="my_scope", key="postgres_dbname")
        user = dbutils.secrets.get(scope="my_scope", key="postgres_user")
        password = dbutils.secrets.get(scope="my_scope", key="postgres_password")

        # Connect to PostgreSQL
        conn = psycopg2.connect(host=host, dbname=dbname, user=user, password=password)
        # The cursor context manager closes the cursor even when execute()
        # or fetchall() raises.
        with conn.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    except Exception as e:
        logger.error(f"Error fetching data from PostgreSQL: {e}")
        raise
    finally:
        # Close explicitly on every path: using a psycopg2 connection in a
        # ``with`` block only ends the transaction, it does not close it.
        if conn is not None:
            conn.close()

# Load static data from TextInput Channel (Node 69)
try:
    # Two-row lookup frame: distribution-channel code and its description.
    text_input_df = spark.createDataFrame(
        [("D1", "Distribution Channel 1"), ("D2", "Distribution Channel 2")],
        ["DIST_CHNL", "DIST_CHNL_DESC"],
    )
    logger.info(f"TextInput Channel loaded with {text_input_df.count()} records")
except Exception as e:
    logger.error(f"Error loading TextInput Channel: {e}")
    # Re-raise instead of continuing: the join step (Node 70) reads
    # text_input_df and would otherwise fail later with an unrelated NameError.
    raise

# Load manual date-related fields (Node 24)
try:
    # Single-row frame: a fixed January 2023 start/end pair, the current
    # timestamp at execution time, and the same start/end as YYYYMMDD text.
    manual_date_df = spark.createDataFrame(
        [
            (
                datetime(2023, 1, 1),
                datetime(2023, 1, 31),
                datetime.now(),
                "20230101",
                "20230131",
            )
        ],
        [
            "Start Date",
            "End Date",
            "DateTime_Out",
            "StartTXT",
            "EndTXT",
        ],
    )
    logger.info(f"Manual Date fields loaded with {manual_date_df.count()} records")
except Exception as e:
    # Failure is only logged; execution continues with the next step.
    logger.error(f"Error loading Manual Date fields: {e}")

# Fetch the "SQL Server" extract (Node 48). The rows are actually read through
# fetch_data_from_postgres; the log messages keep the "SQL Server" wording.
try:
    sql_server_data = fetch_data_from_postgres(
        "SELECT * FROM tdmedpod WHERE BILL_DTE = '2019-07-01' AND WHS = 'D0CG';"
    )
    # Column names are assigned positionally to the tuples returned by the query.
    sql_server_df = spark.createDataFrame(
        sql_server_data,
        schema=[
            "DIST_CHNL_DESC", "RunDte", "SO_Date", "BILL_DTE", "Whs",
            "DIST_CHNL_ID", "FNC_ID", "FNC_DESC", "SOLDTO", "SHIPTO",
            "RFRNC_DOC_NUM", "Invoices", "Invoice_Lines", "LANDED_COST",
            "REV_COST", "DIRECT_STD_COST", "NET_REV_AMT", "Invoice_Sales",
            "EXT_SALES", "EXT_FINAL_PRICE", "SERVICE_FEE", "EXT_SHIP_HNDL",
            "EXT_SALES_TAX", "EXT_LOCAL_TAX", "BASE_QTY", "SELL_QTY", "WGT",
            "VOL", "Vendor_Trans_Absorb", "Vendor_Drop_Ship_Absorb",
            "Ext_Hndl_Drop_Absorb", "Vendor_MOC_Absorb", "Trans_Absorb_Amt",
            "Trans_Charge_Amt", "RESTOCK_Fee", "Special_Hndl_Amt",
            "Vendor_Hndl_Amt", "MOC_Amt", "Fuel_Surcharge",
            "BIA_Ship_Hndl_Amt", "Rush_Order_Fee", "Vendor_Trans_Charge",
            "Vendor_Drop_Ship_Fee", "Markup_Vendor_Trans", "Markup_Hndl_Fee",
            "COE_Ship_Hndl_Amt",
        ],
    )
    logger.info(f"SQL Server data loaded with {sql_server_df.count()} records")
except Exception as e:
    # Failure is only logged; execution continues with the next step.
    logger.error(f"Error fetching data from SQL Server: {e}")

# Load data from Unity Catalog tables for Dynamic Inputs (Nodes 18, 77, 89, 86, 84)
try:
    # One table per week; all five are combined by the union step (Node 78).
    week1_df = spark.table("catalog.db.dynamic_input_week1")
    week2_df = spark.table("catalog.db.dynamic_input_week2")
    week3_df = spark.table("catalog.db.dynamic_input_week3")
    week4_df = spark.table("catalog.db.dynamic_input_week4")
    week5_df = spark.table("catalog.db.dynamic_input_week5")
    logger.info("Dynamic Input data loaded from Unity Catalog tables")
except Exception as e:
    logger.error(f"Error loading Dynamic Input data: {e}")
    # Re-raise: every later step is derived from these frames, so continuing
    # would only produce a chain of NameErrors and no output.
    raise

# Union operation (Node 78)
try:
    # DataFrame.union matches columns by position, not by name.
    # NOTE(review): assumes all five weekly tables share one column order - confirm.
    union_df = (
        week1_df
        .union(week2_df)
        .union(week3_df)
        .union(week4_df)
        .union(week5_df)
    )
    logger.info(f"Union operation completed with {union_df.count()} records")
except Exception as e:
    logger.error(f"Error during Union operation: {e}")
    # Re-raise: union_df feeds the rename step, so the run cannot continue.
    raise

# Rename fields (Node 40)
try:
    # Map source field names to the names used by the rest of the pipeline.
    # withColumnRenamed leaves the frame unchanged when the source column is
    # absent, so a misspelled source name does not raise here.
    renamed_df = (
        union_df
        .withColumnRenamed("SO_AUDAT", "SO_Date")
        .withColumnRenamed("FKDAT", "BILL_DATE")
        .withColumnRenamed("WERKS", "Whs")
        .withColumnRenamed("VTWEG", "DIST_CHNL_ID")
        .withColumnRenamed("ZZFINCLASS", "FNC_ID")
        .withColumnRenamed("BEZEK", "FNC_DESC")
        .withColumnRenamed("SOLDTO_KUNNR", "SOLDTO")
        .withColumnRenamed("SHIPTO_KUNNR", "SHIPTO")
        .withColumnRenamed("VGBEL", "RFRNC_DOC_NUM")
        .withColumnRenamed("lines", "Invoice_lines")
    )
    logger.info("Fields renamed successfully")
except Exception as e:
    logger.error(f"Error renaming fields: {e}")
    # Re-raise: renamed_df is the input of the next transformation.
    raise

# MultiFieldFormula transformation (Node 67)
try:
    # For every column, replace NULL or empty-string values with 0 and keep
    # all other values as they are.
    transformed_df = renamed_df.select(
        *[
            F.when(F.col(c).isNull() | (F.col(c) == ""), 0)
            .otherwise(F.col(c))
            .alias(c)
            for c in renamed_df.columns
        ]
    )
    logger.info("MultiFieldFormula transformation applied")
except Exception as e:
    logger.error(f"Error applying MultiFieldFormula transformation: {e}")
    # Re-raise: transformed_df is the input of the cleanse step.
    raise

# Cleanse transformation (Node 44)
try:
    # Upper-case every column, keeping the original column names.
    # NOTE(review): upper() is applied to all columns, not only string ones;
    # confirm that non-string columns are meant to go through it as well.
    cleansed_df = transformed_df.select(
        *[F.upper(F.col(c)).alias(c) for c in transformed_df.columns]
    )
    logger.info("Cleanse transformation applied")
except Exception as e:
    logger.error(f"Error applying Cleanse transformation: {e}")
    # Re-raise: cleansed_df is the input of the Rush_Order_Fee step.
    raise

# Custom Calculation for Rush_Order_Fee (Node 60)
try:
    # Rush_Order_Fee is a straight copy of ADDTN_TRANS_FEE_OVRRIDE_ZSRO.
    rush_fee_df = cleansed_df.withColumn(
        "Rush_Order_Fee", F.col("ADDTN_TRANS_FEE_OVRRIDE_ZSRO")
    )
    logger.info("Rush_Order_Fee calculated")
except Exception as e:
    logger.error(f"Error calculating Rush_Order_Fee: {e}")
    # Re-raise: rush_fee_df is the input of the summarize step.
    raise

# Summarize Rush_Order_Fee (Node 42)
try:
    # Total Rush_Order_Fee per BILL_DATE. The result carries only the grouping
    # column and Sum_Rush_Order_Fee.
    summarize_df = rush_fee_df.groupBy("BILL_DATE").agg(
        F.sum("Rush_Order_Fee").alias("Sum_Rush_Order_Fee")
    )
    logger.info("Rush_Order_Fee summarized")
except Exception as e:
    logger.error(f"Error summarizing Rush_Order_Fee: {e}")
    # Re-raise: summarize_df is the input of the dynamic rename step.
    raise

# Dynamic Rename (Node 46)
try:
    # Remove every occurrence of "Sum_" from each column name, so that
    # Sum_Rush_Order_Fee becomes Rush_Order_Fee.
    dynamic_rename_df = summarize_df.select(
        *[F.col(c).alias(c.replace("Sum_", "")) for c in summarize_df.columns]
    )
    logger.info("Dynamic Rename transformation applied")
except Exception as e:
    logger.error(f"Error applying Dynamic Rename transformation: {e}")
    # Re-raise: dynamic_rename_df is the input of the Append Fields step.
    raise

# Append Fields (Node 56)
try:
    # F.expr() parses Spark SQL, which has no Alteryx-style [Field] reference
    # syntax; column names are quoted with backticks instead.
    # NOTE(review): the Dynamic Rename step (Node 46) strips "Sum_" from every
    # column of dynamic_rename_df, so the Sum_* names below (and the
    # VENDR_/MARKUP_ columns) need to be confirmed against the real upstream
    # frame - they are kept exactly as originally written.
    append_fields_df = (
        dynamic_rename_df
        .withColumn(
            "BIA_SHIP_HNDL_AMT",
            F.expr(
                "`Sum_Trans_Charge_Amt` + `Sum_RESTOCK_Fee`"
                " + `Sum_Special_Hndl_Amt` + `Sum_Vendor_Hndl_Amt`"
                " + `Sum_MOC_Amt` + `Sum_Fuel_Surcharge`"
            ),
        )
        .withColumn(
            "COE_SHIP_HNDL_AMT",
            F.expr(
                "`Sum_Trans_Charge_Amt` + `Sum_RESTOCK_Fee`"
                " + `Sum_Special_Hndl_Amt` + `Sum_Vendor_Hndl_Amt`"
                " + `Sum_MOC_Amt` + `Sum_Fuel_Surcharge`"
                " + `Rush_Order_Fee` + `VENDR_TRANS_CHRG_FRT_ZTV1`"
                " + `MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1`"
            ),
        )
    )
    logger.info("Fields appended successfully")
except Exception as e:
    logger.error(f"Error appending fields: {e}")
    # Re-raise: append_fields_df is the input of the Invoice_Sales step.
    raise

# Formula: Invoice_Sales (Node 68)
try:
    # Invoice_Sales is a straight copy of Sum_EXT_FINAL_PRICE.
    # NOTE(review): the Dynamic Rename step (Node 46) strips "Sum_" from column
    # names, so confirm that Sum_EXT_FINAL_PRICE really exists at this point.
    invoice_sales_df = append_fields_df.withColumn(
        "Invoice_Sales", F.col("Sum_EXT_FINAL_PRICE")
    )
    logger.info("Invoice_Sales calculated")
except Exception as e:
    logger.error(f"Error calculating Invoice_Sales: {e}")
    # Re-raise: invoice_sales_df is the input of the join step.
    raise

# Join operation (Node 70)
try:
    # Left join onto the channel lookup: every invoice row is kept, and
    # DIST_CHNL / DIST_CHNL_DESC are filled where DIST_CHNL_ID matches DIST_CHNL.
    joined_df = invoice_sales_df.join(
        text_input_df,
        invoice_sales_df.DIST_CHNL_ID == text_input_df.DIST_CHNL,
        "left",
    )
    logger.info("Join operation completed")
except Exception as e:
    logger.error(f"Error during Join operation: {e}")
    # Re-raise: joined_df is the input of the following union step.
    raise

# Union operation (Node 71)
try:
    # joined_df carries the two lookup columns (DIST_CHNL, DIST_CHNL_DESC) that
    # invoice_sales_df lacks, so a positional union() fails on the column-count
    # mismatch. Union by name and fill the missing columns with NULL instead.
    # NOTE(review): joined_df is a LEFT join of invoice_sales_df, so this union
    # repeats every invoice row - confirm that is the intended result.
    final_union_df = joined_df.unionByName(
        invoice_sales_df, allowMissingColumns=True
    )
    logger.info(f"Final Union operation completed with {final_union_df.count()} records")
except Exception as e:
    logger.error(f"Error during final Union operation: {e}")
    # Re-raise: final_union_df is the input of the null_yn step.
    raise

# Custom Calculation for null_yn (Node 79)
try:
    # null_yn is "Y" when any of FNC_ID, Whs, DIST_CHNL_ID, SOLDTO or SHIPTO is
    # NULL or an empty string, otherwise "N". The first matching branch wins.
    null_yn_df = final_union_df.withColumn(
        "null_yn",
        F.when(F.col("FNC_ID").isNull() | (F.col("FNC_ID") == ""), "Y")
        .when(F.col("Whs").isNull() | (F.col("Whs") == ""), "Y")
        .when(F.col("DIST_CHNL_ID").isNull() | (F.col("DIST_CHNL_ID") == ""), "Y")
        .when(F.col("SOLDTO").isNull() | (F.col("SOLDTO") == ""), "Y")
        .when(F.col("SHIPTO").isNull() | (F.col("SHIPTO") == ""), "Y")
        .otherwise("N"),
    )
    logger.info("null_yn field calculated")
except Exception as e:
    logger.error(f"Error calculating null_yn field: {e}")
    # Re-raise: null_yn_df is the input of the filter step.
    raise

# Filter Tool (Node 80)
try:
    # Keep only the rows flagged as having a NULL/empty key field.
    filtered_df = null_yn_df.filter(F.col("null_yn") == "Y")
    logger.info(f"Filter operation completed with {filtered_df.count()} records")
except Exception as e:
    logger.error(f"Error during Filter operation: {e}")
    # Re-raise: filtered_df is the input of the UPDATE NULL step.
    raise

# UPDATE NULL for FNC_ID and FNC_DESC (Node 81)
try:
    # Default a NULL/empty FNC_ID to "OTH" and a NULL/empty FNC_DESC to "OTHER";
    # any other value is kept.
    updated_null_df = (
        filtered_df
        .withColumn(
            "FNC_ID",
            F.when(F.col("FNC_ID").isNull() | (F.col("FNC_ID") == ""), "OTH")
            .otherwise(F.col("FNC_ID")),
        )
        .withColumn(
            "FNC_DESC",
            F.when(F.col("FNC_DESC").isNull() | (F.col("FNC_DESC") == ""), "OTHER")
            .otherwise(F.col("FNC_DESC")),
        )
    )
    logger.info("UPDATE NULL transformation applied")
except Exception as e:
    logger.error(f"Error applying UPDATE NULL transformation: {e}")
    # Re-raise: updated_null_df is the input of the following union step.
    raise

# Union operation (Node 82)
try:
    # Recombine the repaired rows with the rows that did NOT trip the null
    # check. Unioning with the whole of final_union_df fails on the column
    # count (updated_null_df has the extra null_yn column) and would also
    # bring back the unrepaired copy of every flagged row.
    # null_yn is always "Y" or "N", so == "N" is the complement of the filter.
    # NOTE(review): assumes Node 82 unions the filter's two branches - confirm
    # against the source workflow.
    union_final_df = updated_null_df.unionByName(
        null_yn_df.filter(F.col("null_yn") == "N")
    )
    logger.info(f"Union operation completed with {union_final_df.count()} records")
except Exception as e:
    logger.error(f"Error during Union operation: {e}")
    # Re-raise: union_final_df is the input of the select step.
    raise

# Select Tool (Node 83)
try:
    # Project the output columns in their final order.
    # NOTE(review): several names here ("Sum_Invoice_Lines", "Run Date",
    # "START_2WK", "END_2WK", the manual date fields) are not produced by any
    # visible upstream step - confirm they exist on union_final_df.
    select_df = union_final_df.select(
        "SO_Date",
        "BILL_DATE",
        "Whs",
        "DIST_CHNL_ID",
        "FNC_ID",
        "FNC_DESC",
        "SOLDTO",
        "SHIPTO",
        "RFRNC_DOC_NUM",
        "Rush_Order_Fee",
        "Sum_Invoice_Lines",
        "Run Date",
        "StartTXT",
        "EndTXT",
        "Start Date",
        "START_2WK",
        "END_2WK",
        "End Date",
        "BIA_SHIP_HNDL_AMT",
        "COE_SHIP_HNDL_AMT",
        "Invoice_Sales",
        "DIST_CHNL",
        "DIST_CHNL_DESC",
    )
    logger.info("Select Tool transformation applied")
except Exception as e:
    logger.error(f"Error applying Select Tool transformation: {e}")
    # Re-raise: select_df is the input of the final rename step.
    raise

# Alteryx Select (Node 57)
try:
    # Rename "Run Date" to "RunDTE"; this is the frame that gets written out.
    alteryx_select_df = select_df.withColumnRenamed("Run Date", "RunDTE")
    logger.info("Alteryx Select transformation applied")
except Exception as e:
    logger.error(f"Error applying Alteryx Select transformation: {e}")
    # Re-raise: alteryx_select_df is what the write step persists.
    raise

# Write to Unity Catalog target table
try:
    target_catalog = "catalog_name"
    target_schema = "schema_name"
    target_table = "table_name"

    # Create schema if it doesn't exist
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_catalog}.{target_schema}")
    logger.info(f"Schema {target_catalog}.{target_schema} ensured")

    # Write to Unity Catalog target table (overwrite mode handles table replacement)
    alteryx_select_df.write.format("delta").mode("overwrite").saveAsTable(
        f"{target_catalog}.{target_schema}.{target_table}"
    )
    logger.info(f"Data written to {target_catalog}.{target_schema}.{target_table}")
except Exception as e:
    logger.error(f"Error writing data to Unity Catalog: {e}")
    # Re-raise so a failed write fails the run instead of ending "successfully"
    # with no output table written.
    raise
