# In [None]:
import logging
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DecimalType

# Configure logging: request INFO-level output on the root logger and create a
# logger named after this module for the rest of the script to use.
# NOTE(review): logging.basicConfig() is a no-op when the root logger already
# has handlers (the hosting runtime may have installed some) — confirm INFO
# records actually appear where this runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Helper function to calculate date ranges
def calculate_date_ranges(today=None):
    """Return the reporting windows anchored on a reference instant.

    Args:
        today: Reference ``datetime`` the windows are computed from. When
            omitted, ``datetime.today()`` (naive local time) is used, which is
            the original behaviour. Passing an explicit value makes the result
            reproducible for tests and backfills.

    Returns:
        dict mapping, in this order:
            ``today``, ``yesterday``
            ``prior_week_start`` / ``prior_week_end`` - Monday and Sunday of
                the week before the one containing ``today``
                (``datetime.weekday()`` counts Monday as 0)
            ``p1m_*``, ``p2m_*``, ``p3m_*`` - first and last day of the
                1st, 2nd and 3rd calendar month before ``today``'s month

        Every value keeps the time-of-day of ``today``; nothing is truncated
        to midnight, so callers comparing against pure dates should take
        ``.date()`` themselves.
    """
    if today is None:
        today = datetime.today()

    one_day = timedelta(days=1)
    days_since_monday = today.weekday()

    ranges = {
        "today": today,
        "yesterday": today - one_day,
        "prior_week_start": today - timedelta(days=days_since_monday + 7),
        "prior_week_end": today - timedelta(days=days_since_monday + 1),
    }

    # Walk backwards one calendar month at a time: the day before the first of
    # a month is the last day of the previous month, and replacing its day
    # with 1 gives that previous month's first day.
    month_start = today.replace(day=1)
    for label in ("p1m", "p2m", "p3m"):
        month_end = month_start - one_day
        month_start = month_end.replace(day=1)
        ranges[f"{label}_start"] = month_start
        ranges[f"{label}_end"] = month_end

    return ranges

# Calculate date ranges
# NOTE(review): date_ranges is not referenced anywhere else in the visible
# script — confirm whether a date filter was meant to be applied.
date_ranges = calculate_date_ranges()


def _is_blank(column_name):
    """Return a Column predicate that is true when the column is NULL or ''."""
    column = F.col(column_name)
    return column.isNull() | (column == "")


# ETL run: load the two source tables, rename/derive columns, combine them,
# patch blank finance-class keys and overwrite the target Delta table.
#
# Several steps below are flagged NOTE(review): they reference columns that no
# earlier step in this script produces. They are left as written because the
# correct fix depends on the source table schemas, which are not visible here.
try:
    # Load data from Unity Catalog tables
    sales_df = spark.table("catalog.source_db.sales")
    line_totals_df = spark.table("catalog.source_db.line_totals")

    # DateTime Conversion: parse the yyyy-MM-dd text columns into dates.
    sales_df = sales_df.withColumn("SO_AUDAT", F.to_date(sales_df["SO_AUDAT"], "yyyy-MM-dd"))
    line_totals_df = line_totals_df.withColumn("FKDAT", F.to_date(line_totals_df["FKDAT"], "yyyy-MM-dd"))

    # Select and Rename Fields
    sales_df = sales_df.select(
        F.col("SO_AUDAT").alias("SO_Date"),
        F.col("WERKS").alias("Whs"),
        F.col("VTWEG").alias("DIST_CHNL_ID"),
        F.col("ZZFINCLASS").alias("FNC_ID"),
        F.col("SOLDTO_KUNNR").alias("SOLDTO"),
        F.col("SHIPTO_KUNNR").alias("SHIPTO"),
        F.col("VGBEL").alias("RFRNC_DOC_NUM"),
        F.col("Distinct_VBELN_Count").alias("Invoice_Lines"),
        F.col("UNIT_LAND_COST").alias("Unit_Land_cost"),
        F.col("SO_NETWR").alias("SO_NetValue_Amt"),
        F.col("SO_NETPR").alias("SO_NetPrice_Amt"),
        # Rush_Order_Fee is derived here, inside the projection. It used to be
        # added afterwards with withColumn, by which point the projection had
        # already dropped ADDTN_TRANS_FEE_OVRRIDE_ZSRO and the reference could
        # not resolve.
        # NOTE(review): assumes the sales table carries this column, as the
        # original statement did — confirm.
        F.col("ADDTN_TRANS_FEE_OVRRIDE_ZSRO").alias("Rush_Order_Fee"),
    )

    # NOTE(review): WGT, VOL, Trans_Charge_Amt, Special_Hndl_Amt and MOC_Amt are
    # each produced twice below (from different source columns), so the
    # projected frame has duplicate column names. Decide whether the pairs
    # should be merged or given distinct names before anything references them.
    line_totals_df = line_totals_df.select(
        F.col("FKDAT").alias("BILL_DATE"),
        F.col("BILL_ITM_COUNT").alias("Invoice_Lines"),
        F.col("EXTND_LAND_CST").alias("Landed_cost"),
        F.col("EXTND_FNL_PRICE").alias("Extended_Final_Price"),
        F.col("EXTND_SERVC_FEE").alias("Service_Fee"),
        F.col("EXTND_SHPNG_HNDLNG").alias("Ext_Ship_Hndl"),
        F.col("XTND_STATE_TX").alias("Ext_State_Tax"),
        F.col("EXTND_LCL_TX").alias("Ext_Local_Tax"),
        F.col("BASE_QTY").alias("BASE_QTY"),
        F.col("SELL_QTY").alias("SELL_QTY"),
        F.col("WGT").alias("WGT"),
        F.col("VOL").alias("VOL"),
        F.col("RF_TRNSCT_ABSRB_CHRGE_AMT_ZTV2").alias("Vendor_Trans_Absorb"),
        F.col("VNDR_DRP_SHP_ABSRB_AMT_ZSS2_M2").alias("Vendor_Drop_Ship_Absorb"),
        F.col("EX_HDNL_DRP_SHP_VAL_ZSSH").alias("Ext_Hndl_Drop_Absorb"),
        F.col("RF_VENDOR_MOC_ABSRB_AMT_ZSM2").alias("Vendor_MOC_Absorb"),
        F.col("RF_TRNSCT_ABSORB_CHARGE_AMT_ZTR2").alias("Trans_Absorb_Amt"),
        F.col("TRANS_CHRGS_FRT_ZTR1").alias("Trans_Charge_Amt"),
        F.col("RESTOOCKING_FEE_ZSRF").alias("RESTOCK_Fee"),
        F.col("RESTOCK_FEE_MANUAL_ZSRM").alias("RESTOCK_Fee_Man"),
        F.col("SPCL_HNDLNG_CHRG_FX_ZH01").alias("Special_Hndl_Amt"),
        F.col("VNDR_HNDLNG_AMT_ZTHM").alias("Vendor_Hndl_Amt"),
        F.col("MIN_ORDER_CHARGE_USD_ZSMO").alias("MOC_Amt"),
        F.col("MOC_DROP_SHP_VAL_ZSSM").alias("MOC_Drop_Amt"),
        F.col("FUEL_SURCHARGE_ZSDF").alias("Fuel_Surcharge"),
        F.col("FUEL_SURCHARGE_OVERIDE_ZSDO").alias("Fuel_Override_amt"),
        F.col("RF_VNDR_DRP_SHP_FEE_AMT_ZSSM_F_ZTHM").alias("Vendor_Drop_Ship_fee"),
        F.col("MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1").alias("Markup_Vendor_Trans"),
        F.col("MARK_UP_DND_ZMVT").alias("Markup_Hndl_Fee"),
        F.col("BULK_DIST_FEE_DLR_ZMGB").alias("Bulk_Dist_Fee"),
        F.col("BCF_RF_EXTND_VLINK_SVC_FEE").alias("VL_Srvc_Fee"),
        F.col("XTND_N_VLNK_SVC_FEE_ZVC12M1M3NM").alias("Ext_VL_Svc_Fee"),
        F.col("LOW_UOM_DIST_FEE_DLR_ZMGL").alias("LUM_Dist_Fee"),
        F.col("ONSITE_REP_FEE_DLR_ZMGO").alias("Onsite_Rep_Fee"),
        F.col("HLDY_DLVRY_FEE_DLR_ZMGD").alias("HLDY_Dlvr_Fee"),
        F.col("VBRP_BRGEW").alias("WGT"),
        F.col("VBRP_VOLUM").alias("VOL"),
        F.col("RF_TRNSCT_CHARGE_AMT_ZTRM").alias("Trans_Charge_Amt"),
        F.col("RF_SPECIAL_HNDLNG_CHARGE_AMT_ZH01").alias("Special_Hndl_Amt"),
        F.col("RF_MIN_ORDER_CHARGE_AMT_ZSMO").alias("MOC_Amt")
    )

    # Custom Calculations
    # NOTE(review): the expressions below use [square-bracket] field references
    # and Sum_-prefixed names. Spark SQL quotes identifiers with backticks, so
    # the bracketed text is expected to fail to parse, and no step in this
    # script produces Sum_* columns (there is no aggregation). The COE formula
    # also names Rush_Order_Fee, which lives on sales_df, plus two source
    # columns the projection above does not carry under those names. The
    # intended aggregation and frame cannot be determined from this script, so
    # the expressions are left verbatim for the owner to correct.
    line_totals_df = line_totals_df.withColumn("BIA_SHIP_HNDL_AMT", F.expr(
        "[Sum_Trans_Charge_Amt]+[Sum_RESTOCK_Fee]+[Sum_Special_Hndl_Amt]+[Sum_Vendor_Hndl_Amt]+[Sum_MOC_Amt]+[Sum_Fuel_Surcharge]"
    ))
    line_totals_df = line_totals_df.withColumn("COE_SHIP_HNDL_AMT", F.expr(
        "[Sum_Trans_Charge_Amt]+[Sum_RESTOCK_Fee]+[Sum_Special_Hndl_Amt]+[Sum_Vendor_Hndl_Amt]+[Sum_MOC_Amt]+[Sum_Fuel_Surcharge]+[Rush_Order_Fee]+[VENDR_TRANS_CHRG_FRT_ZTV1]+[MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1]"
    ))
    line_totals_df = line_totals_df.withColumn("Invoice_Sales", F.col("Sum_EXT_FINAL_PRICE"))

    # Multi-Field Formula: Replace nulls with 0
    # NOTE(review): this list mixes renamed output names with original source
    # names. After the projections above most of the source names are no longer
    # columns on either frame, and some entries (e.g. BILL_DATE on sales_df,
    # SO_Date on line_totals_df, FNC_DESC on both) are never present, so
    # F.col(field) will not resolve for them. It also zero-fills the key
    # columns (FNC_ID, Whs, DIST_CHNL_ID, SOLDTO, SHIPTO) that the null-key
    # check further down tests for NULL/''. Confirm the intended field set and
    # ordering before relying on this step.
    fields_to_replace_nulls = [
        "SO_Date", "BILL_DATE", "Whs", "DIST_CHNL_ID", "FNC_ID", "FNC_DESC", "SOLDTO", "SHIPTO", "RFRNC_DOC_NUM",
        "Invoice_Lines", "Distinct_VBELN_Count", "BILL_ITM_COUNT", "SO_NETWR", "SO_NETPR", "FKLMG", "FKIMG", "ZTV2",
        "ZSS2_M2", "EXTND_LAND_CST", "XTND_INVOICE_PRICE_PER_ITM", "EXTND_FNL_PRICE", "EXTND_SERVC_FEE",
        "EXTND_SHPNG_HNDLNG", "XTND_STATE_TX", "EXTND_LCL_TX", "AVG_INVOICE_PRICE", "EXTND_FNL_PRICE1", "VBRP_BRGEW",
        "VBRP_VOLUM", "RF_TRNSCT_ABSRB_CHRGE_AMT_ZTV2", "VNDR_DRP_SHP_ABSRB_AMT_ZSS2_M2", "EX_HDNL_DRP_SHP_VAL_ZSSH",
        "RF_VENDOR_MOC_ABSRB_AMT_ZSM2", "RF_TRNSCT_ABSORB_CHARGE_AMT_ZTR2", "RF_TRNSCT_CHARGE_AMT_ZTRM",
        "RF_SPECIAL_HNDLNG_CHARGE_AMT_ZH01", "RF_MIN_ORDER_CHARGE_AMT_ZSMO", "TRANS_CHRGS_FRT_ZTR1",
        "RESTOOCKING_FEE_ZSRF", "RESTOCK_FEE_MANUAL_ZSRM", "SPCL_HNDLNG_CHRG_FX_ZH01", "VNDR_HNDLNG_AMT_ZTHM",
        "MIN_ORDER_CHARGE_USD_ZSMO", "MOC_DROP_SHP_VAL_ZSSM", "FUEL_SURCHARGE_ZSDF", "FUEL_SURCHARGE_OVERIDE_ZSDO",
        "VENDR_TRANS_CHRG_FRT_ZTV1", "RF_VNDR_DRP_SHP_FEE_AMT_ZSSM_F_ZTHM", "MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1",
        "MARK_UP_DND_ZMVT", "ADDTN_TRANS_FEE_ZSRH", "ADDTN_TRANS_FEE_OVRRIDE_ZSRO", "DROPSHIP_FEE_VALUE_ZSSF",
        "XTND_N_VLNK_SVC_FEE_ZVC12M1M3NM", "ONSITE_REP_FEE_DLR_ZMGO", "BULK_DIST_FEE_DLR_ZMGB", "HLDY_DLVRY_FEE_DLR_ZMGD",
        "LOW_UOM_DIST_FEE_DLR_ZMGL", "GOV_DIST_FEE_DLR_ZMGN", "BCF_RF_EXTND_VLINK_SVC_FEE"
    ]

    for field in fields_to_replace_nulls:
        sales_df = sales_df.withColumn(field, F.when(_is_blank(field), 0).otherwise(F.col(field)))
        line_totals_df = line_totals_df.withColumn(field, F.when(_is_blank(field), 0).otherwise(F.col(field)))

    # Join and Union operations
    # NOTE(review): line_totals_df, as projected above, has no DIST_CHNL_ID
    # column to join on. DataFrame.union also matches columns by position and
    # needs both sides to have the same number of columns, which joined_df and
    # line_totals_df do not. Both statements are left as written pending the
    # real key columns and intended result shape.
    joined_df = sales_df.join(line_totals_df, sales_df["DIST_CHNL_ID"] == line_totals_df["DIST_CHNL_ID"], "inner")
    final_df = joined_df.union(line_totals_df)

    # Flag records where any key column is NULL or empty.
    key_columns = ["FNC_ID", "Whs", "DIST_CHNL_ID", "SOLDTO", "SHIPTO"]
    any_key_blank = _is_blank(key_columns[0])
    for key_column in key_columns[1:]:
        any_key_blank = any_key_blank | _is_blank(key_column)
    final_df = final_df.withColumn("null_yn", F.when(any_key_blank, "Y").otherwise("N"))

    # Split on the flag so the patched rows REPLACE their originals. null_yn is
    # always exactly "Y" or "N", so the two filters partition the frame.
    # Unioning the patched rows onto the full frame would keep the unpatched
    # originals as well and duplicate every flagged record.
    flagged_df = final_df.filter(F.col("null_yn") == "Y")
    unflagged_df = final_df.filter(F.col("null_yn") == "N")

    # Update null values on the flagged rows only.
    # NOTE(review): FNC_DESC is not produced by either projection above —
    # confirm where it is meant to come from.
    updated_df = (
        flagged_df
        .withColumn("FNC_ID", F.when(_is_blank("FNC_ID"), "OTH").otherwise(F.col("FNC_ID")))
        .withColumn("FNC_DESC", F.when(_is_blank("FNC_DESC"), "OTHER").otherwise(F.col("FNC_DESC")))
    )

    # Recombine: untouched rows + patched rows (same columns, same order).
    final_df = unflagged_df.union(updated_df)

    # Write to Unity Catalog target table.
    # A Delta overwrite replaces the table contents in one commit, so there is
    # no separate DROP TABLE first: a drop would leave the target missing (and
    # its history gone) if the write that follows fails. overwriteSchema lets
    # the overwrite also replace the schema, which the drop-and-recreate used
    # to achieve.
    (
        final_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("catalog.target_db.target_table")
    )

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Top-level boundary: log with traceback, then re-raise so the job fails.
    logger.exception("Error during ETL process: %s", e)
    raise
