In [None]:
import logging
from datetime import datetime
from pyspark.sql import functions as F

# Dedicated, named logger so pipeline records are identifiable in shared output.
logger = logging.getLogger("GA_Data_Processing_Pipeline")
# Root configuration: emit INFO and above (basicConfig does nothing if the
# root logger already has handlers attached).
logging.basicConfig(level=logging.INFO)

# GA data-processing pipeline.
#
# Loads the source tables, derives date columns, flags/cleanses/enriches the
# tdmedpod extract, aggregates charges per BILL_DATE, joins the channel
# attributes and persists the joined result as a Delta table.
#
# Relies on the notebook-provided `spark` session and the module-level
# `logger`. Any failure is logged with its traceback and re-raised so the
# run is reported as failed.
try:
    # Step 1: Load Data Sources
    logger.info("Loading data sources...")

    # Load static distribution channel information
    channel_df = spark.table("catalog.source_db.channel")

    # Load manually entered date fields
    manual_date_df = spark.table("catalog.source_db.manual_date")

    # Load tdmedpod table
    # NOTE(review): this filter uses BILL_DTE while the cleansing and
    # aggregation steps below use BILL_DATE -- confirm which column name the
    # table actually has.
    tdmedpod_df = spark.sql("""
        SELECT * FROM catalog.source_db.tdmedpod
        WHERE BILL_DTE = '2019-07-01' AND Whs = 'D0CG'
    """)

    # Load dbo.tableupdated_new_1000 table
    # NOTE(review): VGBEL is also a GROUP BY key, so COUNT(DISTINCT VGBEL) can
    # never exceed 1 per output row -- confirm the intended grain.
    tableupdated_df = spark.sql("""
        SELECT 
            SO_AUDAT, FKDAT, WERKS, VTWEG, ZZFINCLASS, BEZEK, SOLDTO_KUNNR, SHIPTO_KUNNR, VGBEL,
            COUNT(DISTINCT VGBEL) AS Distinct_VBELN_Count,
            SUM(BILL_ITM_COUNT) AS BILL_ITM_COUNT,
            SUM(UNIT_LAND_COST) AS UNIT_LAND_COST,
            SUM(SO_NETWR) AS SO_NETWR,
            SUM(SO_NETPR) AS SO_NETPR
        FROM catalog.source_db.tableupdated_new_1000
        WHERE FKDAT BETWEEN '1999-01-01' AND '1999-01-31'
        GROUP BY SO_AUDAT, FKDAT, WERKS, VTWEG, ZZFINCLASS, BEZEK, SOLDTO_KUNNR, SHIPTO_KUNNR, VGBEL
    """)

    # Step 2: Date Transformations
    logger.info("Performing date transformations...")

    # Run date as a 'YYYY-MM-DD' string taken from the driver's clock.
    current_date = datetime.now().strftime('%Y-%m-%d')
    manual_date_df = manual_date_df.withColumn("DateTime_Out", F.lit(current_date))

    # Calculate date ranges relative to Spark's current_date().
    # NOTE(review): P1M_Start is "same day one month ago", not the first day
    # of the prior month, whereas P1M_End is the last day of the prior month
    # -- confirm this asymmetric range is intended.
    manual_date_df = manual_date_df.withColumn("Prior_Week_Start", F.date_sub(F.current_date(), 7)) \
                                   .withColumn("Prior_Week_End", F.date_sub(F.current_date(), 1)) \
                                   .withColumn("Yesterday", F.date_sub(F.current_date(), 1)) \
                                   .withColumn("Today", F.current_date()) \
                                   .withColumn("P1M_Start", F.add_months(F.current_date(), -1)) \
                                   .withColumn("P1M_End", F.last_day(F.add_months(F.current_date(), -1)))

    # Step 3: Data Cleansing
    logger.info("Performing data cleansing...")

    # Identify null values in primary key fields.
    # This must run BEFORE the fillna() below: once nulls are replaced, any
    # numeric key column could no longer be detected as missing.
    tdmedpod_df = tdmedpod_df.withColumn("null_yn", F.when(
        F.col("FNC_ID").isNull() | F.col("Whs").isNull() | F.col("DIST_CHNL_ID").isNull() |
        F.col("SOLDTO").isNull() | F.col("SHIPTO").isNull(), "Y").otherwise("N"))

    # Replace nulls with 0. With a numeric fill value Spark only fills the
    # numeric columns in `subset`; columns of other types are ignored.
    tdmedpod_df = tdmedpod_df.fillna(0, subset=["SO_Date", "BILL_DATE", "Whs", "DIST_CHNL_ID", "FNC_ID", "FNC_DESC"])

    # Step 4: Data Aggregation
    logger.info("Performing data aggregation...")

    # Every measure is summed per BILL_DATE and exposed as "Sum_<column>".
    # The last three columns are required by COE_SHIP_HNDL_AMT in Step 5;
    # without aggregating them here they do not exist on aggregated_df.
    # NOTE(review): assumes these three columns exist in tdmedpod (names are
    # taken from the COE_SHIP_HNDL_AMT expression) -- confirm against the schema.
    sum_columns = [
        "Invoices",
        "Invoice_Lines",
        "LANDED_COST",
        "EXT_FINAL_PRICE",
        "Trans_Charge_Amt",
        "RESTOCK_Fee",
        "Special_Hndl_Amt",
        "Vendor_Hndl_Amt",
        "MOC_Amt",
        "Fuel_Surcharge",
        "Rush_Order_Fee",
        "VENDR_TRANS_CHRG_FRT_ZTV1",
        "MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1",
    ]
    aggregated_df = tdmedpod_df.groupBy("BILL_DATE").agg(
        *[F.sum(name).alias(f"Sum_{name}") for name in sum_columns]
    )

    # Step 5: Custom Calculations
    logger.info("Performing custom calculations...")

    # BIA shipping & handling = sum of the six base charge totals.
    aggregated_df = aggregated_df.withColumn("BIA_SHIP_HNDL_AMT", 
        F.col("Sum_Trans_Charge_Amt") + F.col("Sum_RESTOCK_Fee") + F.col("Sum_Special_Hndl_Amt") +
        F.col("Sum_Vendor_Hndl_Amt") + F.col("Sum_MOC_Amt") + F.col("Sum_Fuel_Surcharge")
    )

    # COE shipping & handling = BIA amount plus the three additional fee totals.
    aggregated_df = aggregated_df.withColumn("COE_SHIP_HNDL_AMT", 
        F.col("BIA_SHIP_HNDL_AMT") + F.col("Sum_Rush_Order_Fee") + F.col("Sum_VENDR_TRANS_CHRG_FRT_ZTV1") +
        F.col("Sum_MARKUP_VENDOR_TRANS_FEE_AMT_ZMT1")
    )

    aggregated_df = aggregated_df.withColumn("Invoice_Sales", F.col("Sum_EXT_FINAL_PRICE"))

    # Step 6: Data Enrichment
    logger.info("Performing data enrichment...")

    # Default missing financial-class id/description to "OTH"/"OTHER".
    tdmedpod_df = tdmedpod_df.withColumn("FNC_ID", F.when(F.col("FNC_ID").isNull(), "OTH").otherwise(F.col("FNC_ID"))) \
                             .withColumn("FNC_DESC", F.when(F.col("FNC_DESC").isNull(), "OTHER").otherwise(F.col("FNC_DESC")))

    # Step 7: Data Integration
    logger.info("Performing data integration...")

    # Left join keeps every tdmedpod row even when no channel row matches.
    final_df = tdmedpod_df.join(channel_df, tdmedpod_df["DIST_CHNL_ID"] == channel_df["DIST_CHNL"], "left")

    # Step 8: Output
    logger.info("Writing final output to Unity Catalog...")

    # Overwrite in place instead of DROP + write: a Delta overwrite is a single
    # atomic commit, so a failed write leaves the previous table intact and
    # table history is preserved. overwriteSchema keeps the ability to change
    # the schema that the former DROP TABLE provided.
    final_df.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable("catalog.target_db.final_output")

    logger.info("Pipeline execution completed successfully.")

except Exception as e:
    # logger.exception records the traceback along with the message.
    logger.exception("Pipeline execution failed: %s", e)
    raise
