# In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql import Window

# Route INFO-and-above records through the root handler and log under this
# module's own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Example reporting window (start date, end date) used by the parameterized
# DATE_ID filter further down.
FROM_PARM, TO_PARM = "2023-01-01", "2023-12-31"

try:
    # Load the source tables from Unity Catalog.
    transaction_items_df = spark.table("catalog.salesmi_db.salesmiTRANSACTION_ITEMS")
    transaction_source_reasons_df = spark.table("catalog.salesmi_db.salesmitransaction_source_reasons")
    products_df = spark.table("catalog.salesmi_db.salesmiPRODUCTS")
    transaction_item_reasons_df = spark.table("catalog.salesmi_db.salesmiTRANSACTION_ITEM_REASONS")
    agreements_df = spark.table("catalog.salesmi_db.salesmiAGREEMENTS")

    # Inner-join transaction items (a11) to products (a13), item reasons (a14),
    # source reasons (a12) and agreements (a15).
    joined_df = (
        transaction_items_df.alias("a11")
        .join(products_df.alias("a13"), F.col("a11.PROD_ID") == F.col("a13.PROD_ID"))
        .join(
            transaction_item_reasons_df.alias("a14"),
            F.col("a11.TRAN_ITEM_REASON_ID") == F.col("a14.TRAN_ITEM_REASON_ID"),
        )
        .join(
            transaction_source_reasons_df.alias("a12"),
            F.col("a14.tran_src_reason_id") == F.col("a12.tran_src_reason_id"),
        )
        .join(agreements_df.alias("a15"), F.col("a11.AGRMT_ID") == F.col("a15.AGRMT_ID"))
    )

    # Row-level filters (the WHERE clause).
    excluded_src_reason_ids = [
        5219, 5225, 5213, 5286, 5221, 5226, 5214, 5287, 5222, 5283,
        5215, 5288, 975, 5284, 5216, 6364, 1053, 5285, 5218, 1014,
        1016, 5227, 5217, 5211, 1017, 5228, 5220, 5212,
    ]
    filtered_df = joined_df.filter(
        (F.col("a13.PROD_LINE_ID").isin(12, 9, 10, 3)) &
        (F.col("a14.TRAN_CATEGORY_ID").isin(5, 6, 7)) &
        (~F.col("a14.TRAN_SRC_REASON_ID").isin(excluded_src_reason_ids)) &
        (F.col("a15.AGRMT_ISSUE_DATE_ID") > '2004-04-01') &
        (~F.col("a15.AGRMT_SRC_ID").like('N0%')) &
        (F.col("a11.TRAN_EFF_DATE_ID") < F.col("a15.AGRMT_ISSUE_DATE_ID") + 916) &
        (F.col("a11.DATE_ID").between(FROM_PARM, TO_PARM))
    )

    # Owner name pieces joined with ", ". NULL parts become empty strings;
    # F.lit('') is required because a bare '' would be read as a column name.
    owner_name = F.concat_ws(
        ", ",
        F.coalesce(F.col("a15.AGRMT_OWNER_PARTY_NAME"), F.lit('')),
        F.coalesce(F.col("a15.AGRMT_OWNER_FIRST_NAME"), F.lit('')),
        F.coalesce(F.col("a15.AGRMT_OWNER_MIDDLE_INITIAL"), F.lit('')),
    )

    # For source system '55' loan rows, negative amounts are kept and positive
    # amounts are counted as 0; every other row contributes its amount as-is.
    is_loan_row = (
        (F.col("a15.SRC_SYSTEM_ID") == '55') &
        (F.col("a12.TRAN_SRC_REASON") == 'LOAN (LOAN AMOUNT)')
    )
    amount = (
        F.when(is_loan_row & (F.col("a11.DOLLAR_AMT") < 0), F.col("a11.DOLLAR_AMT"))
        .when(is_loan_row & (F.col("a11.DOLLAR_AMT") > 0), 0)
        .otherwise(F.col("a11.DOLLAR_AMT"))
    )

    # Policy age: 1 + days between issue and transaction effective date / 366.
    # datediff returns whole days; subtracting two date columns with "-" would
    # produce an interval that cannot be added to 1.
    # NOTE(review): assumes both *_DATE_ID columns are DATE typed, as the
    # 'YYYY-MM-DD' comparisons in the filter suggest - confirm against schema.
    policy_age = 1 + F.datediff(
        F.col("a11.TRAN_EFF_DATE_ID"), F.col("a15.AGRMT_ISSUE_DATE_ID")
    ) / 366

    # Group, aggregate and apply the HAVING condition in a single step so the
    # grouping keys are still resolvable and the aggregates are legal.
    # NOTE(review): the grouping keys do not include an agreement identifier,
    # so agreements sharing all keys collapse into one row - confirm intended.
    grouped_df = (
        filtered_df.groupBy(
            F.col("a14.TRAN_CATEGORY_ID"),
            F.col("a15.AGRMT_ISSUE_DATE_ID"),
            F.col("a12.TRAN_SRC_REASON"),
            F.col("a11.TRAN_EFF_DATE_ID"),
            policy_age.alias("Policy Age"),
        )
        .agg(
            F.max(F.substring(F.col("a15.AGRMT_SRC_ID"), 1, 10)).alias("Policy No"),
            F.max(owner_name).alias("Owner Name"),
            F.sum(amount).alias("Total Amount"),
        )
        .filter(F.col("Total Amount") > 5000)
    )

    # NOTE(review): the previous sort key "Policy Date" was never produced by
    # the query; the agreement issue date is presumed to be what was meant.
    ordered_df = grouped_df.orderBy("Policy No", "AGRMT_ISSUE_DATE_ID")

    # Replace the Unity Catalog target table with the fresh result.
    # NOTE(review): output column names contain spaces; Delta may reject them
    # unless column mapping is enabled on the target - confirm.
    spark.sql("DROP TABLE IF EXISTS catalog.target_db.HD714068")
    ordered_df.write.format("delta").mode("overwrite").saveAsTable("catalog.target_db.HD714068")

    logger.info("ETL process completed successfully.")

except Exception as e:
    # Log with traceback, then re-raise so the job is reported as failed
    # instead of silently finishing with no output table.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
