In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 3, Finished, Available, Finished)

In [2]:
# Load the inputs for the customer metrics build: the current rows
# (is_current = 1) of the customer and merchant dimensions, plus the full
# transaction fact table.
cust_dim = spark.table("l1_customer_dim").where("is_current = 1")
merch_dim = spark.table("l1_merchant_dim").where("is_current = 1")
tran_fact = spark.table("l1_transaction_fact")

# Print a one-row preview of each input, preceded by its label.
# NOTE(review): the customer preview includes name / email / contact columns;
# consider clearing this cell's output before sharing the notebook.
for heading, frame in (
    ("customer_dim", cust_dim),
    ("\n\nmerchant_dim", merch_dim),
    ("\n\ntran_fact", tran_fact),
):
    print(heading)
    frame.show(1)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 4, Finished, Available, Finished)

customer_dim
+--------------------+----------+------------+------+---+----------+-------+-------+--------------------+----------------+--------------------+---------------+----------+----------+----------+
|         CUSTOMER_ID|FIRST_NAME|   LAST_NAME|GENDER|AGE|AGE_BUCKET|   CITY|  STATE|      CUSTOMER_EMAIL|CUSTOMER_CONTACT|CUSTOMER_COUNTRYCODE|ETL_INSERT_DATE|VALID_FROM|  VALID_TO|IS_CURRENT|
+--------------------+----------+------------+------+---+----------+-------+-------+--------------------+----------------+--------------------+---------------+----------+----------+----------+
|0028d680-dcd5-487...|    HIMMAT|RAMAKRISHNAN|FEMALE| 51|MIDDLE_AGE|AMBASSA|TRIPURA|sanaratta@hotmail...|     09110430187|                INTL|     2026-01-15|2026-01-15|2999-12-31|         1|
+--------------------+----------+------------+------+---+----------+-------+-------+--------------------+----------------+--------------------+---------------+----------+----------+----------+
only showing top 1 row

##### CUSTOMER TRANSACTION METRICS TABLE

In [3]:
# Pair every transaction with its current customer record. The inner join
# drops transactions whose customer_id has no match in cust_dim.
tran_cust = (
    tran_fact.alias("tran")
    .join(
        cust_dim.alias("cust"),
        on=F.col("tran.customer_id") == F.col("cust.customer_id"),
        how="inner",
    )
    .select(
        "tran.customer_id",
        "transaction_id",
        "tran.transaction_timestamp",
        "tran.transaction_amount",
        "transaction_currency",
        "is_fraud",
        "transaction_city",
        "transaction_state",
        # Unqualified: presumably these resolve to the customer dimension's
        # CITY / STATE columns - verify tran_fact has no columns of the same name.
        "city",
        "state",
    )
)

# Mark the joined frame for caching; the later per-customer aggregations
# both read from it.
tran_cust.cache()
tran_cust.show(2)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 5, Finished, Available, Finished)

+--------------------+--------------------+---------------------+------------------+--------------------+--------+----------------+-----------------+-----------+---------+
|         customer_id|      transaction_id|transaction_timestamp|transaction_amount|transaction_currency|is_fraud|transaction_city|transaction_state|       city|    state|
+--------------------+--------------------+---------------------+------------------+--------------------+--------+----------------+-----------------+-----------+---------+
|e0fcae7a-6450-47b...|b9caf777-5d72-4e3...|  2025-01-29 15:20:18|          90720.56|                 INR|       0|            TURA|        MEGHALAYA|       TURA|MEGHALAYA|
|869c7035-e84c-48b...|12558fac-16b9-4b5...|  2025-01-28 08:22:32|          39778.17|                 INR|       0|     SOUTH DELHI|            DELHI|SOUTH DELHI|    DELHI|
+--------------------+--------------------+---------------------+------------------+--------------------+--------+----------------+---------

In [4]:
# Per-customer amount metrics, restricted to INR transactions.
inr_metric_exprs = [
    F.count("transaction_id").alias("NO_TXN"),
    F.sum("transaction_amount").alias("TOT_TRAN_AMT"),
    F.avg("Transaction_Amount").alias("AVG_TRAN_AMT"),
    F.min("Transaction_Amount").alias("MIN_TRAN_AMT"),
    F.max("Transaction_Amount").alias("MAX_TRAN_AMT"),
    F.max("Transaction_Timestamp").alias("LAST_TXN_TIMESTMP"),
]

tran_cust_met1 = (
    tran_cust
    .where(F.col("Transaction_currency") == "INR")
    .groupBy("CUSTOMER_ID")
    .agg(*inr_metric_exprs)
)

tran_cust_met1.show(5)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 6, Finished, Available, Finished)

+--------------------+------+------------+------------+------------+------------+-------------------+
|         CUSTOMER_ID|NO_TXN|TOT_TRAN_AMT|AVG_TRAN_AMT|MIN_TRAN_AMT|MAX_TRAN_AMT|  LAST_TXN_TIMESTMP|
+--------------------+------+------------+------------+------------+------------+-------------------+
|25aa2625-aabb-4ab...|     1|    48509.29|    48509.29|    48509.29|    48509.29|2025-01-28 18:56:11|
|7498c0e7-cd4e-422...|     1|    48021.97|    48021.97|    48021.97|    48021.97|2025-01-26 19:12:58|
|422cc5c7-f315-484...|     1|    75516.54|    75516.54|    75516.54|    75516.54|2025-01-13 17:42:05|
|1a565567-632b-4db...|     1|    10278.73|    10278.73|    10278.73|    10278.73|2025-01-19 13:42:49|
|da73bfac-a786-413...|     1|    37322.22|    37322.22|    37322.22|    37322.22|2025-01-06 13:30:44|
+--------------------+------+------------+------------+------------+------------+-------------------+
only showing top 5 rows



In [5]:
# Per-customer fraud rate (percentage of transactions flagged is_fraud) and a
# risk bucket derived from it.
# NOTE(review): no currency filter is applied here, whereas the amount metrics
# cell filters to INR - confirm the differing scope is intended.

# Percentage of fraudulent transactions, rounded to 2 decimals.
fraud_rate_expr = F.round(F.col("fraud_txn") * 100 / F.col("total_txn"), 2)

# Buckets: >= 33 -> HIGH, any other non-zero rate -> MEDIUM, else LOW.
# The second branch is only reached when the first did not match, so an
# explicit "< 33" guard is unnecessary.
# NOTE(review): the bucket is computed from the already-rounded rate.
risk_category_expr = (
    F.when(F.col("fraud_rate") >= 33, "HIGH")
    .when(F.col("fraud_rate") != 0, "MEDIUM")
    .otherwise("LOW")
)

tran_cust_fraudmet = (
    tran_cust
    .groupBy("CUSTOMER_ID")
    .agg(
        F.count("transaction_id").alias("TOTAL_TXN"),
        F.sum("IS_FRAUD").alias("FRAUD_TXN"),
    )
    .withColumn("FRAUD_RATE", fraud_rate_expr)
    .withColumn("RISK_CATEGORY", risk_category_expr)
    .select("customer_id", "fraud_rate", "risk_category")
)

tran_cust_fraudmet.show(5)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 7, Finished, Available, Finished)

+--------------------+----------+-------------+
|         customer_id|fraud_rate|risk_category|
+--------------------+----------+-------------+
|25aa2625-aabb-4ab...|       0.0|          LOW|
|7498c0e7-cd4e-422...|       0.0|          LOW|
|422cc5c7-f315-484...|       0.0|          LOW|
|1a565567-632b-4db...|       0.0|          LOW|
|da73bfac-a786-413...|       0.0|          LOW|
+--------------------+----------+-------------+
only showing top 5 rows



In [6]:
# Combine the amount metrics (pt1) with the fraud metrics (pt2) on customer_id
# and stamp each row with the date of this run.
# The inner join keeps only customers present in both inputs, so a customer
# with no row in tran_cust_met1 does not appear in the result.
tran_cust_agg = (
    tran_cust_met1.alias("pt1")
    .join(
        tran_cust_fraudmet.alias("pt2"),
        on=F.col("pt1.customer_id") == F.col("pt2.customer_id"),
        how='inner',
    )
    .select("pt1.*", "pt2.FRAUD_RATE", "pt2.RISK_CATEGORY")
    .withColumn("LAST_UPDATED_ON", F.current_date())
)

tran_cust_agg.show(5)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 8, Finished, Available, Finished)

+--------------------+------+------------+------------+------------+------------+-------------------+----------+-------------+---------------+
|         CUSTOMER_ID|NO_TXN|TOT_TRAN_AMT|AVG_TRAN_AMT|MIN_TRAN_AMT|MAX_TRAN_AMT|  LAST_TXN_TIMESTMP|FRAUD_RATE|RISK_CATEGORY|LAST_UPDATED_ON|
+--------------------+------+------------+------------+------------+------------+-------------------+----------+-------------+---------------+
|25aa2625-aabb-4ab...|     1|    48509.29|    48509.29|    48509.29|    48509.29|2025-01-28 18:56:11|       0.0|          LOW|     2026-01-19|
|7498c0e7-cd4e-422...|     1|    48021.97|    48021.97|    48021.97|    48021.97|2025-01-26 19:12:58|       0.0|          LOW|     2026-01-19|
|422cc5c7-f315-484...|     1|    75516.54|    75516.54|    75516.54|    75516.54|2025-01-13 17:42:05|       0.0|          LOW|     2026-01-19|
|1a565567-632b-4db...|     1|    10278.73|    10278.73|    10278.73|    10278.73|2025-01-19 13:42:49|       0.0|          LOW|     2026-01-19|

In [7]:
# Persist the aggregated customer metrics as a Delta table, replacing any
# existing contents and partitioning the data by risk category.
tran_cust_agg.write.saveAsTable(
    'l2_customer_aggregated_metrics',
    format='delta',
    mode='overwrite',
    partitionBy='risk_category',
)

StatementMeta(, 6d9b3db5-4a2a-4924-8f6e-cdeb9b3c4f99, 9, Finished, Available, Finished)