In [1]:
import os
import sys

# The notebook is expected to run from a sub-directory of the project, so the
# project root is taken to be the parent of the current working directory.
parent_dir = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_dir)

# Put the root at the front of sys.path (once) so that the `src` package
# imported in the next cell can be resolved.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

print("Project root:", project_root)


Project root: /Users/ruchita/data_engineering_projects/data_engineering


In [2]:
from src.utils import create_spark_session, get_logger, write_df, get_path
from src.aggregations import daily_customer_spend

# Build the Spark session from the YAML config kept under <project_root>/configs.
spark_config_path = os.path.join(project_root, "configs", "spark_config.yaml")
spark = create_spark_session(spark_config_path)

# Logger used by every later cell of this notebook.
logger = get_logger("gold-aggregations")
logger.info("Spark session started")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/05 02:32:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/05 02:32:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/05 02:32:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/05 02:32:32 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
2026-02-05 02:32:32,561 - INFO - gold-aggregations - Spark session started


In [3]:
# Resolve the enriched Silver dataset location via the project's get_path helper
# (base directory, layer name, dataset name).
processed_root = os.path.join(project_root, "data", "processed")
silver_enriched_path = get_path(processed_root, "silver", "transactions_enriched")

silver_df = spark.read.parquet(silver_enriched_path)

# Log the input size, then preview a handful of rows as a sanity check.
silver_count = silver_df.count()
logger.info(f"Enriched Silver record count: {silver_count}")
silver_df.show(5)


2026-02-05 02:32:46,396 - INFO - gold-aggregations - Enriched Silver record count: 20000


+----------+-----------+--------------+--------+----------------+--------+-------------------+---------------------+-------+--------------------+----------+----------------+---------+------------+---------+------------+----------+-----------------+
|account_id|customer_id|transaction_id|  amount|transaction_type|merchant|transaction_country|transaction_timestamp| status|       customer_name|       dob|customer_country|  segment|account_type|  balance|created_date|  txn_date|is_high_value_txn|
+----------+-----------+--------------+--------+----------------+--------+-------------------+---------------------+-------+--------------------+----------+----------------+---------+------------+---------+------------+----------+-----------------+
| ACC001179|  CUST00316|   TXN00000001|20087.89|             ATM|  Amazon|                UAE|  2025-12-19 00:40:38|SUCCESS|          Craig Dunn|1996-05-07|              US|Corporate|     Savings|174771.68|  2024-09-03|2025-12-19|                1|
| AC

                                                                                

In [4]:
# Columns that must be present on the enriched Silver frame before aggregating.
required_cols = ["customer_id", "txn_date", "amount", "is_high_value_txn"]

# Keep the missing names in the order of required_cols so the failure message
# is stable; stop the notebook here if anything is absent.
available_cols = set(silver_df.columns)
missing_cols = [name for name in required_cols if name not in available_cols]
assert not missing_cols, f"Missing columns: {missing_cols}"


In [6]:
# Run the project's daily_customer_spend aggregation over the Silver frame.
gold_customer_daily_df = daily_customer_spend(silver_df)

# Log the number of aggregated rows, then preview the result.
gold_daily_count = gold_customer_daily_df.count()
logger.info(f"Gold daily customer records: {gold_daily_count}")

gold_customer_daily_df.show(10)


2026-02-05 02:34:12,781 - INFO - gold-aggregations - Gold daily customer records: 19123


+-----------+----------+-----------+---------+
|customer_id|  txn_date|daily_spend|txn_count|
+-----------+----------+-----------+---------+
|  CUST00612|2025-04-30|    38529.4|        1|
|  CUST00655|2025-11-06|    22884.8|        1|
|  CUST00108|2025-09-20|    23309.4|        1|
|  CUST00616|2025-09-05|   77916.75|        2|
|  CUST00461|2025-12-10|   18299.11|        1|
|  CUST00143|2025-05-01|   39359.12|        1|
|  CUST00068|2025-11-05|   41467.12|        1|
|  CUST00067|2025-08-01|   23576.62|        1|
|  CUST00823|2025-08-17|    27238.3|        1|
|  CUST00310|2025-04-11|   48518.05|        1|
+-----------+----------+-----------+---------+
only showing top 10 rows



In [7]:
# Repartition by the same column the write cell passes as partition_col, so
# rows sharing a txn_date are grouped together before the write.
partition_key = "txn_date"
gold_customer_daily_df = gold_customer_daily_df.repartition(partition_key)


In [8]:
# Resolve the Gold output location via the project's get_path helper
# (base directory, layer name, dataset name).
processed_root = os.path.join(project_root, "data", "processed")
gold_path = get_path(processed_root, "gold", "daily_customer_spend")

# Persist the aggregate through the project's write_df helper, requesting
# overwrite mode and txn_date as the partition column.
write_df(gold_customer_daily_df, gold_path, mode="overwrite", partition_col="txn_date")

logger.info("Gold daily_customer_spend table written successfully")


2026-02-05 02:35:20,917 - INFO - gold-aggregations - Gold daily_customer_spend table written successfully


In [9]:
# Read the Gold table back from disk to confirm what was actually written.
gold_check_df = spark.read.parquet(gold_path)

# Row count, schema and a small sample of the persisted data.
gold_check_count = gold_check_df.count()
print("Gold record count:", gold_check_count)
gold_check_df.printSchema()
gold_check_df.show(10)


Gold record count: 19123
root
 |-- customer_id: string (nullable = true)
 |-- daily_spend: double (nullable = true)
 |-- txn_count: long (nullable = true)
 |-- txn_date: date (nullable = true)

+-----------+-----------+---------+----------+
|customer_id|daily_spend|txn_count|  txn_date|
+-----------+-----------+---------+----------+
|  CUST00043|   20539.91|        1|2025-07-09|
|  CUST00428|    5541.07|        1|2025-07-09|
|  CUST00667|   41856.86|        1|2025-07-09|
|  CUST00287|   15092.35|        1|2025-07-09|
|  CUST00617|    1547.41|        1|2025-07-09|
|  CUST00871|   20069.38|        1|2025-07-09|
|  CUST00291|   45070.54|        1|2025-07-09|
|  CUST00251|   18756.81|        1|2025-07-09|
|  CUST00237|    7980.04|        1|2025-07-09|
|  CUST00381|   47561.85|        1|2025-07-09|
+-----------+-----------+---------+----------+
only showing top 10 rows

