In [17]:
import os
import sys

# The repository root is taken to be the parent of the current working
# directory, resolved to an absolute, normalised path.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)

# Put the root at the front of the import search path, but only once so that
# re-running this cell does not keep growing sys.path.
root_already_registered = project_root in sys.path
if not root_already_registered:
    sys.path.insert(0, project_root)

print("Project root:", project_root)


Project root: /Users/ruchita/data_engineering_projects/data_engineering


In [18]:
from src.utils import create_spark_session, get_logger, write_df, get_path
from src.transformations import enrich_transactions

# Spark settings are read from a YAML file kept under <project_root>/configs.
spark_config_path = os.path.join(project_root, "configs", "spark_config.yaml")
spark = create_spark_session(spark_config_path)

# One named logger is shared by every cell below.
logger = get_logger("transformations")
logger.info("Spark session started")


2026-02-05 02:24:18,557 - INFO - transformations - Spark session started


In [19]:
# Location of the cleaned silver-layer transactions, as resolved by get_path.
processed_data_dir = os.path.join(project_root, "data", "processed")
silver_txn_path = get_path(processed_data_dir, "silver", "transactions_clean")

txn_df = spark.read.parquet(silver_txn_path)

# count() triggers a Spark job; hold the result so the log line reads plainly.
silver_record_count = txn_df.count()
logger.info(f"Silver clean records count: {silver_record_count}")
txn_df.show(5)


2026-02-05 02:24:18,636 - INFO - transformations - Silver clean records count: 20000


+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|transaction_id|account_id|customer_id|  amount|transaction_type|merchant|country|transaction_timestamp| status|
+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|   TXN00000001| ACC001179|  CUST00316|20087.89|             ATM|  Amazon|    UAE|  2025-12-19 00:40:38|SUCCESS|
|   TXN00000002| ACC001247|  CUST00910|35292.43|            CARD|    Noon|     SG|  2026-01-03 00:40:38| FAILED|
|   TXN00000003| ACC000015|  CUST00654| 6304.01|             ATM|  Careem|     UK|  2025-04-11 00:40:38|SUCCESS|
|   TXN00000004| ACC000879|  CUST00160|23516.83|          ONLINE|  Careem|     SG|  2025-06-04 00:40:38| FAILED|
|   TXN00000005| ACC000078|  CUST00315| 8049.74|            CARD| Talabat|    UAE|  2025-04-02 00:40:38|SUCCESS|
+--------------+----------+-----------+--------+----------------+--------+-------+--------------

In [20]:
# Both dimension files live side by side in the raw data folder.
raw_data_dir = os.path.join(project_root, "data", "raw")
customers_path = os.path.join(raw_data_dir, "customers.csv")
accounts_path = os.path.join(raw_data_dir, "accounts.csv")

# Same reader settings for both files: first row is the header, and Spark
# infers the column types from the data.
csv_read_options = {"header": True, "inferSchema": True}
customers_df = spark.read.csv(customers_path, **csv_read_options)
accounts_df = spark.read.csv(accounts_path, **csv_read_options)

customer_total = customers_df.count()
logger.info(f"Customers count: {customer_total}")
account_total = accounts_df.count()
logger.info(f"Accounts count: {account_total}")


2026-02-05 02:24:18,811 - INFO - transformations - Customers count: 1000
2026-02-05 02:24:18,846 - INFO - transformations - Accounts count: 1500


In [21]:
# Preview the first five rows of each dimension table, customers first.
for dimension_df in (customers_df, accounts_df):
    dimension_df.show(5)


+-----------+----------------+----------+-------+---------+
|customer_id|   customer_name|       dob|country|  segment|
+-----------+----------------+----------+-------+---------+
|  CUST00001|      Sarah Lamb|1957-12-14|    UAE|Corporate|
|  CUST00002|     Anita Lopez|1988-08-11|    UAE|Corporate|
|  CUST00003|Thomas Castaneda|1961-10-10|    UAE|Corporate|
|  CUST00004|      Leah Burch|1996-05-17|     IN|Corporate|
|  CUST00005|    Tina Wiggins|1995-06-18|     SG|Corporate|
+-----------+----------------+----------+-------+---------+
only showing top 5 rows

+----------+-----------+------------+---------+------------+
|account_id|customer_id|account_type|  balance|created_date|
+----------+-----------+------------+---------+------------+
| ACC000001|  CUST00157|     Current|127041.96|  2020-08-21|
| ACC000002|  CUST00559|      Credit| 66310.09|  2021-06-29|
| ACC000003|  CUST00293|      Credit|180277.47|  2023-06-01|
| ACC000004|  CUST00011|     Current|109624.66|  2025-02-21|
| ACC000

In [22]:
# Combine the transactions with the customer and account tables using the
# project's enrichment routine.
enriched_df = enrich_transactions(txn_df, customers_df, accounts_df)

# Log the resulting row count, then preview a few enriched rows.
enriched_record_count = enriched_df.count()
logger.info(f"Enriched records count: {enriched_record_count}")
enriched_df.show(5)


2026-02-05 02:24:19,010 - INFO - transformations - Enriched records count: 20000


+----------+-----------+--------------+--------+----------------+--------+-------------------+---------------------+-------+--------------------+----------+----------------+---------+------------+---------+------------+----------+-----------------+
|account_id|customer_id|transaction_id|  amount|transaction_type|merchant|transaction_country|transaction_timestamp| status|       customer_name|       dob|customer_country|  segment|account_type|  balance|created_date|  txn_date|is_high_value_txn|
+----------+-----------+--------------+--------+----------------+--------+-------------------+---------------------+-------+--------------------+----------+----------------+---------+------------+---------+------------+----------+-----------------+
| ACC001179|  CUST00316|   TXN00000001|20087.89|             ATM|  Amazon|                UAE|  2025-12-19 00:40:38|SUCCESS|          Craig Dunn|1996-05-07|              US|Corporate|     Savings|174771.68|  2024-09-03|2025-12-19|                1|
| AC

In [23]:
# A narrower view of the enriched frame so the preview fits on screen.
preview_columns = [
    "transaction_id",
    "customer_id",
    "account_id",
    "amount",
    "txn_date",
    "is_high_value_txn",
    "segment",
    "account_type",
]
enriched_df.select(*preview_columns).show(10)



+--------------+-----------+----------+--------+----------+-----------------+---------+------------+
|transaction_id|customer_id|account_id|  amount|  txn_date|is_high_value_txn|  segment|account_type|
+--------------+-----------+----------+--------+----------+-----------------+---------+------------+
|   TXN00000001|  CUST00316| ACC001179|20087.89|2025-12-19|                1|Corporate|     Savings|
|   TXN00000002|  CUST00910| ACC001247|35292.43|2026-01-03|                1|Corporate|     Savings|
|   TXN00000003|  CUST00654| ACC000015| 6304.01|2025-04-11|                0|   Retail|      Credit|
|   TXN00000004|  CUST00160| ACC000879|23516.83|2025-06-04|                1|   Retail|     Current|
|   TXN00000005|  CUST00315| ACC000078| 8049.74|2025-04-02|                0|   Retail|     Current|
|   TXN00000006|  CUST00412| ACC000021| 1181.21|2025-07-04|                0|   Retail|     Current|
|   TXN00000007|  CUST00916| ACC001305|49637.12|2025-10-16|                1| Priority|    

In [None]:
# Destination for the enriched silver-layer dataset, as resolved by get_path.
silver_base_dir = os.path.join(project_root, "data", "processed")
silver_enriched_path = get_path(silver_base_dir, "silver", "transactions_enriched")

# mode="overwrite" is passed through to write_df so the cell can be re-run.
write_df(enriched_df, silver_enriched_path, mode="overwrite")

logger.info("Silver enriched transactions written successfully")




2026-02-05 02:25:10,038 - INFO - transformations - Silver enriched transactions written successfully


In [25]:
# Read the freshly written dataset back to confirm what actually landed on disk.
check_df = spark.read.parquet(silver_enriched_path)

final_enriched_count = check_df.count()
print("Final enriched count:", final_enriched_count)
check_df.printSchema()



Final enriched count: 20000
root
 |-- account_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- transaction_country: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- customer_country: string (nullable = true)
 |-- segment: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- balance: double (nullable = true)
 |-- created_date: date (nullable = true)
 |-- txn_date: date (nullable = true)
 |-- is_high_value_txn: integer (nullable = true)

