In [1]:
import os
import sys

# Put the project root on sys.path so that project packages can be imported
# from this notebook. The root is taken to be the parent of the current
# working directory.
# NOTE(review): this relies on the kernel's working directory being a direct
# child of the project root (e.g. a notebooks/ folder) — confirm.
parent_of_cwd = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_of_cwd)

# Skip the insert when the entry is already present (e.g. the cell is re-run).
root_already_on_path = project_root in sys.path
if not root_already_on_path:
    sys.path.insert(0, project_root)

print("Project root:", project_root)


Project root: /Users/ruchita/data_engineering_projects/data_engineering


In [2]:
from src.utils import create_spark_session, get_logger, write_df, get_path
from src.validations import validate_transactions

# Hand the path of configs/spark_config.yaml (under the project root) to the
# project's session factory.
spark_config_path = os.path.join(project_root, "configs", "spark_config.yaml")
spark = create_spark_session(spark_config_path)

# Logger used by the remaining cells of this notebook.
logger = get_logger("data-quality")
logger.info("Spark session started")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/05 01:37:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/05 01:37:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2026-02-05 01:37:47,916 - INFO - data-quality - Spark session started


In [3]:
# Resolve the bronze transactions location from data/processed via get_path.
processed_dir = os.path.join(project_root, "data", "processed")
bronze_path = get_path(processed_dir, "bronze", "transactions")

# Load the bronze parquet data; this frame is the input to validation below.
bronze_df = spark.read.parquet(bronze_path)

# Log the input row count, then preview the first five rows.
logger.info(f"Bronze records count: {bronze_df.count()}")
bronze_df.show(n=5)


2026-02-05 01:38:10,377 - INFO - data-quality - Bronze records count: 20000     


+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|transaction_id|account_id|customer_id|  amount|transaction_type|merchant|country|transaction_timestamp| status|
+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|   TXN00000001| ACC001179|  CUST00316|20087.89|             ATM|  Amazon|    UAE|  2025-12-19 00:40:38|SUCCESS|
|   TXN00000002| ACC001247|  CUST00910|35292.43|            CARD|    Noon|     SG|  2026-01-03 00:40:38| FAILED|
|   TXN00000003| ACC000015|  CUST00654| 6304.01|             ATM|  Careem|     UK|  2025-04-11 00:40:38|SUCCESS|
|   TXN00000004| ACC000879|  CUST00160|23516.83|          ONLINE|  Careem|     SG|  2025-06-04 00:40:38| FAILED|
|   TXN00000005| ACC000078|  CUST00315| 8049.74|            CARD| Talabat|    UAE|  2025-04-02 00:40:38|SUCCESS|
+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+

                                                                                

## Data Quality Rules Applied
- Amount must be greater than 0
- Transaction timestamp must not be NULL
- Customer ID must not be NULL


In [5]:
# Split the bronze rows into those that pass the data-quality rules and those
# that fail them.
valid_df, invalid_df = validate_transactions(bronze_df)

total_count = bronze_df.count()
valid_count = valid_df.count()
invalid_count = invalid_df.count()

logger.info(f"Valid records count: {valid_count}")
logger.info(f"Invalid records count: {invalid_count}")

# Reconcile the split against the input before anything is written downstream.
# In Spark SQL a predicate that evaluates to NULL (e.g. a NULL amount compared
# with 0) is excluded by a filter AND by the filter's negation, so such rows
# can silently vanish from both outputs.
# NOTE(review): assumes validate_transactions is meant to partition its input
# (every row lands in exactly one of the two frames) — confirm.
assert valid_count + invalid_count == total_count, (
    f"Validation dropped or duplicated rows: {valid_count} valid + "
    f"{invalid_count} invalid != {total_count} input"
)


2026-02-05 01:39:41,488 - INFO - data-quality - Valid records count: 20000
2026-02-05 01:39:41,488 - INFO - data-quality - Invalid records count: 0


In [6]:
# Preview up to five rejected rows without truncating long column values.
invalid_df.show(n=5, truncate=False)


+--------------+----------+-----------+------+----------------+--------+-------+---------------------+------+
|transaction_id|account_id|customer_id|amount|transaction_type|merchant|country|transaction_timestamp|status|
+--------------+----------+-----------+------+----------------+--------+-------+---------------------+------+
+--------------+----------+-----------+------+----------------+--------+-------+---------------------+------+



In [7]:
# Destination for the rows that passed validation.
processed_dir = os.path.join(project_root, "data", "processed")
silver_clean_path = get_path(processed_dir, "silver", "transactions_clean")

# mode="overwrite" is forwarded to write_df.
# NOTE(review): presumably this replaces the output of any previous run — verify
# against write_df.
write_df(valid_df, silver_clean_path, mode="overwrite")

logger.info("Silver clean transactions written")


2026-02-05 01:40:09,026 - INFO - data-quality - Silver clean transactions written


In [8]:
# Destination for the rows that failed validation; they are written to their
# own location rather than discarded.
processed_dir = os.path.join(project_root, "data", "processed")
silver_invalid_path = get_path(processed_dir, "silver", "transactions_invalid")

# mode="overwrite" is forwarded to write_df.
# NOTE(review): presumably this replaces the output of any previous run — verify
# against write_df.
write_df(invalid_df, silver_invalid_path, mode="overwrite")

logger.info("Silver invalid transactions written")


2026-02-05 01:40:37,442 - INFO - data-quality - Silver invalid transactions written


In [9]:
# Read both silver outputs back from disk to verify what was actually written.
clean_df = spark.read.parquet(silver_clean_path)
invalid_df_check = spark.read.parquet(silver_invalid_path)

clean_written_count = clean_df.count()
invalid_written_count = invalid_df_check.count()

print("Clean records:", clean_written_count)
print("Invalid records:", invalid_written_count)

# Compare against the counts computed right after validation (valid_count and
# invalid_count) so a partial or stale write fails the notebook instead of
# relying on the reader to compare printed numbers by eye.
assert clean_written_count == valid_count, (
    f"Silver clean row count {clean_written_count} != validated count {valid_count}"
)
assert invalid_written_count == invalid_count, (
    f"Silver invalid row count {invalid_written_count} != rejected count {invalid_count}"
)


Clean records: 20000
Invalid records: 0
