# 01 – Data Ingestion (Bronze Layer)
This notebook ingests raw banking data and stores it in the Bronze layer.


In [1]:
import os
import sys


def find_project_root(start, marker="src"):
    """Locate the project root by searching upward from ``start``.

    Args:
        start: Directory to begin the search from (typically the kernel's
            working directory).
        marker: Name of a sub-directory whose presence identifies the project
            root. Defaults to ``"src"`` because that is the package the
            notebook imports from.

    Returns:
        Absolute path of the nearest directory at or above ``start`` that
        contains a ``marker`` directory. If no ancestor contains it, the
        parent of ``start`` is returned, which is what this cell always used
        before (notebook assumed to run from ``notebooks/``).
    """
    start = os.path.abspath(start)
    candidate = start
    while True:
        if os.path.isdir(os.path.join(candidate, marker)):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # dirname() of the filesystem root is the root itself: stop here.
            break
        candidate = parent
    # Nothing found: fall back to "one level up" so the previous behaviour holds.
    return os.path.abspath(os.path.join(start, os.pardir))


# Works whether the kernel was started in notebooks/ or in the project root.
project_root = find_project_root(os.getcwd())

# add project root to PYTHONPATH (once, even if the cell is re-run)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("Project root added:", project_root)


Project root added: /Users/ruchita/data_engineering_projects/data_engineering


In [2]:
from src.ingestion import read_transactions
from src.utils import create_spark_session, get_logger, write_df, get_path

In [3]:
# Debug aid: display the interpreter search path to confirm the project root
# is on it. `sys` is already imported in the first cell, so no re-import here.
sys.path


['/Users/ruchita/data_engineering_projects/data_engineering',
 '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python39.zip',
 '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9',
 '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ruchita/Library/Python/3.9/lib/python/site-packages',
 '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages']

In [4]:
# Sanity check: list the entries directly under the resolved project root.
# `os` is already imported in the first cell, so no re-import here.
print(os.listdir(project_root))


['.DS_Store', 'requirements.txt', 'dummy_data_generator.py', 'README.md', 'configs', '.git', 'data', 'notebooks', 'src']


In [6]:
# Resolve every input/output location up front so the paths are easy to audit.
config_path = os.path.join(project_root, "configs", "spark_config.yaml")
raw_txn_path = os.path.join(project_root, "data", "raw", "transactions.csv")
processed_dir = os.path.join(project_root, "data", "processed")

# Project helpers (including `read_transactions`) come from the notebook's
# import cell rather than being imported mid-notebook.
spark = create_spark_session(config_path)
logger = get_logger("bronze-ingestion")

# Load the raw transactions CSV through the project's reader.
txn_df = read_transactions(spark, raw_txn_path)
logger.info("Transactions loaded")

# Build the Bronze target location and hand the DataFrame to the project's
# writer. NOTE(review): output format/mode are decided inside `write_df`.
bronze_path = get_path(processed_dir, "bronze", "transactions")
write_df(txn_df, bronze_path)
logger.info("Bronze layer written successfully")


2026-02-05 01:33:38,952 - INFO - bronze-ingestion - Transactions loaded
2026-02-05 01:33:40,477 - INFO - bronze-ingestion - Bronze layer written successfully


In [7]:
# Read back from the exact location the ingestion cell wrote to (`bronze_path`)
# instead of rebuilding the path by hand, so the check cannot silently drift
# from the write target.
# NOTE(review): presumably this equals
# <project_root>/data/processed/bronze/transactions - confirm against get_path.
bronze_df = spark.read.parquet(bronze_path)

# Preview the first five rows as a quick visual check of the written data.
bronze_df.show(5)


+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|transaction_id|account_id|customer_id|  amount|transaction_type|merchant|country|transaction_timestamp| status|
+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+
|   TXN00000001| ACC001179|  CUST00316|20087.89|             ATM|  Amazon|    UAE|  2025-12-19 00:40:38|SUCCESS|
|   TXN00000002| ACC001247|  CUST00910|35292.43|            CARD|    Noon|     SG|  2026-01-03 00:40:38| FAILED|
|   TXN00000003| ACC000015|  CUST00654| 6304.01|             ATM|  Careem|     UK|  2025-04-11 00:40:38|SUCCESS|
|   TXN00000004| ACC000879|  CUST00160|23516.83|          ONLINE|  Careem|     SG|  2025-06-04 00:40:38| FAILED|
|   TXN00000005| ACC000078|  CUST00315| 8049.74|            CARD| Talabat|    UAE|  2025-04-02 00:40:38|SUCCESS|
+--------------+----------+-----------+--------+----------------+--------+-------+---------------------+-------+

In [8]:
# Print each column's name, data type and nullability for the Bronze
# DataFrame that was read back from disk.
bronze_df.printSchema()


root
 |-- transaction_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- country: string (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- status: string (nullable = true)

