# Bronze Layer - Configuration Driven Data Ingestion

This notebook ingests raw CSV data into the bronze layer using a configuration-driven approach.

## Features:
- **Config-driven**: Uses `config/ingestion_config.json`
- **Data Quality Checks**: Null, duplicate, data type, business rules
- **Error Handling**: Logs errors to control tables
- **Audit Trail**: Tracks all operations

## How to Add New Tables:
1. Add CSV file to `data/raw/`
2. Update `config/ingestion_config.json`
3. Re-run this notebook

In [None]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import os
import uuid
import datetime

# Build (or reuse) the Spark session with the Delta Lake extension and catalog enabled
spark = (
    SparkSession.builder
    .appName("Bronze Layer - Config Driven")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

def now():
    """Return the current local time as an ISO 8601 string (naive, no UTC offset)."""
    current = datetime.datetime.now()
    return current.isoformat()

In [None]:
# Load configuration
def load_config():
    """Read the ingestion settings from ``../config/ingestion_config.json``.

    The path is resolved against the current working directory.

    Returns:
        The parsed JSON content, or None when the file cannot be read or
        parsed (the failure is printed instead of being raised).
    """
    config_path = "../config/ingestion_config.json"
    try:
        with open(config_path, 'r') as handle:
            raw_text = handle.read()
        settings = json.loads(raw_text)
        print(f"✅ Configuration loaded from {config_path}")
    except Exception as e:
        print(f"❌ Error loading config: {str(e)}")
        return None
    return settings

# Load configuration
# load_config() returns None on any failure; nothing below can run without
# the settings, so abort the notebook in that case.
config = load_config()
if config is None:
    raise Exception("Failed to load configuration")

# Extract paths from config
# 'bronze_layer' holds the raw-input and bronze-output locations (and the
# per-table settings); 'control_tables' holds the file paths that the
# log_audit / log_error / update_watermark helpers append to.
bronze_config = config['bronze_layer']
RAW_DATA_PATH = bronze_config['raw_data_path']
BRONZE_DATA_PATH = bronze_config['bronze_data_path']
control_config = config['control_tables']

# Create directories
# exist_ok=True keeps re-runs from failing when the folder is already there
os.makedirs(BRONZE_DATA_PATH, exist_ok=True)
print(f"📁 Bronze data path: {BRONZE_DATA_PATH}")

In [None]:
# Control table functions
def log_audit(table, layer, status, row_count, error=None):
    """Append one run record to the audit-log control file.

    The record is written through the csv module so that an error message
    containing commas, quotes or line breaks stays inside a single quoted
    field instead of breaking the row layout. Rows without such characters
    are written exactly as before.

    Args:
        table: Name of the table the record is about.
        layer: Layer label stored with the record (e.g. "bronze").
        status: Outcome label stored with the record.
        row_count: Row count reported for the run.
        error: Optional error text; stored as an empty field when falsy.

    Returns:
        The generated run id (a UUID4 string), or None when the record could
        not be written (a warning is printed; nothing is raised).
    """
    import csv  # local import so this function does not rely on a file-level change

    run_id = str(uuid.uuid4())
    try:
        # The fifth field is left empty, matching the existing file layout.
        fields = [run_id, table, layer, now(), "", status, row_count, error or ""]
        # newline='' lets the csv writer control line endings, as the csv docs require.
        with open(control_config['audit_log'], 'a', newline='') as f:
            csv.writer(f, lineterminator="\n").writerow([str(field) for field in fields])
        return run_id
    except Exception as e:
        # Audit logging is best-effort: never let it abort the ingestion itself.
        print(f"Warning: Could not log audit: {e}")
        return None

def log_error(run_id, table, layer, error_type, error_message, record):
    """Append one error record to the error-records control file.

    The row is written through the csv module because both the error message
    and the JSON-encoded record routinely contain commas, quotes or line
    breaks, which would otherwise split the row into the wrong columns.

    Args:
        run_id: Run id the error belongs to (written as-is, "None" if None).
        table: Name of the table being processed.
        layer: Layer label stored with the record (e.g. "bronze").
        error_type: Short category label for the error.
        error_message: Error text.
        record: JSON-serialisable object describing the offending record.

    Returns:
        None. Failures are printed as a warning and not raised.
    """
    import csv  # local import so this function does not rely on a file-level change

    try:
        fields = [run_id, table, layer, error_type, error_message, json.dumps(record), now()]
        # newline='' lets the csv writer control line endings, as the csv docs require.
        with open(control_config['error_records'], 'a', newline='') as f:
            csv.writer(f, lineterminator="\n").writerow([str(field) for field in fields])
    except Exception as e:
        # Error logging is best-effort: never let it mask the original failure.
        print(f"Warning: Could not log error: {e}")

def update_watermark(table, layer, key, date):
    """Append a ``table,layer,key,date`` line to the watermark control file.

    Values are formatted as-is (a None key is written as the text "None").
    Any failure is printed as a warning and not raised.
    """
    try:
        with open(control_config['watermark'], 'a') as sink:
            entry = ",".join(f"{part}" for part in (table, layer, key, date))
            sink.write(entry + "\n")
    except Exception as e:
        print(f"Warning: Could not update watermark: {e}")

In [None]:
# Data quality validation functions
def validate_null_checks(df, null_columns):
    """Keep only the rows in which every listed column is non-null.

    Args:
        df: DataFrame-like object exposing ``filter(sql_expression)``.
        null_columns: Column names that must not be null; when empty or None
            the input is returned untouched.

    Returns:
        The filtered frame, or ``df`` itself when there is nothing to check.
    """
    if null_columns:
        clauses = (f"{name} IS NOT NULL" for name in null_columns)
        return df.filter(" AND ".join(clauses))
    return df

def validate_duplicate_checks(df, duplicate_columns):
    """Drop rows that repeat the same values in the given key columns.

    Args:
        df: DataFrame-like object exposing ``dropDuplicates(columns)``.
        duplicate_columns: Columns that define a duplicate; when empty or
            None the input is returned untouched.

    Returns:
        The de-duplicated frame, or ``df`` itself when no columns are given.
    """
    return df.dropDuplicates(duplicate_columns) if duplicate_columns else df

def validate_data_types(df, data_type_rules):
    """Cast the configured columns to their target Spark SQL types.

    The loop variable used to be named ``col``, which shadowed
    ``pyspark.sql.functions.col`` and made ``col(col)`` call a string
    (``TypeError``). The column is now addressed through ``df[column_name]``,
    which needs no function import and cannot be shadowed.

    Args:
        df: DataFrame whose columns should be cast.
        data_type_rules: Mapping of column name to one of the supported type
            names ("int", "decimal", "string"). Columns missing from ``df``
            and unknown type names are skipped. When empty or None the input
            is returned untouched.

    Returns:
        The frame with every matching column replaced by its cast version.
    """
    if not data_type_rules:
        return df

    # Config type name -> Spark SQL type string handed to Column.cast().
    spark_types = {
        "int": "int",
        "decimal": "decimal(10,2)",
        "string": "string",
    }

    for column_name, data_type in data_type_rules.items():
        if column_name not in df.columns or not isinstance(data_type, str):
            continue
        target_type = spark_types.get(data_type)
        if target_type is None:
            continue  # unsupported type names leave the column as it is
        df = df.withColumn(column_name, df[column_name].cast(target_type))
    return df

def validate_business_rules(df, business_rules):
    """Keep only the rows that satisfy every configured business rule.

    Args:
        df: DataFrame-like object exposing ``columns`` and
            ``filter(sql_expression)``.
        business_rules: Mapping of column name to the SQL fragment that
            follows it (e.g. ``{"amount": "> 0"}``). Rules for columns that
            are not in ``df`` are ignored.

    Returns:
        The filtered frame, or ``df`` itself when no rule applies.
    """
    if not business_rules:
        return df

    predicates = [
        f"{name} {rule}"
        for name, rule in business_rules.items()
        if name in df.columns
    ]
    if not predicates:
        return df
    return df.filter(" AND ".join(predicates))

In [None]:
# Process all tables from configuration
# bronze_config['tables'] maps each table name to its settings
# ('source_file' plus an optional 'data_quality_rules' section).
tables_config = bronze_config['tables']

for table_name, table_config in tables_config.items():
    print(f"\n🔄 Processing table: {table_name}")
    
    # Each table is handled in its own try/except, so a failure is logged
    # and the loop moves on to the next table instead of stopping the run.
    try:
        # Read CSV file
        source_file = table_config['source_file']
        csv_path = f"{RAW_DATA_PATH}/{source_file}"
        
        print(f"📖 Reading from: {csv_path}")
        # First row is treated as the header; column types are inferred by Spark
        df = spark.read.option("header", "true").option("inferSchema", "true").csv(csv_path)
        
        initial_count = df.count()
        print(f"📊 Initial row count: {initial_count}")
        
        # Apply data quality rules
        # A table without a 'data_quality_rules' section skips every check below
        dq_rules = table_config.get('data_quality_rules', {})
        
        # Null checks
        if 'null_checks' in dq_rules:
            df = validate_null_checks(df, dq_rules['null_checks'])
            print(f"✅ Applied null checks: {dq_rules['null_checks']}")
        
        # Duplicate checks
        if 'duplicate_checks' in dq_rules:
            df = validate_duplicate_checks(df, dq_rules['duplicate_checks'])
            print(f"✅ Applied duplicate checks: {dq_rules['duplicate_checks']}")
        
        # Data type validation
        if 'data_type_validation' in dq_rules:
            df = validate_data_types(df, dq_rules['data_type_validation'])
            print(f"✅ Applied data type validation")
        
        # Business rules
        if 'business_rules' in dq_rules:
            df = validate_business_rules(df, dq_rules['business_rules'])
            print(f"✅ Applied business rules")
        
        # Rows removed by the checks above are simply dropped; only the
        # resulting count is reported and logged.
        final_count = df.count()
        print(f"📊 Final row count: {final_count}")
        
        # Write to bronze layer
        # mode("overwrite") replaces whatever is already stored at this path
        bronze_table_path = f"{BRONZE_DATA_PATH}/{table_name}"
        df.write.mode("overwrite").format("delta").save(bronze_table_path)
        print(f"💾 Written to bronze: {bronze_table_path}")
        
        # Log success
        # The watermark is recorded with no key (None) and the current timestamp
        run_id = log_audit(table_name, "bronze", "SUCCESS", final_count)
        update_watermark(table_name, "bronze", None, now())
        
        print(f"✅ {table_name} processed successfully")
        
    except Exception as e:
        error_msg = str(e)
        print(f"❌ Error processing {table_name}: {error_msg}")
        
        # Log error
        # The failure is recorded with a row count of 0; run_id is None if the
        # audit entry itself could not be written. No offending record is
        # attached, hence the empty dict.
        run_id = log_audit(table_name, "bronze", "FAIL", 0, error_msg)
        log_error(run_id, table_name, "bronze", "INGESTION", error_msg, {})
    
    print("-" * 50)

print("\n🎉 Bronze layer processing completed!")