In [0]:
# Announce what this notebook is about to do before any DDL is executed.
startup_banner = """
╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║        🔧 PHASE 4 AUDIT INFRASTRUCTURE INITIALIZATION         ║
║                                                               ║
║        This will create 6 audit tables in the audit schema    ║
║        for tracking pipeline execution, errors, and quality   ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝
"""
print(startup_banner)



╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║        🔧 PHASE 4 AUDIT INFRASTRUCTURE INITIALIZATION         ║
║                                                               ║
║        This will create 6 audit tables in the audit schema    ║
║        for tracking pipeline execution, errors, and quality   ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝



In [0]:
# STEP 1: Set Configuration
#
# Every later cell builds its fully-qualified object names from these two
# values, so this cell must run first.
catalog_name, schema_name = "fintech_analytics", "audit"

# Echo the target location so the run log records where objects are created.
print(
    "Configuration:",
    f"  Catalog: {catalog_name}",
    f"  Schema: {schema_name}",
    sep="\n",
)


Configuration:
  Catalog: fintech_analytics
  Schema: audit


In [0]:
# STEP 2: Create Schema
#
# IF NOT EXISTS makes this cell safe to re-run: an existing schema is left
# untouched and the statement succeeds either way.
audit_schema = f"{catalog_name}.{schema_name}"

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {audit_schema}")
print(f"✅ Schema created: {audit_schema}")

✅ Schema created: fintech_analytics.audit


In [0]:
# STEP 3: Create TABLE 1 - Pipeline Runs (high-level tracking)
#
# One Delta table holding the run-level record: identifiers, task counters,
# status, timing and an error summary. Partitioned by execution_date.
# IF NOT EXISTS means an already-existing table is not modified.
pipeline_runs_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.pipeline_runs (
    run_id STRING NOT NULL,
    execution_date DATE NOT NULL,
    environment STRING,
    run_mode STRING,
    total_tasks INT,
    successful_tasks INT,
    failed_tasks INT,
    skipped_tasks INT,
    status STRING,
    start_time TIMESTAMP,
    end_time TIMESTAMP,
    duration_seconds BIGINT,
    error_summary STRING,
    created_at TIMESTAMP,
    updated_at TIMESTAMP
)
USING DELTA
PARTITIONED BY (execution_date)
"""
spark.sql(pipeline_runs_ddl)

print(
    "✅ TABLE 1 Created: pipeline_runs",
    "   Purpose: High-level tracking of each pipeline execution",
    "   Columns: run_id, execution_date, status, start_time, duration, etc.",
    sep="\n",
)

✅ TABLE 1 Created: pipeline_runs
   Purpose: High-level tracking of each pipeline execution
   Columns: run_id, execution_date, status, start_time, duration, etc.


In [0]:
# STEP 4: Create TABLE 2 - Task Runs (detailed task tracking)
#
# One Delta table holding a row per task attempt: identifiers, ordering,
# status, timing, row counters and error details.
#
# The table is intentionally NOT partitioned. run_id appears to identify a
# single pipeline execution, so partitioning on it would create a new partition
# holding only a handful of rows on every run; Databricks guidance is to avoid
# partitioning on high-cardinality columns and on small tables in general.
#
# NOTE: IF NOT EXISTS does not alter a table that already exists, so a
# previously created, run_id-partitioned task_runs table keeps its old layout
# until it is dropped and recreated. Writers must not pass partitionBy("run_id")
# when appending to the unpartitioned table.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.task_runs (
    run_id STRING NOT NULL,
    task_name STRING NOT NULL,
    task_type STRING,
    execution_order INT,
    attempt INT,
    status STRING,
    start_time TIMESTAMP,
    end_time TIMESTAMP,
    duration_seconds BIGINT,
    rows_processed BIGINT,
    rows_inserted BIGINT,
    rows_updated BIGINT,
    error_message STRING,
    error_type STRING,
    stack_trace STRING,
    created_at TIMESTAMP
)
USING DELTA
""")

print("✅ TABLE 2 Created: task_runs")
print("   Purpose: Detailed tracking of each task (silver, gold, quality, optimize)")
print("   Columns: task_name, status, attempt, duration_seconds, error_message, stack_trace")

✅ TABLE 2 Created: task_runs
   Purpose: Detailed tracking of each task (silver, gold, quality, optimize)
   Columns: task_name, status, attempt, duration_seconds, error_message, stack_trace


In [0]:
# STEP 5: Create TABLE 3 - Data Quality Checks (quality validation results)
#
# One Delta table holding a row per executed check: which table and check,
# the expected vs. actual value, and the resulting status/message.
# Partitioned by execution_date. IF NOT EXISTS leaves an existing table as-is.
data_quality_checks_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.data_quality_checks (
    run_id STRING NOT NULL,
    table_name STRING NOT NULL,
    check_name STRING NOT NULL,
    check_type STRING,
    expected_value STRING,
    actual_value STRING,
    status STRING,
    message STRING,
    execution_date DATE,
    created_at TIMESTAMP
)
USING DELTA
PARTITIONED BY (execution_date)
"""
spark.sql(data_quality_checks_ddl)

print(
    "✅ TABLE 3 Created: data_quality_checks",
    "   Purpose: Results of all data quality validations",
    "   Columns: table_name, check_name, status, expected_value, actual_value, message",
    sep="\n",
)

✅ TABLE 3 Created: data_quality_checks
   Purpose: Results of all data quality validations
   Columns: table_name, check_name, status, expected_value, actual_value, message


In [0]:
# STEP 6: Create TABLE 4 - Watermarks (incremental processing state)
#
# Unpartitioned Delta table keyed by (source_table, target_table) that records
# the last processed date/timestamp and a row count.
# IF NOT EXISTS leaves an existing table as-is.
watermarks_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.watermarks (
    source_table STRING NOT NULL,
    target_table STRING NOT NULL,
    last_processed_date DATE,
    last_processed_timestamp TIMESTAMP,
    row_count BIGINT,
    updated_at TIMESTAMP
)
USING DELTA
"""
spark.sql(watermarks_ddl)

print(
    "✅ TABLE 4 Created: watermarks",
    "   Purpose: Track last processed date for incremental pipelines",
    "   Columns: source_table, target_table, last_processed_date, row_count",
    sep="\n",
)

✅ TABLE 4 Created: watermarks
   Purpose: Track last processed date for incremental pipelines
   Columns: source_table, target_table, last_processed_date, row_count


In [0]:
# STEP 7: Create TABLE 5 - Error Log (detailed error tracking)
#
# Unpartitioned Delta table with one row per error: type, message, traceback,
# free-form context, severity and resolution fields.
# IF NOT EXISTS leaves an existing table as-is.
error_log_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.error_log (
    error_id STRING NOT NULL,
    run_id STRING,
    task_name STRING,
    error_type STRING,
    error_message STRING,
    full_traceback STRING,
    context_data STRING,
    severity STRING,
    is_resolved BOOLEAN,
    resolution_notes STRING,
    created_at TIMESTAMP
)
USING DELTA
"""
spark.sql(error_log_ddl)

print(
    "✅ TABLE 5 Created: error_log",
    "   Purpose: Complete error tracking with stack traces",
    "   Columns: error_id, error_type, error_message, full_traceback, severity, is_resolved",
    sep="\n",
)

✅ TABLE 5 Created: error_log
   Purpose: Complete error tracking with stack traces
   Columns: error_id, error_type, error_message, full_traceback, severity, is_resolved


In [0]:
# STEP 8: Create TABLE 6 - Repair History (recovery tracking)
#
# Unpartitioned Delta table with one row per repair attempt, linked back to the
# original run and task via original_run_id / original_task_name.
# IF NOT EXISTS leaves an existing table as-is.
repair_history_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.repair_history (
    repair_id STRING NOT NULL,
    original_run_id STRING NOT NULL,
    original_task_name STRING NOT NULL,
    repair_type STRING,
    repair_status STRING,
    rows_reprocessed BIGINT,
    attempted_at TIMESTAMP,
    completed_at TIMESTAMP,
    notes STRING,
    created_at TIMESTAMP
)
USING DELTA
"""
spark.sql(repair_history_ddl)

print(
    "✅ TABLE 6 Created: repair_history",
    "   Purpose: Track recovery/repair attempts when pipelines fail",
    "   Columns: repair_id, original_run_id, repair_type, repair_status, rows_reprocessed",
    sep="\n",
)

✅ TABLE 6 Created: repair_history
   Purpose: Track recovery/repair attempts when pipelines fail
   Columns: repair_id, original_run_id, repair_type, repair_status, rows_reprocessed


In [0]:
# STEP 9: Verify All Tables Created
#
# Lists the tables registered in the audit schema and checks that all six
# audit tables are present. The success banner is only printed when the check
# passes; otherwise the cell fails loudly so a broken setup is not mistaken
# for a good one.

# The six tables this notebook is responsible for creating.
expected_tables = {
    "pipeline_runs",
    "task_runs",
    "data_quality_checks",
    "watermarks",
    "error_log",
    "repair_history",
}

verification = spark.sql(f"""
SELECT
    table_name,
    table_type,
    comment
FROM {catalog_name}.information_schema.tables
WHERE table_schema = '{schema_name}'
ORDER BY table_name
""")

# The result is at most a handful of rows, so collecting to the driver is cheap.
# Names are lower-cased so the comparison does not depend on catalog casing.
found_tables = {row["table_name"].lower() for row in verification.collect()}
missing_tables = sorted(expected_tables - found_tables)

print("\n" + "="*80)
if missing_tables:
    print(f"❌ VERIFICATION FAILED: Missing tables: {', '.join(missing_tables)}")
    print("="*80)
    display(verification)
    raise RuntimeError(
        f"Audit setup incomplete in {catalog_name}.{schema_name}; "
        f"missing tables: {', '.join(missing_tables)}"
    )

print("✅ VERIFICATION: All Tables Successfully Created")
print("="*80)

display(verification)


✅ VERIFICATION: All Tables Successfully Created


table_name,table_type,comment
data_quality_checks,MANAGED,
error_log,MANAGED,
pipeline_runs,MANAGED,
repair_history,MANAGED,
task_runs,MANAGED,
watermarks,MANAGED,


In [0]:
# STEP 10: Check Table Row Counts
#
# Builds a single result set with one (table_name, row_count) row per audit
# table by UNION ALL-ing a COUNT(*) over each of the six tables.
row_counts_query = f"""
SELECT 
    'pipeline_runs' as table_name, COUNT(*) as row_count FROM {catalog_name}.{schema_name}.pipeline_runs
UNION ALL
SELECT 'task_runs', COUNT(*) FROM {catalog_name}.{schema_name}.task_runs
UNION ALL
SELECT 'data_quality_checks', COUNT(*) FROM {catalog_name}.{schema_name}.data_quality_checks
UNION ALL
SELECT 'watermarks', COUNT(*) FROM {catalog_name}.{schema_name}.watermarks
UNION ALL
SELECT 'error_log', COUNT(*) FROM {catalog_name}.{schema_name}.error_log
UNION ALL
SELECT 'repair_history', COUNT(*) FROM {catalog_name}.{schema_name}.repair_history
"""
row_counts = spark.sql(row_counts_query)

# Leading newline separates this heading from any preceding cell output.
print("\nCurrent Row Counts (should all be 0 initially):")
display(row_counts)




Current Row Counts (should all be 0 initially):


table_name,row_count
pipeline_runs,0
task_runs,0
data_quality_checks,0
watermarks,0
error_log,0
repair_history,0


In [0]:
# STEP 11: Create Sample Data (for testing)
#
# Seeds pipeline_runs with a single test row so the table can be queried
# end to end.
#
# MERGE ... WHEN NOT MATCHED is used instead of a plain INSERT so that
# re-running this cell (or the whole notebook) does not add a duplicate row:
# the row is only written when no row with this run_id exists yet.
#
# created_at / updated_at / duration_seconds are populated explicitly; the
# table defines no defaults, and other cells in this notebook sort on
# created_at, which would otherwise be NULL for this row.

# Sample pipeline run
spark.sql(f"""
MERGE INTO {catalog_name}.{schema_name}.pipeline_runs AS target
USING (
    SELECT
        'TEST_20260220_000000' AS run_id,
        CURRENT_DATE() AS execution_date,
        'dev' AS environment,
        'full' AS run_mode,
        4 AS total_tasks,
        4 AS successful_tasks,
        0 AS failed_tasks,
        0 AS skipped_tasks,
        'SUCCESS' AS status,
        CURRENT_TIMESTAMP() AS start_time,
        CURRENT_TIMESTAMP() AS end_time,
        CAST(0 AS BIGINT) AS duration_seconds,
        CURRENT_TIMESTAMP() AS created_at,
        CURRENT_TIMESTAMP() AS updated_at
) AS source
ON target.run_id = source.run_id
WHEN NOT MATCHED THEN INSERT (
    run_id, execution_date, environment, run_mode, total_tasks,
    successful_tasks, failed_tasks, skipped_tasks, status, start_time, end_time,
    duration_seconds, created_at, updated_at
)
VALUES (
    source.run_id, source.execution_date, source.environment, source.run_mode,
    source.total_tasks, source.successful_tasks, source.failed_tasks,
    source.skipped_tasks, source.status, source.start_time, source.end_time,
    source.duration_seconds, source.created_at, source.updated_at
)
""")

print("✅ Sample data inserted (for testing)")



✅ Sample data inserted (for testing)


In [0]:
# STEP 12: Final Summary
#
# Prints a closing banner followed by a recap of what was created, where it
# lives, and the follow-up steps. Nothing is executed against Spark here.
banner_rule = "=" * 80
print(f"\n{banner_rule}", "🎉 SETUP COMPLETE!", banner_rule, sep="\n")

# The catalog/schema placeholders are filled in so the printed verification
# query can be copied and run as-is.
summary_text = f"""
Audit Infrastructure Initialized Successfully!

CREATED 6 TABLES:
  1. pipeline_runs           - High-level pipeline execution tracking
  2. task_runs              - Detailed task-level execution details
  3. data_quality_checks    - Quality validation results
  4. watermarks             - Incremental processing state
  5. error_log              - Detailed error tracking
  6. repair_history         - Recovery attempt tracking

LOCATION:
  Schema: {catalog_name}.{schema_name}

NEXT STEPS:
  1. Create silver notebook (Step 6)
  2. Create gold notebook (Step 7)
  3. Create quality checks notebook (Step 8)
  4. Create optimization notebook (Step 9)
  5. Create master orchestration notebook (Step 10)
  6. Create Databricks Workflows job (Step 11)
  7. Run first test!

VERIFY SETUP:
  Run this query anytime to check audit tables:
  
  SELECT * FROM {catalog_name}.{schema_name}.pipeline_runs
  ORDER BY created_at DESC
  LIMIT 10;
"""

print(summary_text)



🎉 SETUP COMPLETE!

Audit Infrastructure Initialized Successfully!

CREATED 6 TABLES:
  1. pipeline_runs           - High-level pipeline execution tracking
  2. task_runs              - Detailed task-level execution details
  3. data_quality_checks    - Quality validation results
  4. watermarks             - Incremental processing state
  5. error_log              - Detailed error tracking
  6. repair_history         - Recovery attempt tracking

LOCATION:
  Schema: fintech_analytics.audit

NEXT STEPS:
  1. Create silver notebook (Step 6)
  2. Create gold notebook (Step 7)
  3. Create quality checks notebook (Step 8)
  4. Create optimization notebook (Step 9)
  5. Create master orchestration notebook (Step 10)
  6. Create Databricks Workflows job (Step 11)
  7. Run first test!

VERIFY SETUP:
  Run this query anytime to check audit tables:
  
  SELECT * FROM fintech_analytics.audit.pipeline_runs
  ORDER BY created_at DESC
  LIMIT 10;



In [0]:
# STEP 13: Display Sample Data to Verify
#
# Shows up to five pipeline_runs rows, newest created_at first, restricted to
# the columns that identify a run and its outcome.
sample_data_query = f"""
SELECT 
    run_id,
    execution_date,
    environment,
    status,
    total_tasks,
    successful_tasks
FROM {catalog_name}.{schema_name}.pipeline_runs
ORDER BY created_at DESC
LIMIT 5
"""

print("Sample data from pipeline_runs table:")

sample_data = spark.sql(sample_data_query)

display(sample_data)

# COMMAND ----------

# Closing pointer to the remaining deployment steps and their written guide.
closing_message = """
✅ SETUP NOTEBOOK COMPLETE!

Now proceed to:
  STEP 6: Create & Adapt Silver Notebook
  STEP 7: Create & Adapt Gold Notebook
  STEP 8: Create Quality Checks Notebook
  STEP 9: Create Table Optimization Notebook
  STEP 10: Create Master Orchestration Notebook
  STEP 11: Create Databricks Workflows Job

Refer to DEPLOYMENT_STEP_BY_STEP.md for detailed instructions on each step.
"""
print(closing_message)


Sample data from pipeline_runs table:


run_id,execution_date,environment,status,total_tasks,successful_tasks
TEST_20260220_000000,2026-02-21,dev,SUCCESS,4,4



✅ SETUP NOTEBOOK COMPLETE!

Now proceed to:
  STEP 6: Create & Adapt Silver Notebook
  STEP 7: Create & Adapt Gold Notebook
  STEP 8: Create Quality Checks Notebook
  STEP 9: Create Table Optimization Notebook
  STEP 10: Create Master Orchestration Notebook
  STEP 11: Create Databricks Workflows Job

Refer to DEPLOYMENT_STEP_BY_STEP.md for detailed instructions on each step.

