In [0]:
# COMMAND ----------
# MASTER ORCHESTRATION NOTEBOOK (FINAL - NO PARAMETERS!)
# Main entry point for entire SEC Smart Money pipeline
# Silver → Gold → Quality → Optimize

from datetime import datetime, timedelta
import traceback

# Startup banner shown once at the top of the run log. The literal keeps its
# leading and trailing newline so the box is surrounded by blank lines.
_banner = """
╔═══════════════════════════════════════════════════════════════════════════╗
║                                                                           ║
║              🎯 MASTER ORCHESTRATION NOTEBOOK                            ║
║                                                                           ║
║              Coordinating complete SEC Smart Money pipeline              ║
║              Silver → Gold → Quality → Optimize                          ║
║                                                                           ║
╚═══════════════════════════════════════════════════════════════════════════╝
"""
print(_banner)

# COMMAND ----------
# STEP 1: Generate Execution IDs (No parameters!)

# Read the clock exactly once and derive every identifier from that single
# reading. Calling datetime.now() separately for each value lets RUN_ID,
# EXECUTION_DATE and START_TIME disagree when the calls straddle a second
# (or midnight) boundary.
START_TIME = datetime.now()
RUN_ID = START_TIME.strftime("%Y%m%d_%H%M%S")
EXECUTION_DATE = START_TIME.strftime("%Y-%m-%d")

# Fixed run settings: this notebook takes no parameters.
run_mode = "incremental"
environment = "production"

print(f"""
📋 PIPELINE CONFIGURATION:
  RUN_ID:            {RUN_ID}
  EXECUTION_DATE:    {EXECUTION_DATE}
  START_TIME:        {START_TIME.strftime('%H:%M:%S')}
  Run Mode:          {run_mode}
  Environment:       {environment}
""")


╔═══════════════════════════════════════════════════════════════════════════╗
║                                                                           ║
║              🎯 MASTER ORCHESTRATION NOTEBOOK                            ║
║                                                                           ║
║              Coordinating complete SEC Smart Money pipeline              ║
║              Silver → Gold → Quality → Optimize                          ║
║                                                                           ║
╚═══════════════════════════════════════════════════════════════════════════╝


📋 PIPELINE CONFIGURATION:
  RUN_ID:            20260222_035752
  EXECUTION_DATE:    2026-02-22
  START_TIME:        03:57:52
  Run Mode:          incremental
  Environment:       production



In [0]:
# COMMAND ----------
# STEP 2: Initialize Audit Logging

# Progress marker: the audit helper functions are defined next and the
# pipeline-start record is written at the end of this cell.
print("\n🔍 Initializing audit logging...")

def log_pipeline_start():
    """Record the start of this run in fintech_analytics.audit.pipeline_runs.

    Inserts one row with status 'RUNNING', the current timestamp as
    start_time and total_tasks = 4, using the module-level RUN_ID,
    EXECUTION_DATE, environment and run_mode values.

    Best-effort: any Exception is reported as a warning (message truncated
    to 100 characters) instead of being raised.
    """
    try:
        insert_stmt = f"""
        INSERT INTO fintech_analytics.audit.pipeline_runs 
        (run_id, execution_date, environment, run_mode, status, start_time, total_tasks)
        VALUES (
            '{RUN_ID}',
            '{EXECUTION_DATE}',
            '{environment}',
            '{run_mode}',
            'RUNNING',
            CURRENT_TIMESTAMP(),
            4
        )
        """
        spark.sql(insert_stmt)
        print(f"✅ Pipeline run logged: {RUN_ID}")
    except Exception as exc:
        reason = str(exc)[:100]
        print(f"⚠️  Could not log pipeline start (audit table may not exist): {reason}")

def log_task_start(task_name):
    """Insert a 'RUNNING' row for ``task_name`` into the task audit table.

    Args:
        task_name: Task identifier stored in the ``task_name`` column of
            fintech_analytics.audit.task_runs for the current RUN_ID.

    Audit logging is best-effort: a failure is reported as a warning and
    never raised, so a broken audit table cannot stop the pipeline.
    """
    try:
        spark.sql(f"""
        INSERT INTO fintech_analytics.audit.task_runs
        (run_id, task_name, status, start_time)
        VALUES ('{RUN_ID}', '{task_name}', 'RUNNING', CURRENT_TIMESTAMP())
        """)
    except Exception as e:
        # Catch Exception rather than using a bare except: failures become
        # visible in the run log, and KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        print(f"⚠️  Could not log start of task '{task_name}': {str(e)[:100]}")

def log_task_success(task_name, rows=0):
    """Mark ``task_name`` as 'SUCCESS' in the task audit table.

    Updates every fintech_analytics.audit.task_runs row matching the current
    RUN_ID and ``task_name``: sets status, end_time (current timestamp) and
    rows_processed.

    Args:
        task_name: Task identifier used in the WHERE clause.
        rows: Value written to ``rows_processed`` (default 0).

    Audit logging is best-effort: a failure is reported as a warning and
    never raised.

    NOTE(review): this update does not set ``duration_seconds``, although
    the task-details query later in the notebook selects that column from
    the same table - confirm whether it should be populated here.
    """
    try:
        spark.sql(f"""
        UPDATE fintech_analytics.audit.task_runs
        SET status = 'SUCCESS',
            end_time = CURRENT_TIMESTAMP(),
            rows_processed = {rows}
        WHERE run_id = '{RUN_ID}' AND task_name = '{task_name}'
        """)
    except Exception as e:
        # Report instead of silently passing; a bare except would also
        # swallow KeyboardInterrupt/SystemExit.
        print(f"⚠️  Could not log success of task '{task_name}': {str(e)[:100]}")

def log_task_failure(task_name, error):
    """Mark ``task_name`` as 'FAILED' in the task audit table.

    Updates every fintech_analytics.audit.task_runs row matching the current
    RUN_ID and ``task_name``: sets status, end_time (current timestamp) and
    error_message.

    Args:
        task_name: Task identifier used in the WHERE clause.
        error: Exception or message; converted with ``str`` and truncated
            to 200 characters before being stored.

    Audit logging is best-effort: a failure is reported as a warning and
    never raised, so the original task error stays the one that matters.
    """
    try:
        # Truncate first, then double single quotes so the message cannot
        # terminate the SQL string literal early.
        error_msg = str(error)[:200].replace("'", "''")
        spark.sql(f"""
        UPDATE fintech_analytics.audit.task_runs
        SET status = 'FAILED',
            end_time = CURRENT_TIMESTAMP(),
            error_message = '{error_msg}'
        WHERE run_id = '{RUN_ID}' AND task_name = '{task_name}'
        """)
    except Exception as e:
        # Report instead of silently passing; a bare except would also
        # swallow KeyboardInterrupt/SystemExit.
        print(f"⚠️  Could not log failure of task '{task_name}': {str(e)[:100]}")

# Write the RUNNING record for this run before any task executes; the helper
# prints a warning instead of raising if the insert fails.
log_pipeline_start()


🔍 Initializing audit logging...
✅ Pipeline run logged: 20260222_035752


In [0]:
# COMMAND ----------
# STEP 3: TASK 1 - Silver Transformation

# Section header for the run log.
print("\n" + "=" * 80, "TASK 1: SILVER TRANSFORMATION", "=" * 80, sep="\n")

task_1_name = "silver_transformation"
task_1_success = False  # set to True only after the run, count and audit update finish
log_task_start(task_1_name)

try:
    print("  ⚙️  Running silver transformation...")

    # Run the silver notebook with no parameters and a 30-minute timeout.
    result = dbutils.notebook.run(
        "/Users/shreyashp042@gmail.com/fintech_pipeline/jobs/sec_smart_money_silver",
        timeout_seconds=1800,
        arguments={},
    )

    # Unfiltered COUNT(*): the total size of the silver table after the run,
    # not the number of rows this run wrote.
    silver_count = spark.sql(
        "SELECT COUNT(*) as count "
        "FROM fintech_analytics.silver.silver_fact_insider_transactions"
    ).collect()[0]["count"]

    log_task_success(task_1_name, silver_count)
    task_1_success = True

    print(f"    ✅ Silver transformation succeeded ({silver_count:,} rows)")

except Exception as exc:
    # Record the failure; the status-check cell that follows decides whether
    # the pipeline stops.
    error = str(exc)
    task_1_success = False
    log_task_failure(task_1_name, error)
    print(f"    ❌ Silver transformation failed: {error[:100]}")



TASK 1: SILVER TRANSFORMATION
  ⚙️  Running silver transformation...
    ✅ Silver transformation succeeded (394 rows)


In [0]:

# COMMAND ----------
# STEP 4: Check Task 1 Status

# Gate: the silver layer is critical, so stop the pipeline here if it failed.
if not task_1_success:
    print("""
❌ CRITICAL ERROR: Silver Transformation Failed
   Pipeline cannot continue

Actions:
  1. Check silver notebook logs
  2. Fix data source issue
  3. Re-run pipeline
    """)

    # Best-effort: mark the run as FAILED in the audit table. A failure here
    # is reported rather than silently ignored, and must not mask the
    # exception raised below.
    try:
        spark.sql(f"""
        UPDATE fintech_analytics.audit.pipeline_runs
        SET status = 'FAILED', end_time = CURRENT_TIMESTAMP()
        WHERE run_id = '{RUN_ID}'
        """)
    except Exception as audit_error:
        print(f"⚠️  Could not mark pipeline run as FAILED in audit table: {str(audit_error)[:100]}")

    raise Exception("Silver transformation failed - stopping pipeline")


In [0]:
# COMMAND ----------
# STEP 5: TASK 2 - Gold Analytics

# Section header for the run log.
print("\n" + "=" * 80, "TASK 2: GOLD ANALYTICS", "=" * 80, sep="\n")

task_2_name = "gold_analytics"
task_2_success = False  # set to True only after the run, count and audit update finish
log_task_start(task_2_name)

try:
    print("  ⚙️  Running gold analytics...")

    # Run the gold notebook with no parameters and a 20-minute timeout.
    result = dbutils.notebook.run(
        "/Users/shreyashp042@gmail.com/fintech_pipeline/jobs/sec_smart_money_gold",
        timeout_seconds=1200,
        arguments={},
    )

    # Unfiltered COUNT(*): the total size of the gold summary table after the
    # run, not the number of rows this run wrote.
    gold_count = spark.sql(
        "SELECT COUNT(*) as count "
        "FROM fintech_analytics.gold.gold_insider_summary_by_company"
    ).collect()[0]["count"]

    log_task_success(task_2_name, gold_count)
    task_2_success = True

    print(f"    ✅ Gold analytics succeeded ({gold_count:,} rows)")

except Exception as exc:
    # Record the failure; the status-check cell that follows decides whether
    # the pipeline stops.
    error = str(exc)
    task_2_success = False
    log_task_failure(task_2_name, error)
    print(f"    ❌ Gold analytics failed: {error[:100]}")


TASK 2: GOLD ANALYTICS
  ⚙️  Running gold analytics...
    ✅ Gold analytics succeeded (26 rows)


In [0]:
# COMMAND ----------
# STEP 6: Check Task 2 Status

# Gate: the gold layer is critical, so stop the pipeline here if it failed.
if not task_2_success:
    print("""
❌ CRITICAL ERROR: Gold Analytics Failed
   Pipeline cannot continue

Actions:
  1. Check gold notebook logs
  2. Fix transformation logic
  3. Re-run pipeline
    """)

    # Best-effort: mark the run as FAILED in the audit table. A failure here
    # is reported rather than silently ignored, and must not mask the
    # exception raised below.
    try:
        spark.sql(f"""
        UPDATE fintech_analytics.audit.pipeline_runs
        SET status = 'FAILED', end_time = CURRENT_TIMESTAMP()
        WHERE run_id = '{RUN_ID}'
        """)
    except Exception as audit_error:
        print(f"⚠️  Could not mark pipeline run as FAILED in audit table: {str(audit_error)[:100]}")

    raise Exception("Gold analytics failed - stopping pipeline")


In [0]:
# STEP 7: TASK 3 - Data Quality Checks

# Section header for the run log.
print("\n" + "=" * 80, "TASK 3: DATA QUALITY CHECKS", "=" * 80, sep="\n")

task_3_name = "data_quality_checks"
task_3_success = False  # set to True only after the run and audit update finish
log_task_start(task_3_name)

try:
    print("  ⚙️  Running quality checks...")

    # Run the quality-check notebook with no parameters and a 10-minute timeout.
    result = dbutils.notebook.run(
        "/Users/shreyashp042@gmail.com/fintech_pipeline/jobs/08_data_quality_checks",
        timeout_seconds=600,
        arguments={},
    )

    # No row count is collected for this task, so 0 is recorded.
    log_task_success(task_3_name, 0)
    task_3_success = True

    print("    ✅ Quality checks passed")

except Exception as exc:
    # Non-critical task: record the failure and let the pipeline continue.
    error = str(exc)
    task_3_success = False
    log_task_failure(task_3_name, error)
    print(f"    ⚠️  Quality checks warning: {error[:100]}")


TASK 3: DATA QUALITY CHECKS
  ⚙️  Running quality checks...
    ✅ Quality checks passed


In [0]:
# STEP 8: TASK 4 - Table Optimization (Non-Critical)

# Section header for the run log.
print("\n" + "=" * 80, "TASK 4: TABLE OPTIMIZATION", "=" * 80, sep="\n")

task_4_name = "table_optimization"
task_4_success = False  # set to True only after the run and audit update finish
log_task_start(task_4_name)

try:
    print("  ⚙️  Running table optimization...")

    # Run the optimization notebook with no parameters and a 15-minute timeout.
    result = dbutils.notebook.run(
        "/Users/shreyashp042@gmail.com/fintech_pipeline/jobs/table_optimization",
        timeout_seconds=900,
        arguments={},
    )

    # No row count is collected for this task, so 0 is recorded.
    log_task_success(task_4_name, 0)
    task_4_success = True

    print("    ✅ Tables optimized")

except Exception as exc:
    # Non-critical task: record the failure and let the pipeline continue.
    error = str(exc)
    task_4_success = False
    log_task_failure(task_4_name, error)
    print(f"    ⚠️  Optimization warning: {error[:100]}")


TASK 4: TABLE OPTIMIZATION
  ⚙️  Running table optimization...
    ✅ Tables optimized


In [0]:
# STEP 9: Calculate Summary

# Tally task outcomes; each success flag is a bool, so summing counts the True values.
total_tasks = 4
successful_tasks = sum(
    (task_1_success, task_2_success, task_3_success, task_4_success)
)
failed_tasks = total_tasks - successful_tasks

# Only the two critical tasks (silver, gold) decide the overall status;
# quality checks and optimization affect the counters only.
if task_1_success and task_2_success:
    overall_status = "SUCCESS"
else:
    overall_status = "FAILED"

# Wall-clock duration measured from START_TIME, in seconds.
end_time = datetime.now()
duration = (end_time - START_TIME).total_seconds()

print("\n" + "=" * 80, "📊 EXECUTION SUMMARY", "=" * 80, sep="\n")

summary_data = {
    "Run ID": RUN_ID,
    "Status": overall_status,
    "Total Tasks": total_tasks,
    "Successful": successful_tasks,
    "Failed": failed_tasks,
    "Duration (sec)": int(duration),
}

# One line per entry: label padded with dots to 40 characters, then the value.
for key, value in summary_data.items():
    print("  " + key.ljust(40, ".") + " " + str(value))


📊 EXECUTION SUMMARY
  Run ID.................................. 20260222_035752
  Status.................................. SUCCESS
  Total Tasks............................. 4
  Successful.............................. 4
  Failed.................................. 0
  Duration (sec).......................... 4043


In [0]:
# STEP 10: Update Pipeline Status in Audit

# Write the final status, duration and task counters for this run to the
# pipeline audit row. Best-effort: a failure is printed as a warning, not raised.
try:
    final_update_sql = f"""
    UPDATE fintech_analytics.audit.pipeline_runs
    SET status = '{overall_status}',
        end_time = CURRENT_TIMESTAMP(),
        duration_seconds = {int(duration)},
        successful_tasks = {successful_tasks},
        failed_tasks = {failed_tasks}
    WHERE run_id = '{RUN_ID}'
    """
    spark.sql(final_update_sql)
    print("\n✅ Pipeline status updated in audit table")
except Exception as exc:
    reason = str(exc)[:100]
    print(f"⚠️  Could not update audit table: {reason}")


✅ Pipeline status updated in audit table


In [0]:
# STEP 11: Display Audit Information

print("\n" + "=" * 80, "📋 AUDIT INFORMATION", "=" * 80, sep="\n")

# Read back this run's row from the pipeline audit table and render it.
# Display is informational only, so any failure is printed rather than raised.
try:
    pipeline_info_sql = f"""
    SELECT 
        run_id,
        status,
        duration_seconds,
        successful_tasks,
        failed_tasks
    FROM fintech_analytics.audit.pipeline_runs
    WHERE run_id = '{RUN_ID}'
    """
    pipeline_info = spark.sql(pipeline_info_sql)

    display(pipeline_info)
except Exception as exc:
    reason = str(exc)[:100]
    print(f"Could not display audit info: {reason}")


📋 AUDIT INFORMATION


run_id,status,duration_seconds,successful_tasks,failed_tasks
20260222_035752,SUCCESS,4043,4,0


In [0]:
# COMMAND ----------
# STEP 12: Show Task Details

print("\n📝 Task Execution Details:")

# List the task audit rows recorded for this run, oldest first.
# Display is informational only, so any failure is printed rather than raised.
try:
    task_details_sql = f"""
    SELECT 
        task_name,
        status,
        ROUND(duration_seconds, 1) as duration_sec
    FROM fintech_analytics.audit.task_runs
    WHERE run_id = '{RUN_ID}'
    ORDER BY start_time
    """
    task_details = spark.sql(task_details_sql)

    display(task_details)
except Exception as exc:
    reason = str(exc)[:100]
    print(f"Could not display task details: {reason}")



📝 Task Execution Details:


task_name,status,duration_sec
silver_transformation,SUCCESS,
silver_transformation,SUCCESS,
silver_transformation,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,
gold_analytics,SUCCESS,


In [0]:
# STEP 13: Final Report

print("\n" + "=" * 80, "🎉 PIPELINE EXECUTION COMPLETE", "=" * 80, sep="\n")

# Human-readable closing report. The non-SUCCESS branch lists every task's
# outcome; the SUCCESS branch shows the critical tasks as done and flags the
# non-critical ones individually.
if overall_status != "SUCCESS":
    print(f"""
⚠️ WARNING: Pipeline completed with issues

Status: {overall_status}
  - Silver layer: {'✅' if task_1_success else '❌'}
  - Gold layer: {'✅' if task_2_success else '❌'}
  - Quality checks: {'✅' if task_3_success else '⚠️'}
  - Tables optimized: {'✅' if task_4_success else '⚠️'}

Review error logs and fix issues.
    """)
else:
    print(f"""
✅ SUCCESS! Pipeline completed successfully

Summary:
  - Silver layer: Transformed ✅
  - Gold layer: Aggregated ✅
  - Quality checks: Validated {'✅' if task_3_success else '⚠️'}
  - Tables optimized: {'✅' if task_4_success else '⚠️'}
  - Duration: {int(duration)} seconds
  - Run ID: {RUN_ID}

Your data is ready for analysis!
    """)



🎉 PIPELINE EXECUTION COMPLETE

✅ SUCCESS! Pipeline completed successfully

Summary:
  - Silver layer: Transformed ✅
  - Gold layer: Aggregated ✅
  - Quality checks: Validated ✅
  - Tables optimized: ✅
  - Duration: 4043 seconds
  - Run ID: 20260222_035752

Your data is ready for analysis!
    


In [0]:
# STEP 14: Return Status (Safe)

# Publish the run summary as job task values so downstream workflow tasks can
# read them. Best-effort: a failure here must not fail the pipeline.
try:
    result = {
        "status": overall_status,
        "run_id": RUN_ID,
        "successful_tasks": successful_tasks,
        "failed_tasks": failed_tasks,
        "duration_seconds": int(duration)
    }
    # dbutils.jobs.taskValues.set takes one key and one value per call.
    # Passing the whole dict as a single argument raises TypeError, which the
    # previous bare except reported as "not in a job", so nothing was ever
    # published. Publish each entry under its own key instead.
    for task_value_key, task_value in result.items():
        dbutils.jobs.taskValues.set(key=task_value_key, value=task_value)
    print(f"\n✅ Results returned to workflow")
except Exception as e:
    # Show the real reason instead of assuming "not in a job" for every error.
    print(f"\n⚠️  Could not publish task values (OK when not running in a job): {str(e)[:100]}")

# COMMAND ----------
# Final Status

# Closing status line for the run log.
print(f"""
═══════════════════════════════════════════════════════════════
Pipeline Execution: {overall_status}
Run ID: {RUN_ID}
Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
═══════════════════════════════════════════════════════════════
""")

# End the notebook with "SUCCESS" or "FAILED" as its exit value; any status
# other than "SUCCESS" is reported as "FAILED".
dbutils.notebook.exit("SUCCESS" if overall_status == "SUCCESS" else "FAILED")
