In [0]:
# COMMAND ----------
print("""
╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║                  ✅ DATA QUALITY CHECKS                       ║
║                                                               ║
║        Validating data quality across silver and gold layers  ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝
""")


╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║                  ✅ DATA QUALITY CHECKS                       ║
║                                                               ║
║        Validating data quality across silver and gold layers  ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝



In [0]:

from datetime import datetime

print("""
╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║                  ✅ DATA QUALITY CHECKS                       ║
║                                                               ║
║        Validating data quality across silver and gold layers  ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝
""")

# COMMAND ----------
# STEP 1: Generate Execution ID (no parameters needed!)

run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
execution_date = datetime.now().strftime("%Y-%m-%d")

print(f"""
Quality Check Execution:
  Run ID: {run_id}
  Execution Date: {execution_date}
  Timestamp: {datetime.now()}
""")


╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║                  ✅ DATA QUALITY CHECKS                       ║
║                                                               ║
║        Validating data quality across silver and gold layers  ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝


Quality Check Execution:
  Run ID: 20260221_214901
  Execution Date: 2026-02-21
  Timestamp: 2026-02-21 21:49:01.501534



In [0]:
# STEP 2: Set Configuration

catalog = "fintech_analytics"
silver_schema = "silver"
gold_schema = "gold"
audit_schema = "audit"

print(f"""
Database Configuration:
  Catalog: {catalog}
  Silver Schema: {silver_schema}
  Gold Schema: {gold_schema}
  Audit Schema: {audit_schema}
""")


Database Configuration:
  Catalog: fintech_analytics
  Silver Schema: silver
  Gold Schema: gold
  Audit Schema: audit



In [0]:
# STEP 3: Quality Check 1 - Silver Table Exists

print("\n" + "="*80)
print("CHECK 1: SILVER FACT TABLE EXISTS")
print("="*80)

try:
    silver_table = f"{catalog}.{silver_schema}.silver_fact_insider_transactions"
    silver_count = spark.sql(f"SELECT COUNT(*) as count FROM {silver_table}").collect()[0]['count']
    
    check_1_status = "PASS"
    check_1_message = f"Silver table exists: {silver_count:,} rows"
    print(f"✅ {check_1_message}")
    
except Exception as e:
    check_1_status = "FAIL"
    check_1_message = f"Error: {str(e)[:100]}"
    print(f"❌ {check_1_message}")
    silver_count = 0



CHECK 1: SILVER FACT TABLE EXISTS
✅ Silver table exists: 985 rows


In [0]:
# STEP 4: Quality Check 2 - Required Columns

print("\n" + "="*80)
print("CHECK 2: REQUIRED COLUMNS IN SILVER")
print("="*80)

required_columns = [
    "transaction_id",
    "company_name",
    "filing_date",
    "insider_name",
    "transaction_amount",
    "shares"
]

try:
    silver_table = f"{catalog}.{silver_schema}.silver_fact_insider_transactions"
    df = spark.sql(f"SELECT * FROM {silver_table} LIMIT 1")
    actual_columns = set(df.columns)
    # STEP 4: Quality Check 2 - Required Columns

    required_columns = [
    "cik",
    "company_name",
    "filing_date",
    "insider_name",
    "security_title",
    "transaction_date",
    "shares",
    "price_per_share",
    "transaction_code",
    "acquired_disposed",
    "shares_after_transaction",
    "confidence_score"
    ]

    required_set = set(required_columns)
    missing = required_set - actual_columns
    extra = actual_columns - required_set
    
    if not missing:
        check_2_status = "PASS"
        check_2_message = f"All {len(required_columns)} required columns present"
        print(f"✅ {check_2_message}")
    else:
        check_2_status = "FAIL"
        check_2_message = f"Missing: {', '.join(missing)}"
        print(f"❌ {check_2_message}")
        
except Exception as e:
    check_2_status = "FAIL"
    check_2_message = f"Error: {str(e)[:100]}"
    print(f"❌ {check_2_message}")



CHECK 2: REQUIRED COLUMNS IN SILVER
✅ All 12 required columns present


In [0]:
# STEP 5: Quality Check 3 - Row Count

print("\n" + "="*80)
print("CHECK 3: ROW COUNT VALIDATION")
print("="*80)

try:
    silver_table = f"{catalog}.{silver_schema}.silver_fact_insider_transactions"
    count_result = spark.sql(f"SELECT COUNT(*) as cnt FROM {silver_table}").collect()[0]['cnt']
    min_threshold = 1  # Change to your minimum
    
    if count_result >= min_threshold:
        check_3_status = "PASS"
        check_3_message = f"Row count {count_result:,} is valid (minimum: {min_threshold})"
        print(f"✅ {check_3_message}")
    else:
        check_3_status = "FAIL"
        check_3_message = f"Row count {count_result:,} below minimum {min_threshold}"
        print(f"❌ {check_3_message}")
        
except Exception as e:
    check_3_status = "FAIL"
    check_3_message = f"Error: {str(e)[:100]}"
    print(f"❌ {check_3_message}")



CHECK 3: ROW COUNT VALIDATION
✅ Row count 985 is valid (minimum: 1)


In [0]:
# STEP 6: Quality Check 4 - No Null IDs

print("\n" + "="*80)
print("CHECK 4: PRIMARY KEY NOT NULL")
print("="*80)

try:
    silver_table = f"{catalog}.{silver_schema}.silver_fact_insider_transactions"
    null_count = spark.sql(f"""
    SELECT COUNT(*) as null_count
    FROM {silver_table}
    WHERE cik IS NULL and price_per_share IS NULL
    """).collect()[0]['null_count']
    
    if null_count == 0:
        check_4_status = "PASS"
        check_4_message = "No null values in transaction_id (primary key)"
        print(f"✅ {check_4_message}")
    else:
        check_4_status = "FAIL"
        check_4_message = f"Found {null_count:,} null values in transaction_id"
        print(f"❌ {check_4_message}")
        
except Exception as e:
    check_4_status = "FAIL"
    check_4_message = f"Error: {str(e)[:100]}"
    print(f"❌ {check_4_message}")



CHECK 4: PRIMARY KEY NOT NULL
✅ No null values in transaction_id (primary key)


In [0]:
# STEP 7: Quality Check 5 - Gold Table Exists

print("\n" + "="*80)
print("CHECK 5: GOLD TABLE EXISTS")
print("="*80)

try:
    gold_table = f"{catalog}.{gold_schema}.gold_insider_summary_by_company"
    gold_count = spark.sql(f"SELECT COUNT(*) as count FROM {gold_table}").collect()[0]['count']
    
    check_5_status = "PASS"
    check_5_message = f"Gold table Gold insider summary by company: {gold_count:,} rows"
    print(f"✅ {check_5_message}")
    gold_table = f"{catalog}.{gold_schema}.gold_kpi_summary"
    gold_count = spark.sql(f"SELECT COUNT(*) as count FROM {gold_table}").collect()[0]['count']
    
    check_5_status = "PASS"
    check_5_message = f"Gold table Gold KPI summary: {gold_count:,} rows"
    print(f"✅ {check_5_message}")
    
except Exception as e:
    check_5_status = "FAIL"
    check_5_message = f"Error: {str(e)[:100]}"
    print(f"❌ {check_5_message}")
    gold_count = 0



CHECK 5: GOLD TABLE EXISTS
✅ Gold table Gold insider summary by company: 33 rows
✅ Gold table Gold KPI summary: 6 rows


In [0]:
# ============================================================
# STEP 8: Quality Check 6 - Data Consistency (Silver vs Gold)
# ============================================================

print("\n" + "="*80)
print("CHECK 6: DATA CONSISTENCY (SILVER → GOLD)")
print("="*80)

try:
    silver_table = f"{catalog}.{silver_schema}.silver_fact_insider_transactions"
    gold_table = f"{catalog}.{gold_schema}.gold_insider_summary_by_company"

    # --------------------------------------------------------
    # Count DISTINCT companies in both layers
    # --------------------------------------------------------
    silver_companies = spark.sql(f"""
        SELECT COUNT(DISTINCT company_name) AS cnt
        FROM {silver_table}
    """).collect()[0]["cnt"]

    gold_companies = spark.sql(f"""
        SELECT COUNT(DISTINCT company_name) AS cnt
        FROM {gold_table}
    """).collect()[0]["cnt"]

    # --------------------------------------------------------
    # Lineage Validation (Gold must come from Silver)
    # --------------------------------------------------------
    mismatch_df = spark.sql(f"""
        SELECT DISTINCT company_name
        FROM {gold_table}

        EXCEPT

        SELECT DISTINCT company_name
        FROM {silver_table}
    """)

    mismatch_count = mismatch_df.count()

    # --------------------------------------------------------
    # Final Validation Logic
    # --------------------------------------------------------
    if mismatch_count == 0 and silver_companies == gold_companies:
        check_6_status = "PASS"
        check_6_message = (
            f"Consistency OK: {silver_companies} companies aligned across Silver and Gold"
        )
        print(f"✅ {check_6_message}")

    else:
        check_6_status = "FAIL"
        check_6_message = (
            f"Mismatch detected → Silver: {silver_companies}, Gold: {gold_companies}, "
            f"Extra in Gold: {mismatch_count}"
        )
        print(f"❌ {check_6_message}")

        print("\n⚠️ Companies present in GOLD but missing in SILVER:")
        mismatch_df.show(truncate=False)

except Exception as e:
    check_6_status = "FAIL"
    check_6_message = f"Error during consistency check: {str(e)[:120]}"
    print(f"❌ {check_6_message}")


CHECK 6: DATA CONSISTENCY (SILVER → GOLD)
✅ Consistency OK: 28 companies aligned across Silver and Gold


In [0]:
# STEP 9: Attempt to Log to Audit Table (Safe - won't crash if fails)

print("\n" + "="*80)
print("LOGGING RESULTS TO AUDIT TABLE")
print("="*80)

quality_checks = [
    {"check_name": "silver_table_exists", "status": check_1_status, "message": check_1_message},
    {"check_name": "required_columns", "status": check_2_status, "message": check_2_message},
    {"check_name": "row_count", "status": check_3_status, "message": check_3_message},
    {"check_name": "no_null_ids", "status": check_4_status, "message": check_4_message},
    {"check_name": "gold_table_exists", "status": check_5_status, "message": check_5_message},
    {"check_name": "data_consistency", "status": check_6_status, "message": check_6_message},
]

logged_count = 0
for check in quality_checks:
    try:
        spark.sql(f"""
        INSERT INTO {catalog}.{audit_schema}.data_quality_checks
        (run_id, table_name, check_name, status, message, execution_date, created_at)
        VALUES (
            '{run_id}',
            'silver_fact_insider_transactions',
            '{check["check_name"]}',
            '{check["status"]}',
            '{check["message"].replace("'", "''")[:200]}',
            CAST('{execution_date}' AS DATE),
            CURRENT_TIMESTAMP()
        )
        """)
        logged_count += 1
    except Exception as e:
        # Silently continue if audit table doesn't exist
        pass

if logged_count > 0:
    print(f"✅ Logged {logged_count} checks to audit table")
else:
    print(f"⚠️  Could not log to audit table (table may not exist)")



LOGGING RESULTS TO AUDIT TABLE
✅ Logged 6 checks to audit table


In [0]:
# STEP 10: Summary Report

print("\n" + "="*80)
print("📊 QUALITY CHECK SUMMARY")
print("="*80)

total = len(quality_checks)
passed = sum(1 for c in quality_checks if c["status"] == "PASS")
failed = sum(1 for c in quality_checks if c["status"] == "FAIL")

print(f"""
Results:
  Total Checks: {total}
  Passed: {passed} ✅
  Failed: {failed} ❌
  Success Rate: {100 * passed / total:.1f}%
""")

# Display as table
summary_df = spark.createDataFrame([
    {"Total": total, "Passed": passed, "Failed": failed, "Success %": f"{100 * passed / total:.1f}%"}
])

display(summary_df)


📊 QUALITY CHECK SUMMARY

Results:
  Total Checks: 6
  Passed: 6 ✅
  Failed: 0 ❌
  Success Rate: 100.0%



Failed,Passed,Success %,Total
0,6,100.0%,6


In [0]:
# STEP 11: Detailed Results Table

print("\nDetailed Results:")

results_df = spark.createDataFrame(quality_checks)
display(results_df)

# COMMAND ----------
# STEP 12: Safe Return Status

all_passed = all(c["status"] == "PASS" for c in quality_checks)

result = {
    "status": "SUCCESS" if all_passed else "PARTIAL",
    "total_checks": total,
    "passed": passed,
    "failed": failed
}

try:
    dbutils.jobs.taskValues.set(result)
except:
    # OK if this fails (not in a job)
    pass

print(f"""
Final Status: {result['status']}
Quality checks complete!
""")


Detailed Results:


check_name,message,status
silver_table_exists,Silver table exists: 985 rows,PASS
required_columns,All 12 required columns present,PASS
row_count,Row count 985 is valid (minimum: 1),PASS
no_null_ids,No null values in transaction_id (primary key),PASS
gold_table_exists,Gold table Gold KPI summary: 6 rows,PASS
data_consistency,Consistency OK: 28 companies aligned across Silver and Gold,PASS



Final Status: SUCCESS
Quality checks complete!



In [0]:
# STEP 13: Next Steps

if all_passed:
    print("""
✅ ALL QUALITY CHECKS PASSED!

Everything is good:
  ✓ Silver layer has data
  ✓ Gold layer has data
  ✓ Data is consistent
  ✓ No critical nulls
  ✓ All columns present
  
Pipeline can continue to optimization!
    """)
else:
    print("""
⚠️ SOME QUALITY CHECKS FAILED

Review the checks above and debug:
  
For silver table issues:
  SELECT * FROM fintech_analytics.silver.silver_fact_insider_transactions LIMIT 10;
  
For gold table issues:
  SELECT * FROM fintech_analytics.gold.gold_company_summary LIMIT 10;
  
For consistency issues:
  SELECT company_name, COUNT(*) as count
  FROM fintech_analytics.silver.silver_fact_insider_transactions
  GROUP BY company_name;
    """)

# COMMAND ----------
print("✅ Quality Checks Notebook Complete")


✅ ALL QUALITY CHECKS PASSED!

Everything is good:
  ✓ Silver layer has data
  ✓ Gold layer has data
  ✓ Data is consistent
  ✓ No critical nulls
  ✓ All columns present
  
Pipeline can continue to optimization!
    
✅ Quality Checks Notebook Complete
