# Quality checks

#### Setup: imports and logging helper

In [43]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from datetime import datetime

def log_check(layer, table, check_name, status, details=""):
    """Append one data-quality check result to the ``dq_stock_checks`` Delta table.

    Args:
        layer: Layer label stored in the ``layer`` column (callers in this
            notebook pass "bronze", "silver" or "gold").
        table: Name of the checked table; stored in the ``table_name`` column.
        check_name: Identifier of the check; stored in ``check_name``.
        status: Outcome of the check (callers in this notebook pass "pass"
            or "fail").
        details: Optional free-text context; defaults to an empty string.

    Every call builds a one-row DataFrame and performs its own append, so
    callers logging many results in a loop pay one write per result.
    """
    # Function-scope import so this cell does not depend on edits to the
    # import lines above it.
    from datetime import timezone

    # Use a timezone-aware UTC timestamp. PySpark converts a *naive* datetime
    # using the driver's local time zone, so the naive value returned by
    # datetime.utcnow() is only stored correctly when the driver itself runs
    # in UTC. (utcnow() is also deprecated since Python 3.12.)
    record = {
        "layer": layer,
        "table_name": table,
        "check_name": check_name,
        "status": status,
        "details": details,
        "timestamp": datetime.now(timezone.utc),
    }
    df = spark.createDataFrame([record])
    df.write.format("delta").mode("append").saveAsTable("dq_stock_checks")


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 45, Finished, Available, Finished)

## Bronze Checks

#### 1. Check - Does the table contain rows?

In [44]:
# Row-count check: the table passes when it holds at least one row.
# `table` and `df` stay bound at notebook level; the next cell reads them.
table = "bronze_stock_daily"
df = spark.table(table)
count = df.count()

print(f"[Bronze] Row count for {table}: {count}")

if count > 0:
    row_count_status = "pass"
else:
    row_count_status = "fail"

log_check(
    layer="bronze",
    table=table,
    check_name="row_count",
    status=row_count_status,
    details=f"rows={count}",
)


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 46, Finished, Available, Finished)

[Bronze] Row count for bronze_stock_daily: 18875


#### 2. Check - Duplicates?

In [45]:
# Duplicate check: count the (Ticker, Datetime) keys that occur more than once.
# Relies on `df` and `table` already being bound by an earlier cell.
key_counts = df.groupBy("Ticker", "Datetime").count()
dups = key_counts.filter(F.col("count") > 1).count()

duplicates_status = "fail" if dups != 0 else "pass"

log_check(
    layer="bronze",
    table=table,
    check_name="duplicates",
    status=duplicates_status,
    details=f"duplicates={dups}",
)


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 47, Finished, Available, Finished)

#### 3. Check - Time gaps (minute completeness per trading day)

In [46]:
from pyspark.sql import functions as F

table = "bronze_stock_minutes"
df = spark.table(table)

# Expected number of trading minutes per day (regular session)
EXPECTED_MINUTES = 390

# Allow up to this many missing minutes per day before the check fails
MISSING_TOLERANCE = 15

# Extract date from Datetime column
df = df.withColumn("date", F.to_date("Datetime"))

# Group by ticker and date to calculate completeness metrics
results = (
    df.groupBy("Ticker", "date")
      .agg(
          F.count("*").alias("rows"),
          F.min("Datetime").alias("start_time"),
          F.max("Datetime").alias("end_time")
      )
)

# Display results for inspection
display(results)

# Build one check record per (ticker, date). The records are collected first
# and appended in a single write below: appending them one at a time would
# issue a separate Delta append for every group.
records = []
for row in results.collect():
    ticker = row["Ticker"]
    date = row["date"]
    rows = row["rows"]
    missing = EXPECTED_MINUTES - rows

    # NOTE(review): a day with more than EXPECTED_MINUTES rows gives a
    # negative `missing` and therefore passes - confirm that is intended.
    status = "pass" if missing <= MISSING_TOLERANCE else "fail"
    details = f"date={date}, rows={rows}, missing_minutes={missing}"

    records.append({
        "layer": "bronze",
        "table_name": table,
        "check_name": f"minute_completeness_{ticker}_{date}",
        "status": status,
        "details": details,
    })

# Single append with the same columns that log_check writes. The guard is
# needed because createDataFrame cannot infer a schema from an empty list.
if records:
    (
        spark.createDataFrame(records)
        .withColumn("timestamp", F.current_timestamp())
        .write.format("delta")
        .mode("append")
        .saveAsTable("dq_stock_checks")
    )


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 48, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 16268704-e4d9-4ab4-88d6-a4af1daf5e30)

## Silver Checks

#### 1. Check - Missing values (nulls)

In [47]:
# Null check: count the NULL values in every column of the silver table.
table = "silver_stock_daily"
df = spark.table(table)

# when() yields a non-null marker only for NULL cells and count() skips
# NULLs, so each aggregate is the number of NULL cells in that column.
null_count_exprs = []
for column_name in df.columns:
    null_marker = F.when(F.col(column_name).isNull(), column_name)
    null_count_exprs.append(F.count(null_marker).alias(column_name))

nulls = df.select(null_count_exprs)

# A global aggregate yields exactly one row.
null_counts = nulls.first().asDict()
print("[Silver] Null counts:", null_counts)

failed_cols = [name for name, n_null in null_counts.items() if n_null > 0]

null_status = "fail" if failed_cols else "pass"

log_check(
    layer="silver",
    table=table,
    check_name="null_values",
    status=null_status,
    details=str(null_counts),
)


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 49, Finished, Available, Finished)

[Silver] Null counts: {'ticker': 0, 'date': 0, 'open': 0, 'high': 0, 'low': 0, 'close': 0, 'volume': 0, 'ingestion_time': 0}


## Gold Checks

#### 1. Check - KPI completeness (required indicator columns)

In [48]:
# Schema check: every required indicator column must exist in the gold table.
# `table` and `df` stay bound at notebook level; the next cell reads them.
table = "gold_stock_daily_indicators"
df = spark.table(table)

required_cols = ["sma_20", "sma_50", "rsi_14", "macd_hist", "ema_12", "ema_26"]
existing_cols = set(df.columns)
missing_cols = [name for name in required_cols if name not in existing_cols]

print("[Gold] Missing columns:", missing_cols)

required_columns_status = "fail" if missing_cols else "pass"

log_check(
    layer="gold",
    table=table,
    check_name="required_columns",
    status=required_columns_status,
    details=str(missing_cols),
)


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 50, Finished, Available, Finished)

[Gold] Missing columns: []


#### 2. Check - Outliers

In [49]:
# Range check: count rows whose rsi_14 lies outside the closed interval [0, 100].
# Relies on `df` and `table` already being bound by an earlier cell.
rsi = F.col("rsi_14")
within_range = (rsi >= 0) & (rsi <= 100)

# NULL rsi_14 values make the predicate NULL, so those rows are not counted.
outliers = df.filter(~within_range).count()

print(f"[Gold] RSI outliers: {outliers}")

rsi_status = "fail" if outliers != 0 else "pass"

log_check(
    layer="gold",
    table=table,
    check_name="rsi_range",
    status=rsi_status,
    details=f"outliers={outliers}",
)


StatementMeta(, 145a63cd-b36a-43fb-a049-753f45cb85ba, 51, Finished, Available, Finished)

[Gold] RSI outliers: 0
