### Data Validation

In [0]:
from pyspark.sql.functions import count, to_date, col, max
from pyspark.sql import Row
from datetime import datetime

In [0]:
# Load the landing-zone complaints data as CSV.
# Reader settings: first line is a header, column types are inferred,
# records may span several lines, and a double quote is the escape character.
landing_path = "abfss://landing@rcmadls10dev.dfs.core.windows.net/complaints"

landing_reader = spark.read.format("csv").options(
    header="true",
    inferSchema="true",
    multiLine="true",
    escape='"',
)
landing_df = landing_reader.load(landing_path)

In [0]:
# Count landing rows per source file (file name + modification time taken
# from the `_metadata` column) ...
lnd_rows_per_file_df = landing_df.groupBy(
    "_metadata.file_name",
    "_metadata.file_modification_time",
).count()

# ... then keep only the file with the latest modification time.
lnd_newest_first = col("_metadata.file_modification_time").desc()
most_recent_lnd_file_df = lnd_rows_per_file_df.orderBy(lnd_newest_first).limit(1)

# Show the modification time and row count of that file
display(most_recent_lnd_file_df.select("file_modification_time", "count"))

In [0]:
# Load the bronze-layer complaints data, stored as Parquet
bronze_df = spark.read.load(
    "abfss://bronze@rcmadls10dev.dfs.core.windows.net/complaints",
    format="parquet",
)



In [0]:
# Count bronze rows per source file (file name + modification time taken
# from the `_metadata` column) ...
brz_rows_per_file_df = bronze_df.groupBy(
    "_metadata.file_name",
    "_metadata.file_modification_time",
).count()

# ... then keep only the file with the latest modification time.
brz_newest_first = col("_metadata.file_modification_time").desc()
most_recent_brz_file_df = brz_rows_per_file_df.orderBy(brz_newest_first).limit(1)

# Show the row count of that file
display(most_recent_brz_file_df.select("count"))

### Saving Data Validation to Audit Table

In [0]:
# Fetch the single summary row for the newest file on each side.
# `first()` returns None for an empty DataFrame, whereas `collect()[0]`
# would raise a bare IndexError and abort before any audit row is written.
landing_row = most_recent_lnd_file_df.select("count").first()
bronze_row = most_recent_brz_file_df.select("count").first()

# A side that produced no rows is recorded with a count of 0 so the audit
# record keeps integer-typed count columns and can still be saved.
landing_count = landing_row[0] if landing_row is not None else 0
bronze_count = bronze_row[0] if bronze_row is not None else 0

# Validation can only succeed when both sides actually yielded a count;
# two missing sides must not be reported as a matching 0 == 0.
counts_available = landing_row is not None and bronze_row is not None

# Prepare validation record
validation_result = Row(
    validation_date=datetime.now(),
    landing_count=landing_count,
    bronze_count=bronze_count,
    status="Success" if counts_available and landing_count == bronze_count else "Failed"
)



In [0]:
# Wrap the validation record in a one-row DataFrame and append it to the
# audit location in Delta format.
audit_path = "abfss://bronze@rcmadls10dev.dfs.core.windows.net/audit_complaints"

validation_df = spark.createDataFrame([validation_result])

validation_df.write.save(audit_path, format="delta", mode="append")

In [0]:
%sql
-- Register the audit Delta location as a table if it is not registered yet
CREATE TABLE IF NOT EXISTS databricks_catalog.bronze.audit_complaints
USING DELTA
LOCATION 'abfss://bronze@rcmadls10dev.dfs.core.windows.net/audit_complaints';