## Data Quality Rules Implementation

In [0]:
# Read the complaints dataset stored as Parquet in the bronze container.
bronze_df = (
    spark.read
    .format("parquet")
    .load("abfss://bronze@rcmadls10dev.dfs.core.windows.net/complaints")
)

In [0]:
# Read the br_complaints Delta table from the silver container; every
# validation step below operates on this DataFrame.
silver_df = (
    spark.read
    .format("delta")
    .load("abfss://silver@rcmadls10dev.dfs.core.windows.net/br_complaints")
)

## Basic Data Validation Rules
### Non-Null Check on date_received, complaint_id and product columns


In [0]:
# Columns that must be populated; a row missing any one of them is dropped.
required_columns = ['complaint_id', 'date_received', 'product']
silver_df = silver_df.dropna(subset=required_columns)

### Valid Date Checks

In [0]:
from pyspark.sql.functions import col
# Keep only rows that have a date_received value.
# NOTE(review): this only tests for NULL, which the dropna step above already
# covers for date_received; it does not verify that the value is a well-formed
# date. Confirm whether a stricter check is intended.
has_date_received = col('date_received').isNotNull()
silver_df = silver_df.filter(has_date_received)

### Unique Constraints

In [0]:
# Enforce uniqueness of complaint_id.
# dropDuplicates leaves the data unchanged when every complaint_id is already
# unique, so it is applied unconditionally instead of first running two
# full-table count() jobs just to decide whether to call it.
silver_df = silver_df.dropDuplicates(['complaint_id'])

In [0]:
# List every complaint_id that occurs more than once, with its occurrence count.
occurrences_per_id = silver_df.groupBy('complaint_id').count()
duplicate_ids_df = occurrences_per_id.filter(col("count")>1)
duplicate_ids_df.display()

## Range and Threshold Validation
### Complaint Age and Zip Code

In [0]:
# A record is flagged as invalid when either condition holds:
#   - complaint_age is negative
#   - zip_code contains the substring "XX" (rlike matches anywhere in the value)
has_negative_age = col('complaint_age') < 0
zip_contains_xx = col('zip_code').rlike("XX")
invalid_silver_df = silver_df.filter(has_negative_age | zip_contains_xx)

## Quarantine Invalid Records

In [0]:
# Show the flagged records before they are written to quarantine.
display(invalid_silver_df)

Databricks visualization. Run in Databricks to view.

In [0]:
# Append the flagged records to the quarantine Delta location.
# mergeSchema allows the write to succeed when the incoming DataFrame has
# columns the existing table does not have yet.
# NOTE(review): because the mode is "append", re-running this cell writes the
# same records again.
quarantine_path = "abfss://silver@rcmadls10dev.dfs.core.windows.net/qurantined_complaints"
(
    invalid_silver_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(quarantine_path)
)

In [0]:
# Summarise the outcome of the quality checks.
# Every count() launches a Spark job, so the silver total is computed once and
# reused for both the valid count and the percentage.
invalid_count = invalid_silver_df.count()
silver_count = silver_df.count()

valid_count = silver_count - invalid_count

# An empty silver table would otherwise raise ZeroDivisionError here.
invalid_percent = invalid_count / silver_count * 100 if silver_count else 0.0

print(f"Total Bronze Records: {bronze_df.count()}")
print(f"Valid Silver records: {valid_count}")
print(f"Invalid Silver records: {invalid_count}")
print(f"Percentage Invalid: {invalid_percent}%")

### NULL Percent of Important Columns

In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# Columns whose share of NULL values is reported.
id_columns = ['complaint_id', 'date_received', 'sub_issue', 'company_public_response', 'consumer_consent_provided', 'consumer_complaint_narrative']

# Count the rows once up front; calling silver_df.count() inside the
# comprehension would launch one extra Spark job per column.
total_rows = silver_df.count()
# With zero rows every sum is NULL, so dividing by 1 still yields NULL while
# avoiding a divide-by-zero expression.
denominator = max(total_rows, 1)

# Percentage (0-100) of NULL values per column, matching the section heading
# and the percentage printed in the summary cell.
null_percent_df = silver_df.select([
    (spark_sum(col(c).isNull().cast("int")) / denominator * 100).alias(c)
    for c in id_columns
])
display(null_percent_df)

Databricks visualization. Run in Databricks to view.