# Data Quality Check

This notebook explores data quality by checking for null values, missing data, and the uniqueness of primary keys.

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import *

Assign variables that store the path of each table in the silver container.

In [0]:
# Root URI of the silver container; every table path below is a folder under it.
SILVER_CONTAINER_URI = "abfss://silver@dlgysnergy.dfs.core.windows.net"

# One path variable per silver table, named <table>_silver_path.
fact_transactions_silver_path = f"{SILVER_CONTAINER_URI}/fact_transactions"
hier_clnd_silver_path = f"{SILVER_CONTAINER_URI}/hier_clnd"
hier_hldy_silver_path = f"{SILVER_CONTAINER_URI}/hier_hldy"
hier_invloc_silver_path = f"{SILVER_CONTAINER_URI}/hier_invloc"
hier_invstatus_silver_path = f"{SILVER_CONTAINER_URI}/hier_invstatus"
hier_possite_silver_path = f"{SILVER_CONTAINER_URI}/hier_possite"
hier_pricestate_silver_path = f"{SILVER_CONTAINER_URI}/hier_pricestate"
hier_prod_silver_path = f"{SILVER_CONTAINER_URI}/hier_prod"
hier_rtlloc_silver_path = f"{SILVER_CONTAINER_URI}/hier_rtlloc"

# Alias kept so any cell still using the earlier, inconsistently spelled name keeps working.
hier_prod__silverpath = hier_prod_silver_path


This function reads a table that is stored in Delta format in the silver container.

In [0]:
def read_delta_from_silver(file_path):
    """Load the Delta-format data located at ``file_path``.

    Uses the ``spark`` name from the notebook's global scope; it is not
    defined in this part of the notebook (presumably the session object
    supplied by the runtime -- confirm).

    Parameters
    ----------
    file_path : str
        Location of the Delta data to read.

    Returns
    -------
    The object returned by ``spark.read.format("delta").load(file_path)``.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.load(file_path)


In [0]:
# Read the two silver datasets from their configured paths.
fact_transactions_silver = read_delta_from_silver(
    file_path=fact_transactions_silver_path
)
hier_clnd_silver = read_delta_from_silver(
    file_path=hier_clnd_silver_path
)

In [0]:
def find_null_missing(df):
    """Count null values and empty strings for every column of ``df``.

    All counts are computed in a single aggregation over ``df`` rather than
    with separate filter/count jobs per column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame to profile.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per column of ``df`` with the fields ``Column`` (column
        name), ``Null_Values`` (number of null entries) and
        ``Empty_Strings`` (number of ``''`` entries). ``Empty_Strings`` is
        null for columns whose type is not ``string``, because the check
        does not apply to them. The result has zero rows when ``df`` has no
        columns.
    """
    # Function-scope import so the helper does not depend on star-imported names.
    from pyspark.sql import functions as F

    # Explicit schema: inference from rows fails when there are no rows or
    # when every Empty_Strings value is null.
    result_schema = "`Column` string, `Null_Values` long, `Empty_Strings` long"

    fields = df.schema.fields
    if not fields:
        # agg() needs at least one expression, so short-circuit here.
        return spark.createDataFrame([], schema=result_schema)

    # Position-based aliases avoid clashes with unusual column names.
    aggregates = []
    for position, field in enumerate(fields):
        column = F.col(field.name)
        # when() without otherwise() yields null for non-matching rows,
        # and count() ignores nulls, so this counts the matching rows.
        aggregates.append(
            F.count(F.when(column.isNull(), 1)).alias(f"nulls_{position}")
        )
        # Check for empty strings only in string-type columns
        if field.dataType.simpleString() == "string":
            aggregates.append(
                F.count(F.when(column == "", 1)).alias(f"empties_{position}")
            )

    # A global aggregation always returns exactly one row, even for an empty df.
    counts = df.agg(*aggregates).first().asDict()

    results = [
        (
            field.name,
            counts[f"nulls_{position}"],
            # None (SQL NULL) marks columns where the empty-string check was skipped.
            counts.get(f"empties_{position}"),
        )
        for position, field in enumerate(fields)
    ]

    return spark.createDataFrame(results, schema=result_schema)

In [0]:
# Display the per-column null / empty-string summary for fact_transactions_silver.
find_null_missing(fact_transactions_silver).show()

In [0]:
# Total number of rows in fact_transactions_silver (shown as the cell output).
fact_transactions_silver.count()