In [1]:
import psycopg2  # Importing PostgreSQL database adapter for Python
from psycopg2 import sql  # Importing SQL module for safely constructing SQL queries
import os  # Importing OS module for interacting with the operating system
import logging  # Importing logging module for event tracking and debugging
import warnings  # Warnings module to control warning messages

# Suppress all warnings
warnings.filterwarnings("ignore")


In [2]:

notebook_name = 'Initial Load Testing'

# Path of the info-level log file written by this notebook
info_log_path = f'../Logs/info/{notebook_name}_info.log'

# Creating directories if they don't exist
os.makedirs(os.path.dirname(info_log_path), exist_ok=True)

# Clearing any previous handlers on the root logger if re-running this setup
logger = logging.getLogger()
while logger.handlers:
    logger.handlers.pop()

# Configuring logging
info_logger = logging.getLogger('info_logger')

# The file handler below is attached to info_logger, not to the root logger,
# so the root clean-up above never removes it. Without this loop every re-run
# of the cell would stack one more handler and each message would be written
# to the log file several times.
for stale_handler in list(info_logger.handlers):
    info_logger.removeHandler(stale_handler)
    stale_handler.close()  # release the file handle of the previous run

info_handler = logging.FileHandler(info_log_path, mode='a')  # Append mode
info_handler.setLevel(logging.INFO)

# Timestamp - level - message layout for every log line
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
info_handler.setFormatter(formatter)

# Attaching the single file handler and enabling INFO and above
info_logger.addHandler(info_handler)
info_logger.setLevel(logging.INFO)


### Initial load for all dimension and fact tables

In [3]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Row count of region_dimension next to the number of distinct
    # local_authority_code values in the source table.
    "record_completeness : region_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM region_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT local_authority_code) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row with a NULL counter per checked column.
    "null_checks : region_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN region_code IS NULL THEN 1 ELSE 0 END) AS null_region_code,
          SUM(CASE WHEN region_name IS NULL THEN 1 ELSE 0 END) AS null_region_name,
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN local_authority_name IS NULL THEN 1 ELSE 0 END) AS null_local_authority_name
        FROM region_dimension;
    """,
    # Returns a row only for local_authority_code values that occur more than
    # once, so an empty result means no duplicates.
    "duplicate_records_check : region_dimension": """
        -- Check for duplicate records in the dimension
        SELECT 
            local_authority_code, 
            COUNT(*) AS duplicate_count
        FROM 
            region_dimension
        GROUP BY 
            local_authority_code
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the four region columns between the source
    # table and the dimension; both counts are reported.
    "accuracy_check : region_dimension": """
        -- Check records from source not in target and vice versa
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, region_code, region_name, local_authority_name
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, region_code, region_name, local_authority_name
            FROM region_dimension
        ) AS missing_records

        UNION ALL

        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, region_code, region_name, local_authority_name
            FROM region_dimension
            EXCEPT
            SELECT local_authority_code, region_code, region_name, local_authority_name
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """
}

def execute_query(query, output_file):
    """Run one SQL query and echo every result row to a file and to stdout.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file object; each row is written as
            ``str(row)`` followed by a newline.

    A new connection is opened from the module-level ``DB_CONFIG`` on every
    call and is always closed before the function returns.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # In psycopg2 `with conn` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                results = cur.fetchall()
    finally:
        conn.close()

    # Writing each row to the output file and printing it
    for row in results:
        line = str(row) + '\n'
        output_file.write(line)
        print(line.strip())

def main():
    """Run every check in ``queries`` and save the output to the results file.

    For each entry a "Running ..." line, the rows produced by
    ``execute_query`` and a "Completed ..." line are written to the results
    file; the two status lines are also printed. The file is overwritten on
    every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/region_dimension_query_results.txt'

    # open(..., 'w') does not create missing folders, so make sure the
    # results directory exists first (same approach as the log directory).
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Calling the main function
if __name__ == "__main__":
    main()
    # Logged inside the guard so the entry is only written when the checks
    # were actually executed.
    info_logger.info("Testing completed for Region dimension")

Running record_completeness : region_dimension...
('Total Records in Dimension', 292)
('Records from Source Table', 292)
Completed record_completeness : region_dimension
Running null_checks : region_dimension...
(0, 0, 0, 0)
Completed null_checks : region_dimension
Running duplicate_records_check : region_dimension...
Completed duplicate_records_check : region_dimension
Running accuracy_check : region_dimension...
('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : region_dimension


In [4]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Row count of date_dimension next to the number of distinct date values
    # in the source table.
    "record_completeness : date_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM date_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT date) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row with a NULL counter per checked column.
    "null_checks : date_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN month IS NULL THEN 1 ELSE 0 END) AS null_month,
          SUM(CASE WHEN quarter IS NULL THEN 1 ELSE 0 END) AS null_quarter,
          SUM(CASE WHEN year IS NULL THEN 1 ELSE 0 END) AS null_year,
          SUM(CASE WHEN transfer_month_year IS NULL THEN 1 ELSE 0 END) AS null_transfer_month_year
        FROM date_dimension;
    """,
    # Returns a row only for date values that occur more than once, so an
    # empty result means no duplicates.
    "duplicate_records_check : date_dimension": """
        -- Check for duplicate records in the dimension
        SELECT 
            date, 
            COUNT(*) AS duplicate_count
        FROM 
            date_dimension
        GROUP BY 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the five date columns between the source
    # table and the dimension; both counts are reported.
    "accuracy_check : date_dimension": """
        -- Check records from source not in target and vice versa
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT date, month, quarter, year, transfer_month_year
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT date, month, quarter, year, transfer_month_year
            FROM date_dimension
        ) AS missing_records

        UNION ALL

        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT date, month, quarter, year, transfer_month_year
            FROM date_dimension
            EXCEPT
            SELECT date, month, quarter, year, transfer_month_year
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """
}


def execute_query(query, output_file):
    """Run one SQL query and echo every result row to a file and to stdout.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file object; each row is written as
            ``str(row)`` followed by a newline.

    A new connection is opened from the module-level ``DB_CONFIG`` on every
    call and is always closed before the function returns.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # In psycopg2 `with conn` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                results = cur.fetchall()
    finally:
        conn.close()

    # Writing each row to the output file and printing it
    for row in results:
        line = str(row) + '\n'
        output_file.write(line)
        print(line.strip())

def main():
    """Run every check in ``queries`` and save the output to the results file.

    For each entry a "Running ..." line, the rows produced by
    ``execute_query`` and a "Completed ..." line are written to the results
    file; the two status lines are also printed. The file is overwritten on
    every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/date_dimension_query_results.txt'

    # open(..., 'w') does not create missing folders, so make sure the
    # results directory exists first (same approach as the log directory).
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Calling the main function
if __name__ == "__main__":
    main()
    # Logged inside the guard so the entry is only written when the checks
    # were actually executed.
    info_logger.info("Testing completed for Date dimension")

Running record_completeness : date_dimension...
('Total Records in Dimension', 12)
('Records from Source Table', 12)
Completed record_completeness : date_dimension
Running null_checks : date_dimension...


(0, 0, 0, 0, 0)
Completed null_checks : date_dimension
Running duplicate_records_check : date_dimension...
Completed duplicate_records_check : date_dimension
Running accuracy_check : date_dimension...
('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : date_dimension


In [5]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Distinct local_authority_code count of the source table next to the
    # distinct local_authority_code count of vehicle_dimension.
    "record_completeness : vehicle_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Source Unique Local Authority Codes' AS description,
            COUNT(DISTINCT local_authority_code) AS count
        FROM initial_load_etl_source_data

        UNION ALL

        SELECT 
            'Dimension Unique Records' AS description,
            COUNT(DISTINCT local_authority_code) AS count
        FROM vehicle_dimension;
    """,
    # One row with a NULL counter per checked column.
    "null_checks : vehicle_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_in_local_authority_code,
          SUM(CASE WHEN region_id IS NULL THEN 1 ELSE 0 END) AS null_in_region_id,
          SUM(CASE WHEN buses_total IS NULL THEN 1 ELSE 0 END) AS null_in_buses_total,
          SUM(CASE WHEN petrol_cars_total IS NULL THEN 1 ELSE 0 END) AS null_in_petrol_cars_total,
          SUM(CASE WHEN hgv_total IS NULL THEN 1 ELSE 0 END) AS null_in_hgv_total,
          SUM(CASE WHEN petrol_lgv_total IS NULL THEN 1 ELSE 0 END) AS null_in_petrol_lgv_total,
          SUM(CASE WHEN lpg_lgv_total IS NULL THEN 1 ELSE 0 END) AS null_in_lpg_lgv_total,
          SUM(CASE WHEN hgv_motorways IS NULL THEN 1 ELSE 0 END) AS null_in_hgv_motorways,
          SUM(CASE WHEN personal_transport IS NULL THEN 1 ELSE 0 END) AS null_in_personal_transport
        FROM vehicle_dimension;

    """,
    # Returns a row only for local_authority_code values that occur more than
    # once, so an empty result means no duplicates.
    "duplicate_records_check : vehicle_dimension": """
        -- Check for duplicate records in the dimension
        SELECT 
            local_authority_code, 
            COUNT(*) AS duplicate_count
        FROM 
            vehicle_dimension
        GROUP BY 
            local_authority_code
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the vehicle columns between the source
    # table and the dimension; region_id is not part of the comparison.
       "accuracy_check : vehicle_dimension": """
        -- Check records from source not in target and vice versa
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, buses_total, petrol_cars_total, hgv_total, petrol_lgv_total, 
                   lpg_lgv_total, hgv_motorways, personal_transport
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, buses_total, petrol_cars_total, hgv_total, petrol_lgv_total, 
                   lpg_lgv_total, hgv_motorways, personal_transport
            FROM vehicle_dimension
        ) AS missing_records
    
        UNION ALL
    
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, buses_total, petrol_cars_total, hgv_total, petrol_lgv_total, 
                   lpg_lgv_total, hgv_motorways, personal_transport
            FROM vehicle_dimension
            EXCEPT
            SELECT local_authority_code, buses_total, petrol_cars_total, hgv_total, petrol_lgv_total, 
                   lpg_lgv_total, hgv_motorways, personal_transport
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """

}

def execute_query(query, output_file):
    """Run one SQL query and echo every result row to a file and to stdout.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file object; each row is written as
            ``str(row)`` followed by a newline.

    A new connection is opened from the module-level ``DB_CONFIG`` on every
    call and is always closed before the function returns.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # In psycopg2 `with conn` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                results = cur.fetchall()
    finally:
        conn.close()

    # Writing each row to the output file and printing it
    for row in results:
        line = str(row) + '\n'
        output_file.write(line)
        print(line.strip())

def main():
    """Run every check in ``queries`` and save the output to the results file.

    For each entry a "Running ..." line, the rows produced by
    ``execute_query`` and a "Completed ..." line are written to the results
    file; the two status lines are also printed. The file is overwritten on
    every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/vehicle_dimension_query_results.txt'

    # open(..., 'w') does not create missing folders, so make sure the
    # results directory exists first (same approach as the log directory).
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Calling the main function
if __name__ == "__main__":
    main()
    # Logged inside the guard so the entry is only written when the checks
    # were actually executed. The message previously misspelled "Vehicle".
    info_logger.info("Testing completed for Vehicle dimension")

Running record_completeness : vehicle_dimension...
('Source Unique Local Authority Codes', 292)
('Dimension Unique Records', 292)
Completed record_completeness : vehicle_dimension
Running null_checks : vehicle_dimension...


(0, 0, 0, 0, 0, 0, 0, 0, 0)
Completed null_checks : vehicle_dimension
Running duplicate_records_check : vehicle_dimension...


Completed duplicate_records_check : vehicle_dimension
Running accuracy_check : vehicle_dimension...
('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : vehicle_dimension


In [6]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Row count of district_dimension next to the number of distinct
    # (local_authority_code, date) pairs in the source table.
    "record_completeness : district_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM district_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
			COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data
    """,
    # One row with a NULL counter per checked column.
    "null_checks : district_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN district IS NULL THEN 1 ELSE 0 END) AS null_district,
          SUM(CASE WHEN town_city IS NULL THEN 1 ELSE 0 END) AS null_town_city,
          SUM(CASE WHEN county IS NULL THEN 1 ELSE 0 END) AS null_county
        FROM district_dimension;
    """,
    # Returns a row only for (local_authority_code, date) pairs that occur
    # more than once, so an empty result means no duplicates.
    "duplicate_records_check : district_dimension": """
        -- Check for duplicate records in the dimension
       SELECT 
            local_authority_code, 
            date,
            COUNT(*) AS duplicate_count
        FROM 
            district_dimension
        GROUP BY 
            local_authority_code, 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the five district columns between the
    # source table and the dimension; both counts are reported.
    "accuracy_check : district_dimension": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, district, town_city, county
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, district, town_city, county
            FROM district_dimension
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, district, town_city, county
            FROM district_dimension
            EXCEPT
            SELECT local_authority_code, date, district, town_city, county
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Lists rows still flagged is_current = TRUE whose date is not the latest
    # source date of their local_authority_code; an empty result is a pass.
    # NOTE(review): a NOT IN comparison that evaluates to NULL also yields no
    # row - confirm the compared source columns can never be NULL.
    "inactive_records_check : district_dimension": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM district_dimension
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
    
}

def execute_query(query, output_file):
    """Run one SQL query and echo every result row to a file and to stdout.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file object; each row is written as
            ``str(row)`` followed by a newline.

    A new connection is opened from the module-level ``DB_CONFIG`` on every
    call and is always closed before the function returns.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # In psycopg2 `with conn` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                results = cur.fetchall()
    finally:
        conn.close()

    # Writing each row to the output file and printing it
    for row in results:
        line = str(row) + '\n'
        output_file.write(line)
        print(line.strip())

def main():
    """Run every check in ``queries`` and save the output to the results file.

    For each entry a "Running ..." line, the rows produced by
    ``execute_query`` and a "Completed ..." line are written to the results
    file; the two status lines are also printed. The file is overwritten on
    every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/district_dimension_query_results.txt'

    # open(..., 'w') does not create missing folders, so make sure the
    # results directory exists first (same approach as the log directory).
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Calling the main function
if __name__ == "__main__":
    main()
    # Logged inside the guard so the entry is only written when the checks
    # were actually executed.
    info_logger.info("Testing completed for District dimension")

Running record_completeness : district_dimension...


('Total Records in Dimension', 3504)
('Records from Source Table', 3504)
Completed record_completeness : district_dimension
Running null_checks : district_dimension...
(0, 0, 0, 0, 0)
Completed null_checks : district_dimension
Running duplicate_records_check : district_dimension...


Completed duplicate_records_check : district_dimension
Running accuracy_check : district_dimension...
('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : district_dimension
Running inactive_records_check : district_dimension...


Completed inactive_records_check : district_dimension


In [7]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Row count of property_type_dimension next to the number of distinct
    # (local_authority_code, date) pairs in the source table.
    "record_completeness : property_type_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM property_type_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row with a NULL counter per checked column.
    "null_checks : property_type_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN property_type IS NULL THEN 1 ELSE 0 END) AS null_property_type,
          SUM(CASE WHEN duration IS NULL THEN 1 ELSE 0 END) AS null_duration,
          SUM(CASE WHEN detached_price IS NULL THEN 1 ELSE 0 END) AS null_detached_price,
          SUM(CASE WHEN semi_detached_price IS NULL THEN 1 ELSE 0 END) AS null_semi_detached_price,
          SUM(CASE WHEN terraced_price IS NULL THEN 1 ELSE 0 END) AS null_terraced_price,
          SUM(CASE WHEN flat_price IS NULL THEN 1 ELSE 0 END) AS null_flat_price
        FROM property_type_dimension;
    """,
    # Returns a row only for (local_authority_code, date, property_type,
    # duration) combinations that occur more than once.
    "duplicate_records_check : property_type_dimension": """
        -- Check for duplicate records in the dimension
       SELECT 
            local_authority_code, 
            date,
            property_type,
            duration,
            COUNT(*) AS duplicate_count
        FROM 
            property_type_dimension
        GROUP BY 
            local_authority_code, 
            date,
            property_type,
            duration
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the property columns between the source
    # table and the dimension; both counts are reported.
    "accuracy_check : property_type_dimension": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, property_type, duration, detached_price, semi_detached_price, terraced_price, flat_price
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, property_type, duration, detached_price, semi_detached_price, terraced_price, flat_price
            FROM property_type_dimension
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, property_type, duration, detached_price, semi_detached_price, terraced_price, flat_price
            FROM property_type_dimension
            EXCEPT
            SELECT local_authority_code, date, property_type, duration, detached_price, semi_detached_price, terraced_price, flat_price
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Lists rows still flagged is_current = TRUE whose date is not the latest
    # source date of their local_authority_code; an empty result is a pass.
    # The comma after `date` is required: without it SQL reads
    # `date is_current` as `date AS is_current` and the real flag column is
    # never selected.
    "inactive_records_check : property_type_dimension": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM property_type_dimension
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
}


def execute_query(query, output_file):
    """Run one SQL query and echo every result row to a file and to stdout.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file object; each row is written as
            ``str(row)`` followed by a newline.

    A new connection is opened from the module-level ``DB_CONFIG`` on every
    call and is always closed before the function returns.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # In psycopg2 `with conn` only commits or rolls back the transaction;
        # it does not close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                results = cur.fetchall()
    finally:
        conn.close()

    # Writing each row to the output file and printing it
    for row in results:
        line = str(row) + '\n'
        output_file.write(line)
        print(line.strip())

def main():
    """Run every check in ``queries`` and save the output to the results file.

    For each entry a "Running ..." line, the rows produced by
    ``execute_query`` and a "Completed ..." line are written to the results
    file; the two status lines are also printed. The file is overwritten on
    every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/property_type_dimension_query_results.txt'

    # open(..., 'w') does not create missing folders, so make sure the
    # results directory exists first (same approach as the log directory).
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Calling the main function
if __name__ == "__main__":
    main()
    # Logged inside the guard so the entry is only written when the checks
    # were actually executed.
    info_logger.info("Testing completed for Property type dimension")

Running record_completeness : property_type_dimension...
('Total Records in Dimension', 3504)
('Records from Source Table', 3504)
Completed record_completeness : property_type_dimension
Running null_checks : property_type_dimension...
(0, 0, 0, 0, 0, 0, 0, 0)
Completed null_checks : property_type_dimension
Running duplicate_records_check : property_type_dimension...


Completed duplicate_records_check : property_type_dimension
Running accuracy_check : property_type_dimension...


('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : property_type_dimension
Running inactive_records_check : property_type_dimension...
Completed inactive_records_check : property_type_dimension


In [8]:

# Database connection settings.
# Every value can be overridden through the standard libpq environment
# variables, so no secret has to be stored in the notebook.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # No default on purpose: the password must come from the environment.
    # When PGPASSWORD is unset the value is None; psycopg2 is expected to drop
    # None keyword arguments, letting libpq fall back to ~/.pgpass.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}

# Queries to execute
# Each key is a "<check> : <table>" label that main() prints and writes to the
# results file; each value is the SQL text passed to execute_query().
queries = {
    # Row count of education_employment_dimension next to the number of
    # distinct (local_authority_code, date) pairs in the source table.
    "record_completeness : education_employment_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM education_employment_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row with a NULL counter per checked column.
    "null_checks : education_employment_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN qualification_index_score IS NULL THEN 1 ELSE 0 END) AS null_qualification_index_score,
          SUM(CASE WHEN qualification_index_rank IS NULL THEN 1 ELSE 0 END) AS null_qualification_index_rank,
          SUM(CASE WHEN no_qualifications IS NULL THEN 1 ELSE 0 END) AS null_no_qualifications,
          SUM(CASE WHEN level_1_and_entry_level_qualifications IS NULL THEN 1 ELSE 0 END) AS null_level_1_and_entry_level_qualifications,
          SUM(CASE WHEN level_2_qualifications IS NULL THEN 1 ELSE 0 END) AS null_level_2_qualifications,
          SUM(CASE WHEN level_3_qualifications IS NULL THEN 1 ELSE 0 END) AS null_level_3_qualifications,
          SUM(CASE WHEN apprenticeship IS NULL THEN 1 ELSE 0 END) AS null_apprenticeship,
          SUM(CASE WHEN level_4_qualifications_and_above IS NULL THEN 1 ELSE 0 END) AS null_level_4_qualifications_and_above,
          SUM(CASE WHEN other_qualifications IS NULL THEN 1 ELSE 0 END) AS null_other_qualifications,
          SUM(CASE WHEN num_aged_16_plus_unemployed IS NULL THEN 1 ELSE 0 END) AS null_num_aged_16_plus_unemployed,
          SUM(CASE WHEN num_aged_16_plus_employed IS NULL THEN 1 ELSE 0 END) AS null_num_aged_16_plus_employed,
          SUM(CASE WHEN num_aged_16_plus_self_employed IS NULL THEN 1 ELSE 0 END) AS null_num_aged_16_plus_self_employed,
          SUM(CASE WHEN deprivation_average_score IS NULL THEN 1 ELSE 0 END) AS null_deprivation_average_score,
          SUM(CASE WHEN deprivation_employment_ratio IS NULL THEN 1 ELSE 0 END) AS null_deprivation_employment_ratio,
          SUM(CASE WHEN qualification_adjusted_employment_rate IS NULL THEN 1 ELSE 0 END) AS null_qualification_adjusted_employment_rate
        FROM education_employment_dimension;
    """,
    # Returns a row only for (local_authority_code, date) pairs that occur
    # more than once, so an empty result means no duplicates.
    "duplicate_records_check : education_employment_dimension": """
        -- Check for duplicate records in the dimension
       SELECT 
            local_authority_code, 
            date,
            COUNT(*) AS duplicate_count
        FROM 
            education_employment_dimension
        GROUP BY 
            local_authority_code, 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two-way EXCEPT comparison of the education/employment columns between
    # the source table and the dimension; both counts are reported.
    "accuracy_check : education_employment_dimension": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, qualification_index_score, qualification_index_rank, no_qualifications, level_1_and_entry_level_qualifications,
                   level_2_qualifications, level_3_qualifications, apprenticeship, level_4_qualifications_and_above, other_qualifications,
                   num_aged_16_plus_unemployed, num_aged_16_plus_employed, num_aged_16_plus_self_employed, deprivation_average_score, 
                   deprivation_employment_ratio, qualification_adjusted_employment_rate
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, qualification_index_score, qualification_index_rank, no_qualifications, level_1_and_entry_level_qualifications,
                   level_2_qualifications, level_3_qualifications, apprenticeship, level_4_qualifications_and_above, other_qualifications,
                   num_aged_16_plus_unemployed, num_aged_16_plus_employed, num_aged_16_plus_self_employed, deprivation_average_score, 
                   deprivation_employment_ratio, qualification_adjusted_employment_rate
            FROM education_employment_dimension
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, qualification_index_score, qualification_index_rank, no_qualifications, level_1_and_entry_level_qualifications,
                   level_2_qualifications, level_3_qualifications, apprenticeship, level_4_qualifications_and_above, other_qualifications,
                   num_aged_16_plus_unemployed, num_aged_16_plus_employed, num_aged_16_plus_self_employed, deprivation_average_score, 
                   deprivation_employment_ratio, qualification_adjusted_employment_rate
            FROM education_employment_dimension
            EXCEPT
            SELECT local_authority_code, date, qualification_index_score, qualification_index_rank, no_qualifications, level_1_and_entry_level_qualifications,
                   level_2_qualifications, level_3_qualifications, apprenticeship, level_4_qualifications_and_above, other_qualifications,
                   num_aged_16_plus_unemployed, num_aged_16_plus_employed, num_aged_16_plus_self_employed, deprivation_average_score, 
                   deprivation_employment_ratio, qualification_adjusted_employment_rate
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Lists rows still flagged is_current = TRUE whose date is not the latest
    # source date of their local_authority_code; an empty result is a pass.
    # NOTE(review): a NOT IN comparison that evaluates to NULL also yields no
    # row - confirm the compared source columns can never be NULL.
    "inactive_records_check : education_employment_dimension": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM education_employment_dimension
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
}


def execute_query(query, output_file):
    """Run one SQL query and echo each result row to a file and to stdout.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file-like object. Every row is written as
            ``str(row)`` followed by a newline.

    A fresh connection is opened from the module-level ``DB_CONFIG`` on each
    call and is always closed before returning, even if the query fails.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` only commits (or rolls back on error) the transaction;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)

                # Same text goes to the results file and to the cell output
                for row in cur.fetchall():
                    line = str(row) + '\n'
                    output_file.write(line)
                    print(line.strip())
    finally:
        conn.close()

def main():
    """Run every entry of the module-level ``queries`` dict and save the output.

    For each ``description -> query`` pair a "Running ..." line, the query
    rows (via ``execute_query``) and a "Completed ..." line are written to the
    results file and printed. The results file is overwritten on every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/education_employment_dimension_query_results.txt'

    # Make sure the results directory exists, mirroring what the logging setup
    # does for its own directory; otherwise open() fails on a fresh checkout.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Run all checks defined in `queries` when this code is executed directly
# (the guard is false only if the code is imported as a module).
if __name__ == "__main__":
    main()

# Runs after main() returns; an exception raised inside main() stops the cell
# before this entry is written to the info log.
info_logger.info("Testing completed for Education Employment dimension")

Running record_completeness : education_employment_dimension...
('Total Records in Dimension', 3504)
('Records from Source Table', 3504)
Completed record_completeness : education_employment_dimension
Running null_checks : education_employment_dimension...


(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Completed null_checks : education_employment_dimension
Running duplicate_records_check : education_employment_dimension...


Completed duplicate_records_check : education_employment_dimension
Running accuracy_check : education_employment_dimension...


('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : education_employment_dimension
Running inactive_records_check : education_employment_dimension...
Completed inactive_records_check : education_employment_dimension


In [9]:

# Database connection settings.
# Values come from the standard libpq environment variables so that no secret
# is stored in the notebook. The non-secret settings fall back to the local
# development defaults that used to be hardcoded here.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # Deliberately no literal fallback: export PGPASSWORD before starting
    # Jupyter instead of committing the password with the notebook.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}

# Validation queries for demographics_dimension, keyed by a
# "<check name> : <table>" label. main() runs them in insertion order and uses
# the key as the label in the "Running ..." / "Completed ..." messages.
queries = {
    # Two rows: row count of demographics_dimension, and the number of distinct
    # (local_authority_code, date) pairs in the source table. They should match.
    "record_completeness : demographics_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM demographics_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row holding a NULL count per listed column; all zeros means no NULLs.
    "null_checks : demographics_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN area_sq_km IS NULL THEN 1 ELSE 0 END) AS null_area_sq_km,
          SUM(CASE WHEN age_0_20 IS NULL THEN 1 ELSE 0 END) AS null_age_0_20,
          SUM(CASE WHEN age_20_40 IS NULL THEN 1 ELSE 0 END) AS null_age_20_40,
          SUM(CASE WHEN age_40_60 IS NULL THEN 1 ELSE 0 END) AS null_age_40_60,
          SUM(CASE WHEN age_60_plus IS NULL THEN 1 ELSE 0 END) AS null_age_60_plus,
          SUM(CASE WHEN female_population IS NULL THEN 1 ELSE 0 END) AS null_female_population,
          SUM(CASE WHEN all_ages IS NULL THEN 1 ELSE 0 END) AS null_all_ages,
          SUM(CASE WHEN male_population IS NULL THEN 1 ELSE 0 END) AS null_male_population,
          SUM(CASE WHEN est_num_households_with_child IS NULL THEN 1 ELSE 0 END) AS null_est_num_households_with_child,
          SUM(CASE WHEN age_dependency_ratio IS NULL THEN 1 ELSE 0 END) AS null_age_dependency_ratio
        FROM demographics_dimension;
    """,
    # Returns each (local_authority_code, date) pair that occurs more than once;
    # an empty result means the pair is unique in the dimension.
    "duplicate_records_check : demographics_dimension": """
        -- Check for duplicate records in the dimension
        SELECT 
            local_authority_code, 
            date,
            COUNT(*) AS duplicate_count
        FROM 
            demographics_dimension
        GROUP BY 
            local_authority_code, 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two rows: count of source rows absent from the dimension and count of
    # dimension rows absent from the source, compared over the listed columns
    # with EXCEPT in each direction.
    "accuracy_check : demographics_dimension": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, area_sq_km, age_0_20, age_20_40, age_40_60, age_60_plus,
                   female_population, all_ages, male_population, est_num_households_with_child, age_dependency_ratio
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, area_sq_km, age_0_20, age_20_40, age_40_60, age_60_plus,
                   female_population, all_ages, male_population, est_num_households_with_child, age_dependency_ratio
            FROM demographics_dimension
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, area_sq_km, age_0_20, age_20_40, age_40_60, age_60_plus,
                   female_population, all_ages, male_population, est_num_households_with_child, age_dependency_ratio
            FROM demographics_dimension
            EXCEPT
            SELECT local_authority_code, date, area_sq_km, age_0_20, age_20_40, age_40_60, age_60_plus,
                   female_population, all_ages, male_population, est_num_households_with_child, age_dependency_ratio
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Returns dimension rows still flagged is_current = TRUE whose
    # (local_authority_code, date) is not the latest source date for that
    # authority; an empty result is the passing outcome.
    # NOTE(review): NOT IN can silently return no rows if the subquery yields
    # NULLs — confirm the source key columns are never NULL.
    "inactive_records_check : demographics_dimension": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM demographics_dimension
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
}

def execute_query(query, output_file):
    """Run one SQL query and echo each result row to a file and to stdout.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file-like object. Every row is written as
            ``str(row)`` followed by a newline.

    A fresh connection is opened from the module-level ``DB_CONFIG`` on each
    call and is always closed before returning, even if the query fails.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` only commits (or rolls back on error) the transaction;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)

                # Same text goes to the results file and to the cell output
                for row in cur.fetchall():
                    line = str(row) + '\n'
                    output_file.write(line)
                    print(line.strip())
    finally:
        conn.close()

def main():
    """Run every entry of the module-level ``queries`` dict and save the output.

    For each ``description -> query`` pair a "Running ..." line, the query
    rows (via ``execute_query``) and a "Completed ..." line are written to the
    results file and printed. The results file is overwritten on every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/demographics_dimension_query_results.txt'

    # Make sure the results directory exists, mirroring what the logging setup
    # does for its own directory; otherwise open() fails on a fresh checkout.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Run all checks defined in `queries` when this code is executed directly
# (the guard is false only if the code is imported as a module).
if __name__ == "__main__":
    main()

# Runs after main() returns; an exception raised inside main() stops the cell
# before this entry is written to the info log.
info_logger.info("Testing completed for Demographics dimension")

Running record_completeness : demographics_dimension...
('Total Records in Dimension', 3504)
('Records from Source Table', 3504)
Completed record_completeness : demographics_dimension
Running null_checks : demographics_dimension...


(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Completed null_checks : demographics_dimension
Running duplicate_records_check : demographics_dimension...


Completed duplicate_records_check : demographics_dimension
Running accuracy_check : demographics_dimension...
('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : demographics_dimension
Running inactive_records_check : demographics_dimension...
Completed inactive_records_check : demographics_dimension


In [10]:

# Database connection settings.
# Values come from the standard libpq environment variables so that no secret
# is stored in the notebook. The non-secret settings fall back to the local
# development defaults that used to be hardcoded here.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # Deliberately no literal fallback: export PGPASSWORD before starting
    # Jupyter instead of committing the password with the notebook.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}

# Validation queries for rental_dimension, keyed by a
# "<check name> : <table>" label. main() runs them in insertion order and uses
# the key as the label in the "Running ..." / "Completed ..." messages.
queries = {
    # Two rows: row count of rental_dimension, and the number of distinct
    # (local_authority_code, date) pairs in the source table. They should match.
    "record_completeness : rental_dimension": """
        -- Verify the number of records loaded into the dimension
        SELECT 
            'Total Records in Dimension' AS description,
            COUNT(*) AS count
        FROM rental_dimension
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row holding a NULL count per listed column; all zeros means no NULLs.
    "null_checks : rental_dimension": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN rental_price IS NULL THEN 1 ELSE 0 END) AS null_rental_price,
          SUM(CASE WHEN one_bedroom_rent IS NULL THEN 1 ELSE 0 END) AS null_one_bedroom_rent,
          SUM(CASE WHEN two_bedrooms_rent IS NULL THEN 1 ELSE 0 END) AS null_two_bedrooms_rent,
          SUM(CASE WHEN three_bedrooms_rent IS NULL THEN 1 ELSE 0 END) AS null_three_bedrooms_rent,
          SUM(CASE WHEN four_or_more_bedrooms_rent IS NULL THEN 1 ELSE 0 END) AS null_four_or_more_bedrooms_rent,
          SUM(CASE WHEN all_categories_rent IS NULL THEN 1 ELSE 0 END) AS null_all_categories_rent
        FROM rental_dimension;
    """,
    # Returns each (local_authority_code, date) pair that occurs more than once;
    # an empty result means the pair is unique in the dimension.
    "duplicate_records_check : rental_dimension": """
        -- Check for duplicate records in the dimension
        SELECT 
            local_authority_code, 
            date,
            COUNT(*) AS duplicate_count
        FROM 
            rental_dimension
        GROUP BY 
            local_authority_code, 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two rows: count of source rows absent from the dimension and count of
    # dimension rows absent from the source, compared over the listed columns
    # with EXCEPT in each direction.
    "accuracy_check : rental_dimension": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, rental_price, one_bedroom_rent, two_bedrooms_rent, three_bedrooms_rent, 
                   four_or_more_bedrooms_rent, all_categories_rent
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, rental_price, one_bedroom_rent, two_bedrooms_rent, three_bedrooms_rent, 
                   four_or_more_bedrooms_rent, all_categories_rent
            FROM rental_dimension
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, rental_price, one_bedroom_rent, two_bedrooms_rent, three_bedrooms_rent, 
                   four_or_more_bedrooms_rent, all_categories_rent
            FROM rental_dimension
            EXCEPT
            SELECT local_authority_code, date, rental_price, one_bedroom_rent, two_bedrooms_rent, three_bedrooms_rent, 
                   four_or_more_bedrooms_rent, all_categories_rent
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Returns dimension rows still flagged is_current = TRUE whose
    # (local_authority_code, date) is not the latest source date for that
    # authority; an empty result is the passing outcome.
    # NOTE(review): NOT IN can silently return no rows if the subquery yields
    # NULLs — confirm the source key columns are never NULL.
    "inactive_records_check : rental_dimension": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM rental_dimension
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
}

def execute_query(query, output_file):
    """Run one SQL query and echo each result row to a file and to stdout.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file-like object. Every row is written as
            ``str(row)`` followed by a newline.

    A fresh connection is opened from the module-level ``DB_CONFIG`` on each
    call and is always closed before returning, even if the query fails.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` only commits (or rolls back on error) the transaction;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)

                # Same text goes to the results file and to the cell output
                for row in cur.fetchall():
                    line = str(row) + '\n'
                    output_file.write(line)
                    print(line.strip())
    finally:
        conn.close()

def main():
    """Run every entry of the module-level ``queries`` dict and save the output.

    For each ``description -> query`` pair a "Running ..." line, the query
    rows (via ``execute_query``) and a "Completed ..." line are written to the
    results file and printed. The results file is overwritten on every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/rental_dimension_query_results.txt'

    # Make sure the results directory exists, mirroring what the logging setup
    # does for its own directory; otherwise open() fails on a fresh checkout.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Run all checks defined in `queries` when this code is executed directly
# (the guard is false only if the code is imported as a module).
if __name__ == "__main__":
    main()

# Runs after main() returns; an exception raised inside main() stops the cell
# before this entry is written to the info log.
info_logger.info("Testing completed for Rental dimension")

Running record_completeness : rental_dimension...
('Total Records in Dimension', 3504)
('Records from Source Table', 3504)
Completed record_completeness : rental_dimension
Running null_checks : rental_dimension...


(0, 0, 0, 0, 0, 0, 0, 0)


Completed null_checks : rental_dimension
Running duplicate_records_check : rental_dimension...
Completed duplicate_records_check : rental_dimension
Running accuracy_check : rental_dimension...


('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : rental_dimension
Running inactive_records_check : rental_dimension...
Completed inactive_records_check : rental_dimension


In [11]:

# Database connection settings.
# Values come from the standard libpq environment variables so that no secret
# is stored in the notebook. The non-secret settings fall back to the local
# development defaults that used to be hardcoded here.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'UK Real Estate DB'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # Deliberately no literal fallback: export PGPASSWORD before starting
    # Jupyter instead of committing the password with the notebook.
    'password': os.environ.get('PGPASSWORD'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}

# Validation queries for sales_transactions_fact, keyed by a
# "<check name> : <table>" label. main() runs them in insertion order and uses
# the key as the label in the "Running ..." / "Completed ..." messages.
queries = {
    # Two rows: row count of sales_transactions_fact, and the number of distinct
    # (local_authority_code, date) pairs in the source table. They should match.
    "record_completeness : sales_transactions_fact": """
        -- Verify the number of records loaded into the fact table
        SELECT 
            'Total Records in Fact Table' AS description,
            COUNT(*) AS count
        FROM sales_transactions_fact
        UNION ALL
        SELECT 
            'Records from Source Table' AS description,
            COUNT(DISTINCT (local_authority_code, date)) AS count
        FROM initial_load_etl_source_data;
    """,
    # One row holding a NULL count per listed column (the *_id / date_key
    # columns first, then the measures); all zeros means no NULLs.
    "null_checks : sales_transactions_fact": """
        -- Check for NULL values in essential fields
        SELECT
          SUM(CASE WHEN local_authority_code IS NULL THEN 1 ELSE 0 END) AS null_local_authority_code,
          SUM(CASE WHEN date IS NULL THEN 1 ELSE 0 END) AS null_date,
          SUM(CASE WHEN district_id IS NULL THEN 1 ELSE 0 END) AS null_district_id,
          SUM(CASE WHEN region_id IS NULL THEN 1 ELSE 0 END) AS null_region_id,
          SUM(CASE WHEN property_type_id IS NULL THEN 1 ELSE 0 END) AS null_property_type_id,
          SUM(CASE WHEN vehicle_id IS NULL THEN 1 ELSE 0 END) AS null_vehicle_id,
          SUM(CASE WHEN rental_id IS NULL THEN 1 ELSE 0 END) AS null_rental_id,
          SUM(CASE WHEN demographics_id IS NULL THEN 1 ELSE 0 END) AS null_demographics_id,
          SUM(CASE WHEN education_employment_id IS NULL THEN 1 ELSE 0 END) AS null_education_employment_id,
          SUM(CASE WHEN date_key IS NULL THEN 1 ELSE 0 END) AS null_date_key,
          SUM(CASE WHEN price IS NULL THEN 1 ELSE 0 END) AS null_price,
          SUM(CASE WHEN average_price IS NULL THEN 1 ELSE 0 END) AS null_average_price,
          SUM(CASE WHEN predicted_price_unscaled IS NULL THEN 1 ELSE 0 END) AS null_predicted_price_unscaled,
          SUM(CASE WHEN index IS NULL THEN 1 ELSE 0 END) AS null_index,
          SUM(CASE WHEN average_price_pct_change IS NULL THEN 1 ELSE 0 END) AS null_average_price_pct_change,
          SUM(CASE WHEN annual_change_percent IS NULL THEN 1 ELSE 0 END) AS null_annual_change_percent,
          SUM(CASE WHEN new_price IS NULL THEN 1 ELSE 0 END) AS null_new_price,
          SUM(CASE WHEN old_price IS NULL THEN 1 ELSE 0 END) AS null_old_price,
          SUM(CASE WHEN sales_volume IS NULL THEN 1 ELSE 0 END) AS null_sales_volume,
          SUM(CASE WHEN sales_volume_log IS NULL THEN 1 ELSE 0 END) AS null_sales_volume_log,
          SUM(CASE WHEN old_sales_volume IS NULL THEN 1 ELSE 0 END) AS null_old_sales_volume,
          SUM(CASE WHEN detached_flat_ratio IS NULL THEN 1 ELSE 0 END) AS null_detached_flat_ratio,
          SUM(CASE WHEN detached_terraced_ratio IS NULL THEN 1 ELSE 0 END) AS null_detached_terraced_ratio,
          SUM(CASE WHEN semi_detached_price_pct_change IS NULL THEN 1 ELSE 0 END) AS null_semi_detached_price_pct_change,
          SUM(CASE WHEN detached_semi_detached_ratio IS NULL THEN 1 ELSE 0 END) AS null_detached_semi_detached_ratio,
          SUM(CASE WHEN detached_price_log IS NULL THEN 1 ELSE 0 END) AS null_detached_price_log,
          SUM(CASE WHEN semi_detached_price_log IS NULL THEN 1 ELSE 0 END) AS null_semi_detached_price_log,
          SUM(CASE WHEN flat_price_log IS NULL THEN 1 ELSE 0 END) AS null_flat_price_log,
          SUM(CASE WHEN terraced_price_pct_change IS NULL THEN 1 ELSE 0 END) AS null_terraced_price_pct_change,
          SUM(CASE WHEN terraced_price_log IS NULL THEN 1 ELSE 0 END) AS null_terraced_price_log,
          SUM(CASE WHEN gdhi IS NULL THEN 1 ELSE 0 END) AS null_gdhi,
          SUM(CASE WHEN deprivation_adjusted_gdhi IS NULL THEN 1 ELSE 0 END) AS null_deprivation_adjusted_gdhi,
          SUM(CASE WHEN gdhi_per_capita IS NULL THEN 1 ELSE 0 END) AS null_gdhi_per_capita,
          SUM(CASE WHEN foo_price IS NULL THEN 1 ELSE 0 END) AS null_foo_price,
          SUM(CASE WHEN cash_price IS NULL THEN 1 ELSE 0 END) AS null_cash_price,
          SUM(CASE WHEN mortgage_price IS NULL THEN 1 ELSE 0 END) AS null_mortgage_price,
          SUM(CASE WHEN housing_demand_indicator IS NULL THEN 1 ELSE 0 END) AS null_housing_demand_indicator,
          SUM(CASE WHEN deprivation_reduction_potential IS NULL THEN 1 ELSE 0 END) AS null_deprivation_reduction_potential,
          SUM(CASE WHEN flat_price_pct_change IS NULL THEN 1 ELSE 0 END) AS null_flat_price_pct_change,
          SUM(CASE WHEN detached_price_pct_change IS NULL THEN 1 ELSE 0 END) AS null_detached_price_pct_change,
          SUM(CASE WHEN average_price_log IS NULL THEN 1 ELSE 0 END) AS null_average_price_log,
          SUM(CASE WHEN ftb_price IS NULL THEN 1 ELSE 0 END) AS null_ftb_price
        FROM sales_transactions_fact;
    """,
    # Returns each (local_authority_code, date) pair that occurs more than once;
    # an empty result means the pair is unique in the fact table.
    "duplicate_records_check : sales_transactions_fact": """
        -- Check for duplicate records in the fact
        SELECT 
            local_authority_code, 
            date,
            COUNT(*) AS duplicate_count
        FROM 
            sales_transactions_fact
        GROUP BY 
            local_authority_code, 
            date
        HAVING 
            COUNT(*) > 1;
    """,
    # Two rows: count of source rows absent from the fact table and count of
    # fact rows absent from the source, compared over the listed measure
    # columns with EXCEPT in each direction (the *_id columns are not compared).
    "accuracy_check : sales_transactions_fact": """
        -- Check records from source not in target
        SELECT 'Records in source not in target' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, price, average_price, predicted_price_unscaled, index, average_price_pct_change,
                   annual_change_percent, new_price, old_price, sales_volume, sales_volume_log, old_sales_volume, detached_flat_ratio,
                   detached_terraced_ratio, semi_detached_price_pct_change, detached_semi_detached_ratio, detached_price_log, 
                   semi_detached_price_log, flat_price_log, terraced_price_pct_change, terraced_price_log, gdhi, deprivation_adjusted_gdhi,
                   gdhi_per_capita, foo_price, cash_price, mortgage_price, housing_demand_indicator, deprivation_reduction_potential, 
                   flat_price_pct_change, detached_price_pct_change, average_price_log, ftb_price
            FROM initial_load_etl_source_data
            EXCEPT
            SELECT local_authority_code, date, price, average_price, predicted_price_unscaled, index, average_price_pct_change,
                   annual_change_percent, new_price, old_price, sales_volume, sales_volume_log, old_sales_volume, detached_flat_ratio,
                   detached_terraced_ratio, semi_detached_price_pct_change, detached_semi_detached_ratio, detached_price_log, 
                   semi_detached_price_log, flat_price_log, terraced_price_pct_change, terraced_price_log, gdhi, deprivation_adjusted_gdhi,
                   gdhi_per_capita, foo_price, cash_price, mortgage_price, housing_demand_indicator, deprivation_reduction_potential, 
                   flat_price_pct_change, detached_price_pct_change, average_price_log, ftb_price
            FROM sales_transactions_fact
        ) AS missing_records

        UNION ALL

        -- Check records in target not in source
        SELECT 'Records in target not in source' AS description, COUNT(*) AS count
        FROM (
            SELECT local_authority_code, date, price, average_price, predicted_price_unscaled, index, average_price_pct_change,
                   annual_change_percent, new_price, old_price, sales_volume, sales_volume_log, old_sales_volume, detached_flat_ratio,
                   detached_terraced_ratio, semi_detached_price_pct_change, detached_semi_detached_ratio, detached_price_log, 
                   semi_detached_price_log, flat_price_log, terraced_price_pct_change, terraced_price_log, gdhi, deprivation_adjusted_gdhi,
                   gdhi_per_capita, foo_price, cash_price, mortgage_price, housing_demand_indicator, deprivation_reduction_potential, 
                   flat_price_pct_change, detached_price_pct_change, average_price_log, ftb_price
            FROM sales_transactions_fact
            EXCEPT
            SELECT local_authority_code, date, price, average_price, predicted_price_unscaled, index, average_price_pct_change,
                   annual_change_percent, new_price, old_price, sales_volume, sales_volume_log, old_sales_volume, detached_flat_ratio,
                   detached_terraced_ratio, semi_detached_price_pct_change, detached_semi_detached_ratio, detached_price_log, 
                   semi_detached_price_log, flat_price_log, terraced_price_pct_change, terraced_price_log, gdhi, deprivation_adjusted_gdhi,
                   gdhi_per_capita, foo_price, cash_price, mortgage_price, housing_demand_indicator, deprivation_reduction_potential, 
                   flat_price_pct_change, detached_price_pct_change, average_price_log, ftb_price
            FROM initial_load_etl_source_data
        ) AS extra_records;
    """,
    # Returns fact rows still flagged is_current = TRUE whose
    # (local_authority_code, date) is not the latest source date for that
    # authority; an empty result is the passing outcome.
    # NOTE(review): NOT IN can silently return no rows if the subquery yields
    # NULLs — confirm the source key columns are never NULL.
    "inactive_records_check : sales_transactions_fact": """
        -- Check that all older records are inactive
        SELECT 
            local_authority_code,
            date,
            is_current
        FROM sales_transactions_fact
        WHERE (local_authority_code, date) NOT IN (
            SELECT 
                local_authority_code,
                MAX(date)
            FROM initial_load_etl_source_data
            GROUP BY local_authority_code
        ) AND is_current = TRUE;
    """
}

def execute_query(query, output_file):
    """Run one SQL query and echo each result row to a file and to stdout.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``. It must
            produce a result set, because ``fetchall`` is called afterwards.
        output_file: Writable text file-like object. Every row is written as
            ``str(row)`` followed by a newline.

    A fresh connection is opened from the module-level ``DB_CONFIG`` on each
    call and is always closed before returning, even if the query fails.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # `with conn` only commits (or rolls back on error) the transaction;
        # it does NOT close the connection, hence the explicit close below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)

                # Same text goes to the results file and to the cell output
                for row in cur.fetchall():
                    line = str(row) + '\n'
                    output_file.write(line)
                    print(line.strip())
    finally:
        conn.close()

def main():
    """Run every entry of the module-level ``queries`` dict and save the output.

    For each ``description -> query`` pair a "Running ..." line, the query
    rows (via ``execute_query``) and a "Completed ..." line are written to the
    results file and printed. The results file is overwritten on every run.
    """
    # Defining the path for saving query results
    results_path = '../Results/sales_fact_query_results.txt'

    # Make sure the results directory exists, mirroring what the logging setup
    # does for its own directory; otherwise open() fails on a fresh checkout.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Opening the output file in write mode
    with open(results_path, 'w') as output_file:

        # Iterating through each query and running it
        for description, query in queries.items():

            # Writing and printing the message for the running query
            message = f"Running {description}...\n"
            output_file.write(message)
            print(message.strip())

            # Executing the SQL query
            execute_query(query, output_file)

            # Writing and printing the completion message after the query execution
            completed_message = f"Completed {description}\n\n"
            output_file.write(completed_message)
            print(completed_message.strip())

# Run all checks defined in `queries` when this code is executed directly
# (the guard is false only if the code is imported as a module).
if __name__ == "__main__":
    main()

# Runs after main() returns; an exception raised inside main() stops the cell
# before this entry is written to the info log.
info_logger.info("Testing completed for Sales Transaction Fact")

Running record_completeness : sales_transactions_fact...


('Total Records in Fact Table', 3504)
('Records from Source Table', 3504)
Completed record_completeness : sales_transactions_fact
Running null_checks : sales_transactions_fact...
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Completed null_checks : sales_transactions_fact
Running duplicate_records_check : sales_transactions_fact...
Completed duplicate_records_check : sales_transactions_fact
Running accuracy_check : sales_transactions_fact...


('Records in source not in target', 0)
('Records in target not in source', 0)
Completed accuracy_check : sales_transactions_fact
Running inactive_records_check : sales_transactions_fact...


Completed inactive_records_check : sales_transactions_fact
