In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Internal Consistency Checks

## Population

In [2]:
def find_files(url, variable, estimates_version):
    """List the files in a folder whose names mention a variable and a version.

    Parameters
    ----------
    url : str
        Folder to scan. Backslashes are converted to forward slashes before
        the folder is listed.
    variable : str
        Substring that must appear in the file name. The value 'age' is
        searched for as 'age_i'.
    estimates_version : str
        Second substring that must also appear in the file name.

    Returns
    -------
    list of str
        One ``os.path.join(folder, filename)`` path per matching entry, in
        the order ``os.listdir`` reports them.
    """
    folder = url.replace("\\", "/")

    # 'age' alone is looked up under the longer token 'age_i'.
    token = 'age_i' if variable == 'age' else variable

    return [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if token in entry and estimates_version in entry
    ]

In [3]:
def internal_consistency_population(df):
    """Check that the population totals in ``df`` equal the sum of their parts.

    Two row-wise identities are tested:

    * 'Total Population' == college GQ + military GQ + other GQ
      + 'Household Population'
    * 'Total GQ Population' == college GQ + military GQ + other GQ

    Side effect: the recomputed sums are stored on the passed-in frame as the
    columns 'tot_pop_calc' and 'tot_gq_pop_calc'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Group Quarters - College', 'Group Quarters - Military',
        'Group Quarters - Other', 'Household Population', 'Total Population'
        and 'Total GQ Population' columns.

    Returns
    -------
    str
        'Dataframe is internally consistent' when every row satisfies both
        identities, otherwise 'Dataframe is not internally consistent'.
    """
    college = df['Group Quarters - College']
    military = df['Group Quarters - Military']
    other = df['Group Quarters - Other']

    # Total population: the three group-quarters parts plus household population.
    df['tot_pop_calc'] = college + military + other + df['Household Population']
    total_ok = bool((df['tot_pop_calc'] == df['Total Population']).all())

    # Group-quarters total: the three group-quarters parts only.
    df['tot_gq_pop_calc'] = college + military + other
    gq_ok = bool((df['tot_gq_pop_calc'] == df['Total GQ Population']).all())

    if total_ok and gq_ok:
        return 'Dataframe is internally consistent'
    return 'Dataframe is not internally consistent'


In [27]:
def population_folder_internal_consistency(estimates_version):
    """Run the population internal-consistency check on each population file of a version.

    Files are located with ``find_files`` in the hard-coded QA/QC folder for
    ``estimates_version`` (variable 'population'), each one is loaded with
    ``pd.read_excel`` and passed to ``internal_consistency_population``.
    One line per file is printed in the form ``<geo_level>: <result>``.

    Parameters
    ----------
    estimates_version : str
        Version label, e.g. '2022_03'. It selects the sub-folder and must also
        appear in the file names.

    Returns
    -------
    None
    """
    list_of_files = find_files(url=rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}', variable='population', estimates_version=estimates_version)
    for file in list_of_files:
        df = pd.read_excel(file)
        # Take the geography label (the text before "_est") from the file name.
        # os.path.basename is used instead of splitting on a backslash so the
        # lookup does not depend on how many backslashes the path contains.
        geo_level = os.path.basename(file).split("_est")[0]
        print(f"{geo_level}: {internal_consistency_population(df)}")

In [28]:
# Run the population internal-consistency check for the '2022_03' estimates version.
population_folder_internal_consistency(estimates_version='2022_03')

census_tract: Dataframe is internally consistent
cpa: Dataframe is internally consistent
jurisdiction: Dataframe is internally consistent
luz: Dataframe is internally consistent
mgra: Dataframe is internally consistent
region: Dataframe is internally consistent
sra: Dataframe is internally consistent


# NaN Check

In [29]:
def check_missing_values(df):
    """Report whether ``df`` holds at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    str
        'There are NaNs' if any cell is null, otherwise 'There are no NaNs'.
    """
    null_mask = df.isna()
    found_missing = null_mask.values.any()
    return 'There are NaNs' if found_missing else 'There are no NaNs'

In [35]:
def population_folder_internal_consistency(estimates_version, variable):
    """Run the NaN check on each file of a given variable and estimates version.

    Files are located with ``find_files`` in the hard-coded QA/QC folder for
    ``estimates_version``, each one is loaded with ``pd.read_excel`` and passed
    to ``check_missing_values``. One line per file is printed in the form
    ``<geo_level>-<variable>: <result>``.

    NOTE(review): this definition reuses the name of the population
    internal-consistency runner defined earlier in the notebook and replaces
    it once this cell runs, although it performs a missing-value check.
    Consider renaming it together with its call sites.

    Parameters
    ----------
    estimates_version : str
        Version label, e.g. '2022_03'. It selects the sub-folder and must also
        appear in the file names.
    variable : str
        Substring of the file names to check, e.g. 'population'.

    Returns
    -------
    None
    """
    list_of_files = find_files(url=rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}', variable=variable, estimates_version=estimates_version)
    for file in list_of_files:
        df = pd.read_excel(file)
        # Take the geography label (the text before "_est") from the file name.
        # os.path.basename is used instead of splitting on a backslash so the
        # lookup does not depend on how many backslashes the path contains.
        geo_level = os.path.basename(file).split("_est")[0]
        print(f"{geo_level}-{variable}: {check_missing_values(df)}")

In [36]:
# Run the NaN check on the 'population' files of the '2022_03' estimates version.
population_folder_internal_consistency(estimates_version='2022_03', variable='population')

census_tract-population: There are no NaNs
cpa-population: There are no NaNs
jurisdiction-population: There are no NaNs
luz-population: There are no NaNs
mgra-population: There are no NaNs
region-population: There are no NaNs
sra-population: There are no NaNs
