In [3]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Population Internal Consistency

In [2]:
def find_files(url, variable, estimates_version):
    """Return the paths of all files in a folder that belong to one variable.

    A file is selected when its name contains both the (possibly aliased)
    variable token and ``estimates_version``.

    Parameters
    ----------
    url : str
        Folder to search. Backslashes are converted to forward slashes
        before the folder is listed.
    variable : str
        Variable name to look for, e.g. ``'population'``.
    estimates_version : str
        Version tag that must also appear in the file name, e.g. ``'2022_03'``.

    Returns
    -------
    list of str
        Matching paths (folder joined with file name), sorted by file name so
        the result does not depend on the directory listing order.
    """
    path = url.replace("\\", "/")

    # 'age', 'sex' and 'ethnicity' are all substrings of the combined
    # 'age_sex_ethnicity' exports, so a plain substring match would also pick
    # those up. These are the same, more specific tokens find_file() uses.
    aliases = {
        'age': 'age_i',
        'sex': 'sex_i',
        'ethnicity': f'{estimates_version}_ethnicity',
    }
    token = aliases.get(variable, variable)

    return [
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if token in filename and estimates_version in filename
    ]

In [3]:
def internal_consistency_population(df):
    """Check that the population columns of ``df`` add up on every row.

    Two identities are verified:

    * ``Group Quarters - College`` + ``Group Quarters - Military`` +
      ``Group Quarters - Other`` + ``Household Population``
      equals ``Total Population``
    * ``Group Quarters - College`` + ``Group Quarters - Military`` +
      ``Group Quarters - Other`` equals ``Total GQ Population``

    The input frame is left untouched (no helper columns are added to it).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six columns named above.

    Returns
    -------
    str
        ``'Dataframe is internally consistent'`` when both identities hold on
        every row, otherwise ``'Dataframe is not internally consistent'``.
        A row holding NaN in any of the compared columns counts as a mismatch
        because NaN never compares equal.
    """
    gq_total = (
        df['Group Quarters - College']
        + df['Group Quarters - Military']
        + df['Group Quarters - Other']
    )
    total_population = gq_total + df['Household Population']

    population_matches = (total_population == df['Total Population']).all()
    gq_matches = (gq_total == df['Total GQ Population']).all()

    if population_matches and gq_matches:
        return 'Dataframe is internally consistent'
    return 'Dataframe is not internally consistent'


In [2]:
def population_folder_internal_consistency(
    estimates_version,
    base_dir=r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files',
):
    """Run the population consistency check on every population export of a version.

    For each file returned by ``find_files`` for the ``'population'`` variable,
    the workbook is read with ``pd.read_excel`` and the verdict of
    ``internal_consistency_population`` is printed, prefixed by the
    geography level taken from the file name.

    Parameters
    ----------
    estimates_version : str
        Version tag, e.g. ``'2022_03'``. It names the sub-folder of
        ``base_dir`` to scan and must appear in the file names.
    base_dir : str, optional
        Folder that contains one sub-folder per estimates version. Defaults
        to the shared-drive location this notebook was written against.

    Returns
    -------
    None
        Results are printed, one line per file.
    """
    folder = rf'{base_dir}\{estimates_version}'
    list_of_files = find_files(url=folder, variable='population', estimates_version=estimates_version)
    for file in list_of_files:
        df = pd.read_excel(file)
        # The geography level is the part of the file name before "_est".
        # basename() is used rather than splitting on "\\" so this does not
        # depend on which separator os.path.join inserted.
        geo_level = os.path.basename(file).split("_est")[0]
        print(f"{geo_level}: {internal_consistency_population(df)}")

In [28]:
# Check every population export of the 2022_03 estimates for internal consistency.
population_folder_internal_consistency(
    estimates_version='2022_03',
)

census_tract: Dataframe is internally consistent
cpa: Dataframe is internally consistent
jurisdiction: Dataframe is internally consistent
luz: Dataframe is internally consistent
mgra: Dataframe is internally consistent
region: Dataframe is internally consistent
sra: Dataframe is internally consistent


# NaN Check

In [29]:
def check_missing_values(df):
    """Report whether ``df`` holds at least one null cell.

    Returns ``'There are NaNs'`` if any cell is null, otherwise
    ``'There are no NaNs'``.
    """
    has_missing = df.isnull().values.any()
    return 'There are NaNs' if has_missing else 'There are no NaNs'

In [35]:
def population_folder_internal_consistency(
    estimates_version,
    variable,
    base_dir=r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files',
):
    """Run the NaN check on every export of one variable for a version.

    For each file returned by ``find_files`` the workbook is read with
    ``pd.read_excel`` and the verdict of ``check_missing_values`` is printed,
    prefixed by ``<geo_level>-<variable>``.

    NOTE(review): this definition reuses the name of the population
    consistency runner defined earlier in the notebook, so once this cell has
    run the earlier function is no longer reachable. Consider giving this one
    its own name (and updating the call that follows it).

    Parameters
    ----------
    estimates_version : str
        Version tag, e.g. ``'2022_03'``. It names the sub-folder of
        ``base_dir`` to scan and must appear in the file names.
    variable : str
        Variable whose exports are checked, e.g. ``'population'``.
    base_dir : str, optional
        Folder that contains one sub-folder per estimates version. Defaults
        to the shared-drive location this notebook was written against.

    Returns
    -------
    None
        Results are printed, one line per file.
    """
    folder = rf'{base_dir}\{estimates_version}'
    list_of_files = find_files(url=folder, variable=variable, estimates_version=estimates_version)
    for file in list_of_files:
        df = pd.read_excel(file)
        # The geography level is the part of the file name before "_est".
        # basename() is used rather than splitting on "\\" so this does not
        # depend on which separator os.path.join inserted.
        geo_level = os.path.basename(file).split("_est")[0]
        print(f"{geo_level}-{variable}: {check_missing_values(df)}")

In [36]:
# NaN check over every population export of the 2022_03 estimates.
population_folder_internal_consistency(
    estimates_version='2022_03',
    variable='population',
)

census_tract-population: There are no NaNs
cpa-population: There are no NaNs
jurisdiction-population: There are no NaNs
luz-population: There are no NaNs
mgra-population: There are no NaNs
region-population: There are no NaNs
sra-population: There are no NaNs


# Across Geography Internal Consistency

In [45]:
def check_dataframe_sums(*dataframes):
    """Compare the column totals of several dataframes against the first one.

    The first dataframe is the reference. Every later dataframe must have the
    same set of column names and, column by column, the same sum. Sums are
    compared with ``Series.equals``, i.e. an exact comparison.

    Returns
    -------
    str
        * ``"Dataframe <i> has different column names"`` for the first frame
          (1-based position after the reference) whose columns differ; both
          column indexes are printed first.
        * ``"Dataframe <i> has different sums"`` for the first frame whose
          totals differ.
        * ``"All dataframes have the same column sums"`` otherwise.
    """
    reference = dataframes[0]
    expected_columns = reference.columns
    expected_totals = reference[expected_columns].sum()

    position = 0
    for candidate in dataframes[1:]:
        position += 1

        # Column sets must agree before totals can be lined up.
        if set(candidate.columns) != set(expected_columns):
            print(expected_columns)
            print(candidate.columns)
            return f"Dataframe {position} has different column names"

        # Select in the reference's column order so the two Series align.
        candidate_totals = candidate[expected_columns].sum()
        if expected_totals.equals(candidate_totals):
            continue
        return f"Dataframe {position} has different sums"

    return "All dataframes have the same column sums"

In [49]:
def find_file(url, geo_level, variable, estimates_version):
    """List the files in a folder that match a geography, variable and version.

    A file matches when its name contains ``geo_level``, the (possibly
    aliased) variable token and ``estimates_version``.

    Parameters
    ----------
    url : str
        Folder to search; backslashes are replaced by forward slashes first.
    geo_level : str
        Geography level that must appear in the file name, e.g. ``'mgra'``.
    variable : str
        Variable name. ``'age'``, ``'sex'`` and ``'ethnicity'`` are searched
        for as ``'age_i'``, ``'sex_i'`` and ``'<estimates_version>_ethnicity'``.
    estimates_version : str
        Version tag that must appear in the file name.

    Returns
    -------
    list of str
        Matching paths, in directory-listing order.
    """
    folder = url.replace("\\", "/")

    # More specific search tokens for the variables that need one.
    search_token = {
        'age': 'age_i',
        'sex': 'sex_i',
        'ethnicity': f'{estimates_version}_ethnicity',
    }.get(variable, variable)

    return [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if geo_level in name and search_token in name and estimates_version in name
    ]

In [47]:
# Estimates release whose exports are compared across geographies.
estimates_version = '2022_03'

# Variables to compare. To spot-check a single variable, temporarily narrow
# this list (for example to ['ethnicity']).
variables = [
    'age',
    'children',
    'ethnicity',
    'households',
    'housing',
    'income',
    'sex',
    'workers',
    'population',
]

# Geography levels whose exports are compared.
geo_levels = [
    'mgra',
    'census_tract',
    'luz',
    'cpa',
    'sra',
    'jurisdiction',
    'region',
]

In [50]:
# Folder holding the per-geography Excel exports for this estimates version.
estimates_dir = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}'

for variable in variables:
    # One frame per geography level, indexed by (geography id, yr_id) so that
    # only the value columns take part in the column-sum comparison.
    frames_by_geo = {}
    for geo_level in geo_levels:
        matches = find_file(url=estimates_dir, geo_level=geo_level, variable=variable, estimates_version=estimates_version)
        if not matches:
            raise FileNotFoundError(
                f"No {geo_level} file for variable '{variable}' "
                f"(version {estimates_version}) in {estimates_dir}"
            )
        frames_by_geo[geo_level] = pd.read_excel(matches[0]).set_index([geo_level, 'yr_id'])

    # The first entry of geo_levels is the reference the others are compared to.
    # NOTE(review): the SRA frame used to be loaded but was never passed to
    # check_dataframe_sums; every level in geo_levels is now compared. If SRA
    # was left out on purpose, remove it from geo_levels instead.
    print(f"{variable}: {check_dataframe_sums(*frames_by_geo.values())}")



age: All dataframes have the same column sums
children: All dataframes have the same column sums
ethnicity: All dataframes have the same column sums
households: All dataframes have the same column sums
housing: All dataframes have the same column sums
income: All dataframes have the same column sums
sex: All dataframes have the same column sums
workers: All dataframes have the same column sums
population: All dataframes have the same column sums


## For Age Sex Ethnicity

In [52]:
# Settings for the combined age/sex/ethnicity comparison. These reuse (and
# overwrite) the names set for the single-variable comparison above.
estimates_version = '2022_03'

variables = [
    'age_sex_ethnicity',
]

geo_levels = [
    'census_tract',
    'luz',
    'cpa',
    'sra',
    'jurisdiction',
    'region',
]

In [53]:
# Folder holding the per-geography Excel exports for this estimates version.
estimates_dir = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}'

for variable in variables:
    # One frame per geography level, indexed by (geography id, yr_id,
    # age group, sex) so that only the remaining value columns are summed.
    frames_by_geo = {}
    for geo_level in geo_levels:
        matches = find_file(url=estimates_dir, geo_level=geo_level, variable=variable, estimates_version=estimates_version)
        if not matches:
            raise FileNotFoundError(
                f"No {geo_level} file for variable '{variable}' "
                f"(version {estimates_version}) in {estimates_dir}"
            )
        frames_by_geo[geo_level] = pd.read_excel(matches[0]).set_index([geo_level, 'yr_id', 'age group', 'sex'])

    # The first entry of geo_levels is the reference the others are compared to.
    # NOTE(review): the SRA frame used to be loaded but was never passed to
    # check_dataframe_sums; every level in geo_levels is now compared. If SRA
    # was left out on purpose, remove it from geo_levels instead.
    print(f"{variable}: {check_dataframe_sums(*frames_by_geo.values())}")

age_sex_ethnicity: All dataframes have the same column sums
