In [18]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Download DOF Data

In [19]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\sql_queries\DOF_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,area_type,area_name,summary_type,county_name,est_yr,est_md,total_pop,household_pop,group_quarters,total_hu,single,single_detached,single_attached,multiple,two_to_four,five_plus,mobile_homes,occupied,unoccupied,vintage_yr
0,County,San Diego,Total,San Diego,2020,JAN_1,3331279,3218869,112410,1226879,729293,622861,106432,451206,87814,363392,46380,1153981,72898,2022
1,County,San Diego,Total,San Diego,2021,JAN_1,3288503,3185504,102999,1234243,730358,623705,106653,457367,89145,368222,46521,1159366,74877,2022
2,County,San Diego,Total,San Diego,2022,JAN_1,3287306,3170895,116411,1244854,733579,626055,107524,464642,89713,374929,46636,1169378,75476,2022


# Folders

In [11]:
age = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\age'
age_sex_ethnicity = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\age_sex_ethnicity'
children = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\children'
ethnicity = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\ethnicity'
households = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\households'
housing = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\housing'
income = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\income_group'
population = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\population'
sex = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\sex'
workers = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\workers'

all_folders = [age, age_sex_ethnicity, children, ethnicity, households, housing, income, population, sex, workers]

# Internal Consistency Checks

## Population

In [7]:
def internal_consistency_population(df):
    df['tot_pop_calc'] = df['Group Quarters - College'] + df['Group Quarters - Military'] + df['Group Quarters - Other'] + df['Household Population']

    tot_pop_sum = sum((df['tot_pop_calc'] == df['total_population'])) - len(df)

    df['tot_gq_pop_calc'] = df['Group Quarters - College'] + df['Group Quarters - Military'] + df['Group Quarters - Other']

    tot_gq_pop_sum = sum((df['tot_gq_pop_calc'] == df['total_pop_GQ'])) - len(df)

    if ((tot_pop_sum == 0) & (tot_gq_pop_sum == 0)):
        return 'Dataframe is internally consistent'
    else:
        return 'Dataframe is not internally consistent'


In [4]:
def population_folder_internal_consistency(url):
    files = os.listdir(url)
    for file in files:
        df = pd.read_csv(url+'/'+file)
        print(f"{file}: {internal_consistency_population(df)}")

In [8]:
population_folder_internal_consistency(population)

census_tract_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
cpa_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
jurisdiction_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
LUZ_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
mgra_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
region_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent
SRA_population_est_2022_01_ind_QA.csv: Dataframe is internally consistent


# NaN Check

In [15]:
def check_missing_values(df):
    if df.isnull().values.any():
        return 'There are NaNs'
    else:
        return 'There are no NaNs'

In [16]:
def population_folder_NaN_Check(url):
    files = os.listdir(url)
    for file in files:
        df = pd.read_csv(url+'/'+file)
        print(f"{file}: {check_missing_values(df)}")

In [17]:
for folder in all_folders:
    population_folder_NaN_Check(population)

census_tract_population_est_2022_01_ind_QA.csv: There are no NaNs
cpa_population_est_2022_01_ind_QA.csv: There are no NaNs
jurisdiction_population_est_2022_01_ind_QA.csv: There are no NaNs
LUZ_population_est_2022_01_ind_QA.csv: There are no NaNs
mgra_population_est_2022_01_ind_QA.csv: There are no NaNs
region_population_est_2022_01_ind_QA.csv: There are no NaNs
SRA_population_est_2022_01_ind_QA.csv: There are no NaNs
census_tract_population_est_2022_01_ind_QA.csv: There are no NaNs
cpa_population_est_2022_01_ind_QA.csv: There are no NaNs
jurisdiction_population_est_2022_01_ind_QA.csv: There are no NaNs
LUZ_population_est_2022_01_ind_QA.csv: There are no NaNs
mgra_population_est_2022_01_ind_QA.csv: There are no NaNs
region_population_est_2022_01_ind_QA.csv: There are no NaNs
SRA_population_est_2022_01_ind_QA.csv: There are no NaNs
census_tract_population_est_2022_01_ind_QA.csv: There are no NaNs
cpa_population_est_2022_01_ind_QA.csv: There are no NaNs
jurisdiction_population_est_2022_01