# 2022-47 Base Year Forecast Output QC
Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

### Library Imports

In [1]:
import pandas as pd
import pyodbc

from pathlib import Path

# ignore warning relating to pandas and pyodbc (just ignore all warnings)
import warnings
warnings.filterwarnings("ignore")

### Download Data


In [2]:
def download_csv_data(user):
    """
    Read the three 2019 Forecast Output csv files from the project's local Data folder

    :param user:    User name substituted into the C:/Users/<user>/... path of the Data folder,
                    so that the same code can be run from another person's profile

    :returns:       Tuple of dfs in the order (mgra_ind.csv, region_ind.csv, jur_ind.csv). In the
                    region df the column "Household Population (hh)" is renamed to "hh"
    """
    # Folder under the user's profile that holds the csv files
    project_dir = Path(f"C:/Users/{user}/San Diego Association of Governments/" \
        "SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/")

    # Read the files in the order mgra, region, jurisdiction
    mgra_df, region_df, jur_df = (
        pd.read_csv(project_dir / file_name)
        for file_name in ("mgra_ind.csv", "region_ind.csv", "jur_ind.csv")
    )

    # Only the region df gets its verbose household column shortened to "hh"
    region_df = region_df.rename(columns={"Household Population (hh)": "hh"})

    return mgra_df, region_df, jur_df

# Read the mgra, region and jurisdiction csv files from the "eli" user profile
mgra, region, jurisdiction = download_csv_data(user="eli")

In [3]:
def download_SQL_data():
    """
    This function downloads SQL data for the 2019 Forecast Output

    Each fact table is restricted to datasource_id=44 and yr_id=2019 and is then merged (pandas
    default inner join on mgra_id) with the Series 14 mgra_id to jurisdiction lookup, so every
    returned row also carries a jurisdiction column

    :returns:       Dictionary keyed by fact table name ("age", "age_sex_ethnicity", "ethnicity",
                    "household_income", "housing", "jobs", "land_use", "population", "sex") whose
                    values are dfs with that fact table's rows plus the jurisdiction column
    """
    # Get only the data we want (Series 14; mgra_id to jurisdiction table)
    query = """SELECT mgra_id, jurisdiction FROM demographic_warehouse.dim.mgra_denormalize
        WHERE series=14"""

    # The fact tables that are downloaded
    fact_table_names = ["age", "age_sex_ethnicity", "ethnicity", "household_income", "housing", 
        "jobs", "land_use", "population", "sex"]

    # Create the SQL connection used for every query below
    connection = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')

    # Always release the connection, even if one of the queries fails. Previously the
    # connection was never closed
    try:
        jurisdiction = pd.read_sql_query(query, connection)

        # Download each fact table and attach the jurisdiction of each mgra to it
        fact_data = {}
        for table_name in fact_table_names:
            fact_table = pd.read_sql_query(f"""
            SELECT * FROM demographic_warehouse.fact.{table_name} 
            WHERE datasource_id=44
            AND yr_id=2019""", connection)
            fact_data[table_name] = fact_table.merge(jurisdiction, on="mgra_id")
    finally:
        connection.close()

    return fact_data

# Get the SQL data: a dictionary of fact table dfs keyed by fact table name
fact_tables = download_SQL_data()

### Tests

In [4]:
def average(numerator, denominator):
    """
    Compute a ratio of totals: the sum of one column divided by the sum of another

    :param numerator:       df column whose total goes on top. For average household size across
                            the region, this would be the population column
    :param denominator:     df column whose total goes on the bottom. For average household size
                            across the region, this would be the households column
    :returns:               numerator total divided by denominator total. For average household
                            size, the number of people divided by the number of households
    """
    numerator_total = numerator.sum()
    denominator_total = denominator.sum()
    return numerator_total / denominator_total

def rate(numerator, denominator):
    """
    Compute a percentage: the ratio of totals returned by the average function, times 100

    :param numerator:       df column whose total goes on top. For vacancy rate across the
                            region, this would be the number of vacant units
    :param denominator:     df column whose total goes on the bottom. For vacancy rate across
                            the region, this would be the total number of units
    :returns:               The derived rate. For vacancy rate, the number of vacant units divided
                            by the total number of units, multiplied by 100
    """
    fraction = average(numerator, denominator)
    return fraction * 100

def compare_values(first_val, second_val, files, rel_tol=1e-9, abs_tol=0.0):
    """
    Simple function used to compare values. If the values do not match, then some diagnostic
    printing is done

    Values match if they are exactly equal, or if they are numbers that math.isclose considers
    equal under the given tolerances. The tolerance exists because derived averages and rates are
    floats, and an exact == on floats reports a mismatch for pure round-off differences

    :param first_val:   The first value to compare. Comes from the first file name
    :param second_val:  The second value to compare. Comes from the second file name
    :param files:       List with two items with the file names that the values come from
    :param rel_tol:     Relative tolerance passed to math.isclose. The default only absorbs
                        floating point round-off
    :param abs_tol:     Absolute tolerance passed to math.isclose
    :returns:           True or False depending on if the two input values match. If they do not
                        match, then some diagnostic printing is done using the file names
    """
    import math

    if first_val == second_val:
        return True

    try:
        if math.isclose(first_val, second_val, rel_tol=rel_tol, abs_tol=abs_tol):
            return True
    except TypeError:
        # Non-numeric values cannot be "close"; the exact comparison above already failed
        pass

    print(f"{files[0] + ' value:' : <30}{first_val : <10}")
    print(f"{files[1] + ' value:' : <30}{second_val : <10}")
    return False

def sum_filter():
    """
    Placeholder that does nothing. It is passed around like sum, average and rate so that the
    test functions can recognise a "filter the rows, then sum" request by comparing against it
    """
    return None

In [5]:
def test_mgra_region(mgra, region, region_var, function, f_args):
    """
    Generic function used to test the internal consistency of mgra_ind.csv and region_ind.csv

    :param mgra:        df containing data from mgra_ind.csv
    :param region:      df containing data from region_ind.csv
    :param region_var:  The name of the column in region to compare with
    :param function:    The function used on mgra column(s) to compare with region. Must be one
                        of sum, average or rate
    :param f_args:      The arguments (column names of mgra) to pass into the function. One column
                        name for sum; numerator then denominator column names for average and rate
    :returns:           True if the values in the two files line up, False otherwise. If False, the
                        function compare_values will print some diagnostics
    :raises ValueError: If function is not sum, average or rate. Previously this case silently
                        returned None, which was then printed as if it were a test result
    """
    files = ["mgra_ind.csv", "region_ind.csv"]

    # The region value is taken from the first row of the region df
    region_value = region[region_var].values[0]

    if function == sum:
        return compare_values(mgra[f_args[0]].sum(), region_value, files)
    if function == average or function == rate:
        return compare_values(function(mgra[f_args[0]], mgra[f_args[1]]), region_value, files)

    raise ValueError(
        f"Unsupported function for test_mgra_region: {getattr(function, '__name__', function)}")

def _sum_by_jurisdiction(table, column):
    """
    Return a one column df, indexed by jurisdiction, holding the per jurisdiction sum of column
    """
    # Select the column before summing so the other columns of the fact table are never summed
    return table.groupby("jurisdiction")[[column]].sum()

def test_fact_jur(fact, jur, jur_var, function, f_args):
    """
    Generic function used to test the internal consistency of fact tables and jur_ind.csv

    :param fact:        Dictionary containing all data from fact tables
    :param jur:         df containing data from jur_ind.csv
    :param jur_var:     The name of the column in jur to compare with
    :param function:    The function used on fact table column(s) to compare with jur. Must be one
                        of sum, sum_filter, average or rate
    :param f_args:      Arguments used to select which fact tables and which columns to use to 
                        pass into the function. The format is a list of lists: [[fact table name,
                        fact table col], [second fact table name, second fact table column], etc.].
                        If the function is sum_filter, then the second list just becomes the column
                        name, column value, and type of filter ("=" or "!=")
    :returns:           Tuple containing (1) Boolean column with index jurisdictions and column
                        name jur_var, (2) df with derived fact table values in column jur_var (for
                        average and rate it also keeps the summed numerator column), and (3) Column
                        with existing jur_ind.csv values
    :raises ValueError: If the sum_filter filter type is not "=" or "!=", or if function is not
                        one of the supported functions. Both cases were previously silent
    """
    # jur_ind.csv values indexed by jurisdiction so they line up with the grouped fact tables
    jur_table = jur[["jurisdiction", jur_var]].set_index("jurisdiction")

    if function == sum or function == sum_filter:
        ft = fact[f_args[0][0]]

        # sum_filter is the same as sum except that the rows are filtered first
        if function == sum_filter:
            filter_col, filter_val, filter_op = f_args[1][0], f_args[1][1], f_args[1][2]
            if filter_op == "=":
                ft = ft[ft[filter_col] == filter_val]
            elif filter_op == "!=":
                ft = ft[ft[filter_col] != filter_val]
            else:
                raise ValueError(f"Unsupported filter type: {filter_op!r}")

        # Sum the requested column by jurisdiction and name it like the jur_ind.csv column
        ft_jur = _sum_by_jurisdiction(ft, f_args[0][1])
        ft_jur = ft_jur.rename({f_args[0][1]: jur_var}, axis=1)

        return (ft_jur == jur_table, ft_jur, jur_table)

    if function == average or function == rate:
        num_col = f_args[0][1]
        denom_col = f_args[1][1]

        # Sum the numerator and denominator columns by jurisdiction
        ft_num = _sum_by_jurisdiction(fact[f_args[0][0]], num_col)
        ft_denom = _sum_by_jurisdiction(fact[f_args[1][0]], denom_col)

        # Compute the derived value and add it to the numerator table
        ft_num[jur_var] = ft_num[num_col] / ft_denom[denom_col]
        if function == rate:
            ft_num[jur_var] = ft_num[jur_var] * 100

        return (ft_num[[jur_var]] == jur_table, ft_num, jur_table)

    raise ValueError(
        f"Unsupported function for test_fact_jur: {getattr(function, '__name__', function)}")

def test_fact_region(fact, region, region_var, function, f_args):
    """
    Generic function used to test the internal consistency of fact tables and region_ind.csv

    :param fact:        Dictionary containing all data from fact tables
    :param region:      df containing data from region_ind.csv
    :param region_var:  The name of the column in region to compare with
    :param function:    The function used on fact table column(s) to compare with region. Must be
                        one of sum, sum_filter, average or rate
    :param f_args:      Arguments used to select which fact tables and which columns to use to 
                        pass into the function. The format is a list of lists: [[fact table name,
                        fact table col], [second fact table name, second fact table column], etc.].
                        If the function is sum_filter, then the second list just becomes the column
                        name, column value, and type of filter ("=" or "!=")
    :returns:           True if the derived fact table value lines up with the region_ind.csv
                        value, False otherwise. If False, the function compare_values will print
                        some diagnostics
    :raises ValueError: If the sum_filter filter type is not "=" or "!=", or if function is not
                        one of the supported functions. Both cases were previously silent
    """
    files = ["fact tables", "region_ind.csv"]

    # The region value is taken from the first row of the region df
    region_value = region[region_var].values[0]

    if function == sum:
        return compare_values(fact[f_args[0][0]][f_args[0][1]].sum(), region_value, files)

    if function == average or function == rate:
        derived = function(fact[f_args[0][0]][f_args[0][1]], fact[f_args[1][0]][f_args[1][1]])
        return compare_values(derived, region_value, files)

    if function == sum_filter:
        # Get the fact table we want from the fact argument (this used to read the global
        # fact_tables variable, ignoring whatever was passed in)
        ft_filtered = fact[f_args[0][0]]

        # Filter by the input condition
        filter_col, filter_val, filter_op = f_args[1][0], f_args[1][1], f_args[1][2]
        if filter_op == "=":
            ft_filtered = ft_filtered[ft_filtered[filter_col] == filter_val]
        elif filter_op == "!=":
            ft_filtered = ft_filtered[ft_filtered[filter_col] != filter_val]
        else:
            raise ValueError(f"Unsupported filter type: {filter_op!r}")

        return compare_values(ft_filtered[f_args[0][1]].sum(), region_value, files)

    raise ValueError(
        f"Unsupported function for test_fact_region: {getattr(function, '__name__', function)}")

#### Tests for Population Variables

In [6]:
# pop = total population
# hhp = total household population (excludes gq pop)
# gq_total = total gq population, derived from sum of population in civilian gq and military gq
population_variables = [[column, sum, [column]] for column in ("pop", "hhp", "gq_total")]

# Add the derived gq_total column (civilian gq + military gq) to all three dfs
for df in (mgra, region, jurisdiction):
    df["gq_total"] = df["gq_civ"] + df["Group Quarters - Military (gq_mil)"]

# Check the population variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in population_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

pop True

hhp True

gq_total True



In [7]:
# Check the population variables match up between fact tables and jurisdiction
# hhp is the population with housing_type_id 1; gq_total is the population of every other
# housing_type_id
population_variables = [
    ["pop", sum, [["population", "population"]]],
    ["hhp", sum_filter, [["population", "population"], ["housing_type_id", 1, "="]]],
    ["gq_total", sum_filter, [["population", "population"], ["housing_type_id", 1, "!="]]],
]

# Run every check; each result is (boolean checks, fact table values, jur_ind.csv values)
results = [
    test_fact_jur(fact_tables, jurisdiction, name, func, args)
    for name, func, args in population_variables
]
population_checks = [result[0] for result in results]

# Output nicely formatted results
fact_values = pd.concat([result[1] for result in results], axis=1)
jur_values = pd.concat([result[2] for result in results], axis=1)
pd.concat(population_checks, axis=1)

Unnamed: 0_level_0,pop,hhp,gq_total
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Carlsbad,False,False,True
Chula Vista,False,False,False
Coronado,False,False,True
Del Mar,False,False,True
El Cajon,False,False,False
Encinitas,False,False,False
Escondido,False,False,False
Imperial Beach,False,False,False
La Mesa,False,False,False
Lemon Grove,False,False,False


In [8]:
# Check the population variables match up between fact tables and region
# hhp is the population with housing_type_id 1; gq_total is the population of every other
# housing_type_id
population_variables = [
    ["pop", sum, [["population", "population"]]],
    ["hhp", sum_filter, [["population", "population"], ["housing_type_id", 1, "="]]],
    ["gq_total", sum_filter, [["population", "population"], ["housing_type_id", 1, "!="]]],
]

for name, func, args in population_variables:
    outcome = test_fact_region(fact_tables, region, name, func, args)
    print(name, outcome)
    print()

pop True

hhp True

gq_total True



#### Tests for Housing Variables

In [9]:
# hs = housing units
# hh = number of households
# hhs = household size
# vacancy = vacant units
# vacancy_rate = vacancy rate
# The households column is still called "Household Population (hh)" in mgra; only region was
# renamed to "hh"
# NOTE(review): hhs divides total pop (which includes gq) by households -- confirm that hhp is
# not the intended numerator
households_column = "Household Population (hh)"
housing_variables = [
    ["hs", sum, ["hs"]],
    ["hh", sum, [households_column]],
    ["hhs", average, ["pop", households_column]],
    ["vacancy", sum, ["vacancy"]],
    ["vacancy_rate", rate, ["vacancy", "units"]],
]

# Check the housing variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in housing_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

hs True

hh True

mgra_ind.csv value:           2.896117137863108
region_ind.csv value:         49743.093 
hhs False

vacancy True

mgra_ind.csv value:           4.26962170094305
region_ind.csv value:         26715.815137256173
vacancy_rate False



In [10]:
# Check the housing variables match up between fact tables and jurisdiction
# NOTE(review): hhs is derived here as population / housing units, while the mgra check divides
# by households -- confirm which definition jur_ind.csv uses
housing_variables = [
    ["hs", sum, [["housing", "units"]]],
    ["Household Population (hh)", sum, [["household_income", "households"]]],
    ["hhs", average, [["population", "population"], ["housing", "units"]]],
    ["vacancy", sum, [["housing", "vacancy"]]],
    ["vacancy_rate", rate, [["housing", "vacancy"], ["housing", "units"]]],
]

# Run every check; each result is (boolean checks, fact table values, jur_ind.csv values)
results = [
    test_fact_jur(fact_tables, jurisdiction, name, func, args)
    for name, func, args in housing_variables
]
housing_checks = [result[0] for result in results]

# Output nicely formatted results
fact_values = pd.concat([result[1] for result in results], axis=1)
jur_values = pd.concat([result[2] for result in results], axis=1)
pd.concat(housing_checks, axis=1)

Unnamed: 0_level_0,hs,Household Population (hh),hhs,vacancy,vacancy_rate
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Carlsbad,False,False,False,False,False
Chula Vista,False,False,False,False,False
Coronado,False,False,False,False,False
Del Mar,False,False,False,False,False
El Cajon,False,False,False,False,False
Encinitas,False,False,False,False,False
Escondido,False,False,False,False,False
Imperial Beach,False,False,False,False,False
La Mesa,False,False,False,False,False
Lemon Grove,False,False,False,False,False


In [11]:
# Check the housing variables match up between fact tables and region
# NOTE(review): hhs is derived here as population / housing units, while the mgra check divides
# by households -- confirm which definition region_ind.csv uses
housing_variables = [
    ["hs", sum, [["housing", "units"]]],
    ["hh", sum, [["household_income", "households"]]],
    ["hhs", average, [["population", "population"], ["housing", "units"]]],
    ["vacancy", sum, [["housing", "vacancy"]]],
    ["vacancy_rate", rate, [["housing", "vacancy"], ["housing", "units"]]],
]

for name, func, args in housing_variables:
    outcome = test_fact_region(fact_tables, region, name, func, args)
    print(name, outcome)
    print()

hs True

hh True

fact tables value:            2.772461898809613
region_ind.csv value:         49743.093 
hhs False

fact tables value:            51442.0   
region_ind.csv value:         51441.0   
vacancy False

fact tables value:            4.269690525871957
region_ind.csv value:         26715.815137256173
vacancy_rate False



#### Tests for Employment Variables

In [12]:
# emp_total = total employment
# Other = various categories of employment. See ABM Wiki for more details
employment_columns = [
    "emp_total",
    "emp_Agricultural_and_Extractive",
    "emp_const_non_bldg_prod",
    "emp_const_non_bldg_Office",
    "emp_utilities_prod",
    "emp_utilities_Office",
    "emp_const_bldg_prod",
    "emp_const_bldg_Office",
    "emp_Manufacturing_prod",
    "emp_Manufacturing_Office",
    "emp_whsle_whs",
    "emp_trans",
    "emp_retail",
    "emp_prof_bus_svcs",
    "emp_prof_bus_svcs_bldg_maint",
    "emp_pvt_ed_k12",
    "emp_pvt_ed_post_k12_Other_Residential",
    "emp_health",
    "emp_personal_svcs_Office",
    "emp_amusement",
    "emp_hotel",
    "emp_restaurant_bar",
    "emp_personal_svcs_retail",
    "emp_religious",
    "emp_pvt_hh",
    "emp_state_local_Government_ent",
    "emp_fed_non_Military",
    "emp_fed_Military",
    "emp_state_local_Government_blue",
    "emp_state_local_Government_white",
    "emp_public_ed",
    "emp_own_occ_dwell_mgmt",
    "emp_fed_Government_accts",
    "emp_st_lcl_Government_accts",
    "emp_cap_accts",
]

# Every employment column has the same name in both files and is compared by its sum
employment_variables = [[column, sum, [column]] for column in employment_columns]

# Check the employment variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in employment_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

emp_total True

emp_Agricultural_and_Extractive True

emp_const_non_bldg_prod True

emp_const_non_bldg_Office True

emp_utilities_prod True

emp_utilities_Office True

emp_const_bldg_prod True

emp_const_bldg_Office True

emp_Manufacturing_prod True

emp_Manufacturing_Office True

emp_whsle_whs True

emp_trans True

emp_retail True

emp_prof_bus_svcs True

emp_prof_bus_svcs_bldg_maint True

emp_pvt_ed_k12 True

emp_pvt_ed_post_k12_Other_Residential True

emp_health True

emp_personal_svcs_Office True

emp_amusement True

emp_hotel True

emp_restaurant_bar True

emp_personal_svcs_retail True

emp_religious True

emp_pvt_hh True

emp_state_local_Government_ent True

emp_fed_non_Military True

emp_fed_Military True

emp_state_local_Government_blue True

emp_state_local_Government_white True

emp_public_ed True

emp_own_occ_dwell_mgmt True

emp_fed_Government_accts True

emp_st_lcl_Government_accts True

emp_cap_accts True



In [13]:
# Check the employment variables match up between fact tables and jurisdiction

# !!! There are no fact tables which contain 2019 employment data

In [14]:
# Check the employment variables match up between fact tables and region

# !!! There are no fact tables which contain 2019 employment data

#### Tests for School Enrollment

In [15]:
# enrollgradekto8 = Grade School K-8 enrollment
# enrollgrade9to12 = Grade School 9-12 enrollment

# Both columns have the same name in both files and are compared by their sum
grade_school_enrollment_variables = [
    [column, sum, [column]] for column in ("enrollgradekto8", "enrollgrade9to12")
]

# Check the grade school enrollment variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in grade_school_enrollment_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

enrollgradekto8 True

enrollgrade9to12 True



In [16]:
# Check the grade school enrollment variables match up between fact tables and jurisdiction

# !!! There are no fact tables which contain any 2019 school enrollment data

In [17]:
# Check the grade school enrollment variables match up between fact tables and region

# !!! There are no fact tables which contain any 2019 school enrollment data

#### Tests for College Enrollment

In [18]:
# collegeenroll = Major College enrollment
# othercollegeenroll = Other College enrollment (no idea what the difference is between the two)
# adultschenrl = Adult School enrollment
higher_ed_enrollment_variables = [
    [column, sum, [column]]
    for column in ("collegeenroll", "othercollegeenroll", "adultschenrl")
]

# Check the college enrollment variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in higher_ed_enrollment_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

collegeenroll True

othercollegeenroll True

adultschenrl True



In [19]:
# Check the college enrollment variables match up between fact tables and jurisdiction

# !!! There are no fact tables which contain any 2019 school enrollment data

In [20]:
# Check the college enrollment variables match up between fact tables and region

# !!! There are no fact tables which contain any 2019 school enrollment data

#### Tests for Income

In [21]:
# Self-explanatory income categories
income_columns = [
    "Less than $15,000",
    "$15,000 to $29,999",
    "$30,000 to $44,999",
    "$45,000 to $59,999",
    "$60,000 to $74,999",
    "$75,000 to $99,999",
    "$100,000 to $124,999",
    "$125,000 to $149,999",
    "$150,000 to $199,999",
    "$200,000 or more",
]

# Every income column has the same name in both files and is compared by its sum
income_variables = [[column, sum, [column]] for column in income_columns]

# Check the income variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in income_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

Less than $15,000 True

$15,000 to $29,999 True

$30,000 to $44,999 True

$45,000 to $59,999 True

$60,000 to $74,999 True

$75,000 to $99,999 True

$100,000 to $124,999 True

$125,000 to $149,999 True

$150,000 to $199,999 True

$200,000 or more True



In [22]:
# Check the income variables match up between fact tables and jurisdiction
# The categories below are listed in income_group_id order, starting at income_group_id 11
income_labels = [
    "Less than $15,000",
    "$15,000 to $29,999",
    "$30,000 to $44,999",
    "$45,000 to $59,999",
    "$60,000 to $74,999",
    "$75,000 to $99,999",
    "$100,000 to $124,999",
    "$125,000 to $149,999",
    "$150,000 to $199,999",
    "$200,000 or more",
]
income_variables = [
    [label, sum_filter, [["household_income", "households"], ["income_group_id", group_id, "="]]]
    for group_id, label in enumerate(income_labels, start=11)
]

# Run every check; each result is (boolean checks, fact table values, jur_ind.csv values)
results = [
    test_fact_jur(fact_tables, jurisdiction, name, func, args)
    for name, func, args in income_variables
]
income_checks = [result[0] for result in results]

# Output nicely formatted results
fact_values = pd.concat([result[1] for result in results], axis=1)
jur_values = pd.concat([result[2] for result in results], axis=1)
pd.concat(income_checks, axis=1)

Unnamed: 0_level_0,"Less than $15,000","$15,000 to $29,999","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more"
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Carlsbad,False,False,False,False,False,False,False,False,False,False
Chula Vista,False,False,False,False,False,False,False,False,False,False
Coronado,False,False,False,False,False,False,False,False,False,False
Del Mar,False,False,False,False,False,False,False,False,False,False
El Cajon,False,False,False,False,False,False,False,False,False,False
Encinitas,False,False,False,False,False,False,False,False,False,False
Escondido,False,False,False,False,False,False,False,False,False,False
Imperial Beach,False,False,False,False,False,False,False,False,False,False
La Mesa,False,False,False,False,False,False,False,False,False,False
Lemon Grove,False,False,False,False,False,False,False,False,False,False


In [23]:
# Check the income variables match up between fact tables and region
# The categories below are listed in income_group_id order, starting at income_group_id 11
income_labels = [
    "Less than $15,000",
    "$15,000 to $29,999",
    "$30,000 to $44,999",
    "$45,000 to $59,999",
    "$60,000 to $74,999",
    "$75,000 to $99,999",
    "$100,000 to $124,999",
    "$125,000 to $149,999",
    "$150,000 to $199,999",
    "$200,000 or more",
]
income_variables = [
    [label, sum_filter, [["household_income", "households"], ["income_group_id", group_id, "="]]]
    for group_id, label in enumerate(income_labels, start=11)
]

for name, func, args in income_variables:
    outcome = test_fact_region(fact_tables, region, name, func, args)
    print(name, outcome)
    print()

fact tables value:            105792    
region_ind.csv value:         102637    
Less than $15,000 False

fact tables value:            136944    
region_ind.csv value:         125847    
$15,000 to $29,999 False

fact tables value:            138600    
region_ind.csv value:         133011    
$30,000 to $44,999 False

fact tables value:            128470    
region_ind.csv value:         124399    
$45,000 to $59,999 False

fact tables value:            108525    
region_ind.csv value:         111447    
$60,000 to $74,999 False

fact tables value:            155468    
region_ind.csv value:         160115    
$75,000 to $99,999 False

fact tables value:            115643    
region_ind.csv value:         122142    
$100,000 to $124,999 False

fact tables value:            71304     
region_ind.csv value:         67956     
$125,000 to $149,999 False

fact tables value:            88240     
region_ind.csv value:         88125     
$150,000 to $199,999 False

fact tables value:     

#### Tests for Ethnicity by Category

In [24]:
# Self-explanatory ethnicity categories
ethnicity_columns = [
    "American Indian",
    "Asian",
    "Black",
    "Hispanic",
    "Other",
    "Pacific Islander",
    "Two or More",
    "White",
]

# Every ethnicity column has the same name in both files and is compared by its sum
ethnicity_variables = [[column, sum, [column]] for column in ethnicity_columns]

# Check the ethnicity variables match up between mgra_ind.csv and region_ind.csv
for name, func, columns in ethnicity_variables:
    outcome = test_mgra_region(mgra, region, region_var=name, function=func, f_args=columns)
    print(name, outcome)
    print()

American Indian True

Asian True

Black True

Hispanic True

Other True

Pacific Islander True

Two or More True

White True



In [25]:
# Check the ethnicity categories match up between fact tables and jurisdiction
# Maps each jur_ind.csv ethnicity column to the ethnicity_id it is filtered on
ethnicity_ids = {
    "American Indian": 4,
    "Asian": 5,
    "Black": 3,
    "Hispanic": 1,
    "Other": 7,
    "Pacific Islander": 6,
    "Two or More": 8,
    "White": 2,
}
ethnicity_variables = [
    [label, sum_filter, [["ethnicity", "population"], ["ethnicity_id", ethnicity_id, "="]]]
    for label, ethnicity_id in ethnicity_ids.items()
]

# Run every check; each result is (boolean checks, fact table values, jur_ind.csv values)
results = [
    test_fact_jur(fact_tables, jurisdiction, name, func, args)
    for name, func, args in ethnicity_variables
]
ethnicity_checks = [result[0] for result in results]

# Output nicely formatted results
fact_values = pd.concat([result[1] for result in results], axis=1)
jur_values = pd.concat([result[2] for result in results], axis=1)
pd.concat(ethnicity_checks, axis=1)

Unnamed: 0_level_0,American Indian,Asian,Black,Hispanic,Other,Pacific Islander,Two or More,White
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Carlsbad,False,False,False,False,False,False,False,False
Chula Vista,False,False,False,False,False,False,False,False
Coronado,False,False,False,False,False,True,False,False
Del Mar,False,False,False,False,False,False,False,False
El Cajon,False,False,False,False,False,False,False,False
Encinitas,False,False,False,False,False,False,False,False
Escondido,False,False,False,False,False,False,False,False
Imperial Beach,False,False,False,False,False,False,False,False
La Mesa,False,False,False,False,False,False,False,False
Lemon Grove,False,False,False,False,False,False,False,False


In [26]:
# Check the ethnicity categories match up between fact tables and region
# Each entry is [category name, function, function arguments], handed positionally to
# test_fact_region. NOTE(review): the arguments presumably select the "population" column of
# the "ethnicity" fact table where ethnicity_id equals the given id -- confirm against
# sum_filter
ethnicity_variables = [
    [category, sum_filter, [["ethnicity", "population"], ["ethnicity_id", ethnicity_id, "="]]]
    for category, ethnicity_id in (
        ("American Indian", 4),
        ("Asian", 5),
        ("Black", 3),
        ("Hispanic", 1),
        ("Other", 7),
        ("Pacific Islander", 6),
        ("Two or More", 8),
        ("White", 2),
    )
]


for variable in ethnicity_variables:
    # *variable expands to (category name, function, function arguments);
    # end="\n\n" leaves one blank line after each printed result
    print(variable[0],
        test_fact_region(fact_tables, region, *variable),
        end="\n\n")

fact tables value:            15092     
region_ind.csv value:         15086     
American Indian False

fact tables value:            354704    
region_ind.csv value:         354680    
Asian False

fact tables value:            158563    
region_ind.csv value:         158568    
Black False

Hispanic True

fact tables value:            7475      
region_ind.csv value:         7527      
Other False

fact tables value:            14621     
region_ind.csv value:         14626     
Pacific Islander False

fact tables value:            111553    
region_ind.csv value:         111558    
Two or More False

fact tables value:            1540604   
region_ind.csv value:         1540559   
White False



#### Tests for Age

In [27]:
# Age categories to test. Each entry is [region_var, function, f_args] as passed to
# test_mgra_region below; the category name is reused as the single f_args entry
age_variables = [
    [age_group, sum, [age_group]]
    for age_group in (
        "Under 5",
        "5 to 9",
        "10 to 14",
        "15 to 17",
        "18 and 19",
        "20 to 24",
        "25 to 29",
        "30 to 34",
        "35 to 39",
        "40 to 44",
        "45 to 49",
        "50 to 54",
        "55 to 59",
        "60 and 61",
        "62 to 64",
        "65 to 69",
        "70 to 74",
        "75 to 79",
        "80 to 84",
        "85 and Older",
    )
]

# Check the age variables match up between mgra_ind.csv and region_ind.csv
for variable in age_variables:
    # end="\n\n" leaves one blank line after each printed result
    print(variable[0],
        test_mgra_region(mgra, region, region_var=variable[0], function=variable[1],
            f_args=variable[2]),
        end="\n\n")

Under 5 True

5 to 9 True

10 to 14 True

15 to 17 True

18 and 19 True

20 to 24 True

25 to 29 True

30 to 34 True

35 to 39 True

40 to 44 True

45 to 49 True

50 to 54 True

55 to 59 True

60 and 61 True

62 to 64 True

65 to 69 True

70 to 74 True

75 to 79 True

80 to 84 True

85 and Older True



In [28]:
# Check the age variables match up between fact tables and jurisdiction
# Each entry is [category name, function, function arguments], handed positionally to
# test_fact_jur. The age_group_id values run 1..20 in the order the categories are listed.
# NOTE(review): the arguments presumably select the "population" column of the "age" fact
# table where age_group_id equals the given id -- confirm against sum_filter
age_variables = [
    [age_group, sum_filter, [["age", "population"], ["age_group_id", age_group_id, "="]]]
    for age_group_id, age_group in enumerate(
        (
            "Under 5",
            "5 to 9",
            "10 to 14",
            "15 to 17",
            "18 and 19",
            "20 to 24",
            "25 to 29",
            "30 to 34",
            "35 to 39",
            "40 to 44",
            "45 to 49",
            "50 to 54",
            "55 to 59",
            "60 and 61",
            "62 to 64",
            "65 to 69",
            "70 to 74",
            "75 to 79",
            "80 to 84",
            "85 and Older",
        ),
        start=1,
    )
]

# One list per element of the test_fact_jur result, filled one category at a time
age_checks, fact_values, jur_values = [], [], []

for variable in age_variables:
    # *variable expands to (category name, function, function arguments)
    test_results = test_fact_jur(fact_tables, jurisdiction, *variable)
    age_checks.append(test_results[0])
    fact_values.append(test_results[1])
    jur_values.append(test_results[2])

# Put the per-category results side by side; the last expression is what the cell displays
fact_values = pd.concat(fact_values, axis=1)
jur_values = pd.concat(jur_values, axis=1)
pd.concat(age_checks, axis=1)

Unnamed: 0_level_0,Under 5,5 to 9,10 to 14,15 to 17,18 and 19,20 to 24,25 to 29,30 to 34,35 to 39,40 to 44,45 to 49,50 to 54,55 to 59,60 and 61,62 to 64,65 to 69,70 to 74,75 to 79,80 to 84,85 and Older
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Carlsbad,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Chula Vista,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Coronado,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Del Mar,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
El Cajon,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Encinitas,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Escondido,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Imperial Beach,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
La Mesa,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Lemon Grove,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [29]:
# Check the age variables match up between fact tables and region
# Each entry is [category name, function, function arguments], handed positionally to
# test_fact_region. The age_group_id values run 1..20 in the order the categories are listed.
# NOTE(review): the arguments presumably select the "population" column of the "age" fact
# table where age_group_id equals the given id -- confirm against sum_filter
age_variables = [
    [age_group, sum_filter, [["age", "population"], ["age_group_id", age_group_id, "="]]]
    for age_group_id, age_group in enumerate(
        (
            "Under 5",
            "5 to 9",
            "10 to 14",
            "15 to 17",
            "18 and 19",
            "20 to 24",
            "25 to 29",
            "30 to 34",
            "35 to 39",
            "40 to 44",
            "45 to 49",
            "50 to 54",
            "55 to 59",
            "60 and 61",
            "62 to 64",
            "65 to 69",
            "70 to 74",
            "75 to 79",
            "80 to 84",
            "85 and Older",
        ),
        start=1,
    )
]

for variable in age_variables:
    # *variable expands to (category name, function, function arguments);
    # end="\n\n" leaves one blank line after each printed result
    print(variable[0],
        test_fact_region(fact_tables, region, *variable),
        end="\n\n")

Under 5 True

fact tables value:            230600    
region_ind.csv value:         230599    
5 to 9 False

fact tables value:            220580    
region_ind.csv value:         220579    
10 to 14 False

fact tables value:            132502    
region_ind.csv value:         132501    
15 to 17 False

18 and 19 True

fact tables value:            263897    
region_ind.csv value:         263896    
20 to 24 False

25 to 29 True

fact tables value:            216586    
region_ind.csv value:         216584    
30 to 34 False

fact tables value:            233121    
region_ind.csv value:         233120    
35 to 39 False

40 to 44 True

45 to 49 True

50 to 54 True

fact tables value:            210308    
region_ind.csv value:         210307    
55 to 59 False

60 and 61 True

62 to 64 True

65 to 69 True

70 to 74 True

75 to 79 True

80 to 84 True

85 and Older True



#### Tests for Total Hotel Rooms

In [30]:
# Hotel room categories to test. Each entry is [region_var, function, f_args] as passed to
# test_mgra_region below; the category name is reused as the single f_args entry
hotel_variables = [
    [room_type, sum, [room_type]]
    for room_type in (
        "budgetroom",
        "economyroom",
        "luxuryroom",
        "midpriceroom",
        "upscaleroom",
        "hotelroomtotal",
    )
]

# Check the hotel variables match up between mgra_ind.csv and region_ind.csv
for variable in hotel_variables:
    # end="\n\n" leaves one blank line after each printed result
    print(variable[0],
        test_mgra_region(mgra, region, region_var=variable[0], function=variable[1],
            f_args=variable[2]),
        end="\n\n")

budgetroom True

economyroom True

luxuryroom True

midpriceroom True

upscaleroom True

hotelroomtotal True



In [31]:
# Check that hotel categories match up between fact tables and jurisdiction

# !!! There are no fact tables which contain any 2019 hotel data

In [32]:
# Check that hotel categories match up between fact tables and region

# !!! There are no fact tables which contain any 2019 hotel data