# 2022-47 Base Year Forecast Output QC
Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

### Library Imports

In [1]:
import pandas as pd
import pyodbc

from pathlib import Path

# ignore warning relating to pandas and pyodbc (just ignore all warnings)
import warnings
warnings.filterwarnings("ignore")

### Download Data


In [2]:
def download_csv_data(user, data_folder=None):
    """
    This function loads the csv data for the 2019 Forecast Output. Nothing is fetched over the
    network: the files are read from a local folder (by default the synced SharePoint folder 
    under the given user's profile)

    :param user:        The user trying to load the data. Only used to build the default folder
                        path (C:/Users/{user}/...). Mostly here so that others can more easily 
                        run my code
    :param data_folder: Optional folder containing mgra_ind.csv, region_ind.csv and jur_ind.csv.
                        When None (the default), the synced SharePoint folder for user is used

    :returns:           Tuple with (mgra data, region data, jurisdiction data)
    """
    # Work out which folder holds the data
    if data_folder is None:
        data_folder_path = Path(
            f"C:/Users/{user}/San Diego Association of Governments/"
            "SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/")
    else:
        data_folder_path = Path(data_folder)

    # Read the three files into dfs, in the same order they are returned
    sp_files = ("mgra_ind.csv", "region_ind.csv", "jur_ind.csv")
    mgra_ind, region_ind, jurisdiction_ind = (
        pd.read_csv(data_folder_path / file_name) for file_name in sp_files)

    # Correct some columns names
    # In region_ind.csv, "Household Population (hh)" --> "hh"
    # (only the region df is renamed; the other two keep the long column name)
    region_ind = region_ind.rename(columns={"Household Population (hh)": "hh"})

    # Return the data in tuple format
    return mgra_ind, region_ind, jurisdiction_ind

# Get the csv data and keep it in module-level dfs; the test cells below read mgra and region
# NOTE(review): the user name is hard-coded. It is only used to build the C:/Users/{user}/... 
# path, so change "eli" to your own user name when running this elsewhere
mgra, region, jurisdiction = download_csv_data("eli")

In [3]:
def download_SQL_data():
    """
    This function downloads SQL data for the 2019 Forecast Output

    :returns:       Dictionary keyed by fact table name ("age", "age_sex_ethnicity", 
                    "ethnicity", "household_income", "housing", "jobs", "land_use", 
                    "population", "sex"). Each value is a df holding that fact table's rows for 
                    datasource_id=44 and yr_id=2019, with a jurisdiction column merged on 
                    using mgra_id
    """
    # Get only the data we want (Series 14; mgra_id to jurisdiction table)
    query = """SELECT mgra_id, jurisdiction FROM demographic_warehouse.dim.mgra_denormalize
        WHERE series=14"""

    fact_table_names = ["age", "age_sex_ethnicity", "ethnicity", "household_income", "housing", "jobs",
        "land_use", "population", "sex"]

    # Create the SQL connection to download jurisdiction data and the fact tables
    connection = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')
    try:
        mgra_to_jurisdiction = pd.read_sql_query(query, connection)

        # Get the data for every single fact table
        fact_data = {}
        for table_name in fact_table_names:
            fact_data[table_name] = pd.read_sql_query(f"""
            SELECT * FROM demographic_warehouse.fact.{table_name} 
            WHERE datasource_id=44
            AND yr_id=2019""", connection)
    finally:
        # Always release the connection, even if one of the queries fails
        connection.close()

    # For each fact table, add on the jurisdiction information using the mgra_id to jurisdiction 
    # table
    # NOTE(review): merge defaults to an inner join, so fact rows whose mgra_id is missing from 
    # the lookup are dropped and duplicate mgra_ids in the lookup would multiply rows -- confirm 
    # the lookup has exactly one row per mgra_id
    return {
        table_name: table.merge(mgra_to_jurisdiction, on="mgra_id")
        for table_name, table in fact_data.items()
    }

# # get the SQL data
# fact_tables = download_SQL_data()

### Tests

In [4]:
def average(numerator, denominator):
    """
    Ratio of totals: the sum of numerator divided by the sum of denominator

    :param numerator:   Object with a .sum() method (e.g. a df column) whose total goes on top
    :param denominator: Object with a .sum() method whose total goes on the bottom

    :returns:           numerator.sum() / denominator.sum()
    """
    top_total = numerator.sum()
    bottom_total = denominator.sum()
    return top_total / bottom_total

def rate(numerator, denominator):
    """
    Ratio of totals expressed as a percentage: the sum of numerator divided by the sum of 
    denominator, multiplied by 100

    :param numerator:   Object with a .sum() method (e.g. a df column) whose total goes on top
    :param denominator: Object with a .sum() method whose total goes on the bottom

    :returns:           (numerator.sum() / denominator.sum()) * 100
    """
    share = numerator.sum() / denominator.sum()
    return share * 100

def compare_values(mgra_value, region_value, rel_tol=1e-9, abs_tol=0.0):
    """
    Compare a value derived from mgra_ind.csv against the value reported in region_ind.csv

    Two integers must match exactly. If at least one of the values is a non-integer number 
    (e.g. the result of average or rate), the values are also accepted when they agree within 
    the given tolerance, so that floating point round-off alone does not fail the check

    :param mgra_value:      The value computed from mgra_ind.csv
    :param region_value:    The value read from region_ind.csv
    :param rel_tol:         Relative tolerance passed to math.isclose for non-integer values
    :param abs_tol:         Absolute tolerance passed to math.isclose for non-integer values

    :returns:               True if the values match. Otherwise both values are printed and 
                            False is returned
    """
    # Local imports so this function does not depend on anything else in the file
    import math
    import numbers

    if mgra_value == region_value:
        return True

    # Only fall back to a tolerance check for real numbers where at least one is not an 
    # integer; integer counts must be identical
    both_real = (isinstance(mgra_value, numbers.Real)
        and isinstance(region_value, numbers.Real))
    both_integral = (isinstance(mgra_value, numbers.Integral)
        and isinstance(region_value, numbers.Integral))
    if both_real and not both_integral and math.isclose(
            mgra_value, region_value, rel_tol=rel_tol, abs_tol=abs_tol):
        return True

    print("mgra_ind.csv   value:", mgra_value)
    print("region_ind.csv value:", region_value)
    return False

In [5]:
def test_mgra_region(mgra, region, region_var, function, f_args):
    """
    Generic function used to test the internal consistency of mgra_ind.csv and region_ind.csv

    :param mgra:        The mgra df, indexed by column name
    :param region:      The region df. Only the first value of the region_var column is used
    :param region_var:  The column of region holding the value to compare against
    :param function:    How to combine the mgra data: the builtin sum, or average / rate
    :param f_args:      List of mgra column names. sum uses only the first one; average and 
                        rate use the first as numerator and the second as denominator

    :returns:           The result of compare_values, or None if function is not one of sum, 
                        average or rate
    """
    # Work out the mgra side first; anything other than the three known functions is ignored
    if function == sum:
        mgra_value = mgra[f_args[0]].sum()
    elif function == average or function == rate:
        mgra_value = function(mgra[f_args[0]], mgra[f_args[1]])
    else:
        return None

    return compare_values(mgra_value, region[region_var].values[0])

#### Tests for Population Variables

In [6]:
# Population variables checked in this section:
#   pop      = total population
#   hhp      = total household population (excludes gq pop)
#   gq_total = total gq population, derived from sum of population in civilian gq and 
#              military gq

# Create the gq_total column in all dfs
for df in (mgra, region, jurisdiction):
    df["gq_total"] = df["gq_civ"] + df["Group Quarters - Military (gq_mil)"]

# Each entry is [region_ind.csv column, function, [mgra_ind.csv column(s)]]
population_variables = [[name, sum, [name]] for name in ("pop", "hhp", "gq_total")]

# Check the population variables match up between mgra_ind.csv and region_ind.csv
for variable in population_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

pop True

hhp True

gq_total True



In [7]:
# # Check the population variables match up between fact tables and jurisdiction
# population_checks = []

# # Check total population
# fact_table_pop = fact_tables["population"].groupby("jurisdiction").sum()[["population"]].rename(
#     {"population": "pop"}, axis=1)
# jurisdiction_pop = jurisdiction[["jurisdiction", "pop"]].set_index("jurisdiction")
# population_checks.append(fact_table_pop == jurisdiction_pop)

# # Check total household population
# # housing_type_id = 1 is household population without gq
# fact_table_hhp = fact_tables["population"][fact_tables["population"]["housing_type_id"] == 1]
# fact_table_hhp = fact_table_hhp.groupby("jurisdiction").sum()[["population"]].rename(
#     {"population": "hhp"}, axis=1)
# jurisdiction_hhp = jurisdiction[["jurisdiction", "hhp"]].set_index("jurisdiction")
# population_checks.append(fact_table_hhp == jurisdiction_hhp)

# # Check total gq population
# # housing_type_id != 1 (aka housing_type_id = 2, 3, 4) is population in group quarters (gq)
# fact_table_gq_total = fact_tables["population"][fact_tables["population"]["housing_type_id"] != 1]
# fact_table_gq_total = fact_table_gq_total.groupby("jurisdiction").sum()[["population"]].rename(
#     {"population": "gq_total"}, axis=1)
# jurisdiction_gq_total = jurisdiction[["jurisdiction", "gq_total"]].set_index("jurisdiction")
# population_checks.append(fact_table_gq_total == jurisdiction_gq_total)

# # Output nicely formatted results
# pd.concat(population_checks, axis=1)

In [8]:
# # Check the population variables match up between fact tables and region_ind.csv
# population_checks = {}

# # Check total population 
# population_checks["pop"] = (fact_tables["population"]["population"].sum() == region["pop"])

# # Check total household population
# non_gq_table = fact_tables["population"][fact_tables["population"]["housing_type_id"] == 1]
# population_checks["hhp"] = (non_gq_table["population"].sum() == region["hhp"])

# # Check total gq population
# gq_table = fact_tables["population"][fact_tables["population"]["housing_type_id"] != 1]
# population_checks["gq_total"] = (gq_table["population"].sum() == region["gq_total"])

# # Output nicely formatted results
# pd.DataFrame.from_dict(population_checks)

#### Tests for Housing Variables

In [9]:
# Housing variables checked in this section:
#   hs           = housing units
#   hh           = number of households
#   hhs          = household size
#   vacancy      = vacant units
#   vacancy_rate = vacancy rate
# Each entry is [region_ind.csv column, function, [mgra_ind.csv column(s)]]
# The mgra df still uses the long "Household Population (hh)" column name; only the region df 
# had it shortened to "hh"
# NOTE(review): hhs is computed from pop (total population) rather than hhp (household 
# population) -- confirm which one the household size definition expects
households_column = "Household Population (hh)"
housing_variables = [
    ["hs", sum, ["hs"]],
    ["hh", sum, [households_column]],
    ["hhs", average, ["pop", households_column]],
    ["vacancy", sum, ["vacancy"]],
    ["vacancy_rate", rate, ["vacancy", "units"]],
]

# Check the housing variables match up between mgra_ind.csv and region_ind.csv
for variable in housing_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

hs True

hh True

mgra_ind.csv   value: 2.896117137863108
region_ind.csv value: 49743.093
hhs False

vacancy True

mgra_ind.csv   value: 4.26962170094305
region_ind.csv value: 26715.815137256173
vacancy_rate False



In [10]:
# # Check the housing variables match up between fact tables and jurisdiction
# housing_checks = []

# # Check housing units
# fact_table_hs = fact_tables["housing"].groupby("jurisdiction").sum()[["units"]].rename(
#     {"units": "hs"}, axis=1)
# jurisdiction_hs = jurisdiction[["jurisdiction", "hs"]].set_index("jurisdiction")
# housing_checks.append(fact_table_hs == jurisdiction_hs)

# # Check number of households
# fact_table_hh = fact_tables["household_income"].groupby("jurisdiction").sum()[
#     ["households"]].rename({"households": "hh"}, axis=1)
# jurisdiction_hh = jurisdiction[["jurisdiction", "Household Population (hh)"]].rename(
#     {"Household Population (hh)": "hh"}, axis=1).set_index("jurisdiction")
# housing_checks.append(fact_table_hh == jurisdiction_hh)

# # Check household size
# fact_table_hhs = fact_tables["population"].groupby("jurisdiction").sum()[["population"]].rename(
#     {"population": "hhs"}, axis=1) / fact_tables["housing"].groupby("jurisdiction").sum() \
#     [["units"]].rename({"units": "hhs"}, axis=1)
# jurisdiction_hhs = jurisdiction[["jurisdiction", "hhs"]].set_index("jurisdiction")
# housing_checks.append(fact_table_hhs == jurisdiction_hhs)

# # Check vacant units
# fact_table_vacancy = fact_tables["housing"].groupby("jurisdiction").sum()[["vacancy"]]
# jurisdiction_vacancy = jurisdiction[["jurisdiction", "vacancy"]].set_index("jurisdiction")
# housing_checks.append(fact_table_vacancy == jurisdiction_vacancy)

# # Check vacancy rate
# fact_table_vacancy_rate = fact_tables["housing"].groupby("jurisdiction").sum()[["vacancy"]].rename(
#     {"vacancy": "vacancy_rate"}, axis=1) / fact_tables["housing"].groupby("jurisdiction").sum() \
#     [["units"]].rename({"units": "vacancy_rate"}, axis=1)
# jurisdiction_vacancy_rate = jurisdiction[["jurisdiction", "vacancy_rate"]].set_index("jurisdiction")
# housing_checks.append(fact_table_vacancy_rate == jurisdiction_vacancy_rate)

# # Output nicely formatted results
# pd.concat(housing_checks, axis=1)

In [11]:
# # Check the housing variables match up between fact tables and region_ind.csv
# housing_checks = {}

# # Check housing units
# housing_checks["hs"] = [fact_tables["housing"]["units"].sum() == int(region["hs"])]

# # Check number of households
# housing_checks["hh"] = [fact_tables["household_income"]["households"].sum() 
#     == int(region["Household Population (hh)"])]

# # Check household size
# housing_checks["hhs"] = [(fact_tables["population"]["population"].sum() 
#     / fact_tables["household_income"]["households"].sum()) == float(region["hhs"])]
# print("hhs from fact tables:\t\t", (fact_tables["population"]["population"].sum() 
#     / fact_tables["household_income"]["households"].sum()))
# print("hhs from region_ind.csv:\t", float(region["hhs"]))

# # Check vacant units
# housing_checks["vacancy"] = [fact_tables["housing"]["vacancy"].sum() == int(region["vacancy"])]
# print("vacancy from fact tables:\t", fact_tables["housing"]["vacancy"].sum())
# print("vacancy from region_ind.csv:\t", int(region["vacancy"]))


# # Check vacancy rate
# housing_checks["vacancy_rate"] = [(fact_tables["housing"]["vacancy"].sum() / fact_tables[
#     "housing"]["units"].sum() == float(region["vacancy_rate"]))]
# print("vacancy_rate from fact tables:\t\t", (fact_tables["housing"]["vacancy"].sum() / fact_tables[
#     "housing"]["units"].sum()))
# print("vacancy_rate from region_ind.csv:\t", float(region["vacancy_rate"]))

# # Output nicely formatted results
# pd.DataFrame.from_dict(housing_checks)

#### Tests for Employment Variables

In [12]:
# Employment variables checked in this section:
#   emp_total = total employment
#   all other columns = various categories of employment. See ABM Wiki for more details
# Every column is summed over mgra_ind.csv and compared with the same column of region_ind.csv,
# so each entry is [column, sum, [column]]
employment_variables = [
    [column, sum, [column]]
    for column in (
        "emp_total",
        "emp_Agricultural_and_Extractive",
        "emp_const_non_bldg_prod",
        "emp_const_non_bldg_Office",
        "emp_utilities_prod",
        "emp_utilities_Office",
        "emp_const_bldg_prod",
        "emp_const_bldg_Office",
        "emp_Manufacturing_prod",
        "emp_Manufacturing_Office",
        "emp_whsle_whs",
        "emp_trans",
        "emp_retail",
        "emp_prof_bus_svcs",
        "emp_prof_bus_svcs_bldg_maint",
        "emp_pvt_ed_k12",
        "emp_pvt_ed_post_k12_Other_Residential",
        "emp_health",
        "emp_personal_svcs_Office",
        "emp_amusement",
        "emp_hotel",
        "emp_restaurant_bar",
        "emp_personal_svcs_retail",
        "emp_religious",
        "emp_pvt_hh",
        "emp_state_local_Government_ent",
        "emp_fed_non_Military",
        "emp_fed_Military",
        "emp_state_local_Government_blue",
        "emp_state_local_Government_white",
        "emp_public_ed",
        "emp_own_occ_dwell_mgmt",
        "emp_fed_Government_accts",
        "emp_st_lcl_Government_accts",
        "emp_cap_accts",
    )
]

# Check the employment variables match up between mgra_ind.csv and region_ind.csv
for variable in employment_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

emp_total True

emp_Agricultural_and_Extractive True

emp_const_non_bldg_prod True

emp_const_non_bldg_Office True

emp_utilities_prod True

emp_utilities_Office True

emp_const_bldg_prod True

emp_const_bldg_Office True

emp_Manufacturing_prod True

emp_Manufacturing_Office True

emp_whsle_whs True

emp_trans True

emp_retail True

emp_prof_bus_svcs True

emp_prof_bus_svcs_bldg_maint True

emp_pvt_ed_k12 True

emp_pvt_ed_post_k12_Other_Residential True

emp_health True

emp_personal_svcs_Office True

emp_amusement True

emp_hotel True

emp_restaurant_bar True

emp_personal_svcs_retail True

emp_religious True

emp_pvt_hh True

emp_state_local_Government_ent True

emp_fed_non_Military True

emp_fed_Military True

emp_state_local_Government_blue True

emp_state_local_Government_white True

emp_public_ed True

emp_own_occ_dwell_mgmt True

emp_fed_Government_accts True

emp_st_lcl_Government_accts True

emp_cap_accts True



#### Tests for School Enrollment

In [13]:
# Grade school enrollment variables checked in this section:
#   enrollgradekto8  = Grade School K-8 enrollment
#   enrollgrade9to12 = Grade School 9-12 enrollment
# Each entry is [column, sum, [column]]: the mgra_ind.csv column total is compared with the 
# same column of region_ind.csv
grade_school_enrollment_variables = [
    [column, sum, [column]] for column in ("enrollgradekto8", "enrollgrade9to12")
]

# Check the grade school enrollment variables match up between mgra_ind.csv and region_ind.csv
for variable in grade_school_enrollment_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

enrollgradekto8 True

enrollgrade9to12 True



#### Tests for College Enrollment

In [14]:
# Higher education enrollment variables checked in this section:
#   collegeenroll      = Major College enrollment
#   othercollegeenroll = Other College enrollment
#   adultschenrl       = Adult School enrollment
# NOTE(review): the difference between collegeenroll and othercollegeenroll is not documented 
# here -- confirm against the data dictionary
# Each entry is [column, sum, [column]]
higher_ed_enrollment_variables = [
    [column, sum, [column]]
    for column in ("collegeenroll", "othercollegeenroll", "adultschenrl")
]

# Check the higher education enrollment variables match up between mgra_ind.csv and 
# region_ind.csv
for variable in higher_ed_enrollment_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

collegeenroll True

othercollegeenroll True

adultschenrl True



#### Tests for Income

In [15]:
# Income categories; the column names are the income brackets themselves
# Each entry is [column, sum, [column]]: the mgra_ind.csv column total is compared with the 
# same column of region_ind.csv
income_variables = [
    [bracket, sum, [bracket]]
    for bracket in (
        "Less than $15,000",
        "$15,000 to $29,999",
        "$30,000 to $44,999",
        "$45,000 to $59,999",
        "$60,000 to $74,999",
        "$75,000 to $99,999",
        "$100,000 to $124,999",
        "$125,000 to $149,999",
        "$150,000 to $199,999",
        "$200,000 or more",
    )
]

# Check the income variables match up between mgra_ind.csv and region_ind.csv
for variable in income_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

Less than $15,000 True

$15,000 to $29,999 True

$30,000 to $44,999 True

$45,000 to $59,999 True

$60,000 to $74,999 True

$75,000 to $99,999 True

$100,000 to $124,999 True

$125,000 to $149,999 True

$150,000 to $199,999 True

$200,000 or more True



#### Tests for Ethnicity by Category

In [16]:
# Ethnicity categories; the column names are the categories themselves
# Each entry is [column, sum, [column]]: the mgra_ind.csv column total is compared with the 
# same column of region_ind.csv
ethnicity_variables = [
    [category, sum, [category]]
    for category in (
        "American Indian",
        "Asian",
        "Black",
        "Hispanic",
        "Other",
        "Pacific Islander",
        "Two or More",
        "White",
    )
]

# Check the ethnicity variables match up between mgra_ind.csv and region_ind.csv
for variable in ethnicity_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

American Indian True

Asian True

Black True

Hispanic True

Other True

Pacific Islander True

Two or More True

White True



#### Tests for Age

In [17]:
# Age categories; the column names are the age groups themselves
# Each entry is [column, sum, [column]]: the mgra_ind.csv column total is compared with the 
# same column of region_ind.csv
age_variables = [
    [age_group, sum, [age_group]]
    for age_group in (
        "Under 5",
        "5 to 9",
        "10 to 14",
        "15 to 17",
        "18 and 19",
        "20 to 24",
        "25 to 29",
        "30 to 34",
        "35 to 39",
        "40 to 44",
        "45 to 49",
        "50 to 54",
        "55 to 59",
        "60 and 61",
        "62 to 64",
        "65 to 69",
        "70 to 74",
        "75 to 79",
        "80 to 84",
        "85 and Older",
    )
]

# Check the age variables match up between mgra_ind.csv and region_ind.csv
for variable in age_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

Under 5 True

5 to 9 True

10 to 14 True

15 to 17 True

18 and 19 True

20 to 24 True

25 to 29 True

30 to 34 True

35 to 39 True

40 to 44 True

45 to 49 True

50 to 54 True

55 to 59 True

60 and 61 True

62 to 64 True

65 to 69 True

70 to 74 True

75 to 79 True

80 to 84 True

85 and Older True



#### Tests for Total Hotel Rooms

In [18]:
# Hotel room categories, plus the hotelroomtotal column
# Each entry is [column, sum, [column]]: the mgra_ind.csv column total is compared with the 
# same column of region_ind.csv
hotel_variables = [
    [room_type, sum, [room_type]]
    for room_type in (
        "budgetroom",
        "economyroom",
        "luxuryroom",
        "midpriceroom",
        "upscaleroom",
        "hotelroomtotal",
    )
]

# Check the hotel room variables match up between mgra_ind.csv and region_ind.csv
for variable in hotel_variables:
    outcome = test_mgra_region(
        mgra, region, region_var=variable[0], function=variable[1], f_args=variable[2])
    print(variable[0], outcome)
    print()

budgetroom True

economyroom True

luxuryroom True

midpriceroom True

upscaleroom True

hotelroomtotal True

