# 2022-47 Base Year Forecast Output QC
Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

### Library Imports

In [1]:
import pandas as pd
import pyodbc

from pathlib import Path

# ignore warning relating to pandas and pyodbc (just ignore all warnings)
import warnings
warnings.filterwarnings("ignore")

### Download Data


In [2]:
def download_csv_data(user, data_folder_path=None):
    """
    This function reads the csv data for the 2019 Forecast Output from the locally synced 
    SharePoint folder

    :param user:                The user trying to download the data. Mostly here so that others 
                                can more easily run my code. Only used to build the default data 
                                folder (C:/Users/{user}/...)
    :param data_folder_path:    Optional folder (str or Path) containing the three csv files. 
                                When None (the default), the standard synced SharePoint folder 
                                for the given user is used

    :returns:       Tuple with (mgra data, region data, jurisdiction data)
    """

    # By default, data is stored in this folder
    if data_folder_path is None:
        data_folder_path = Path(f"C:/Users/{user}/San Diego Association of Governments/"
            "SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/")

    # Accept plain strings as well as Path objects
    data_folder_path = Path(data_folder_path)

    # Define the files we need from SharePoint, in the same order as the returned tuple
    sp_files = ("mgra_ind.csv", "region_ind.csv", "jur_ind.csv")

    # Read each file into a df
    mgra_ind, region_ind, jurisdiction_ind = (
        pd.read_csv(data_folder_path / file_name) for file_name in sp_files)

    # Return the data in tuple format
    return mgra_ind, region_ind, jurisdiction_ind

# Get the csv data as three dfs (mgra, region, jurisdiction). NOTE: "eli" is the user name that
# gets placed into the C:/Users/{user}/... data folder path, so change it when running this
# notebook as a different user
mgra, region, jurisdiction = download_csv_data("eli")

In [3]:
def download_SQL_data():
    """
    This function downloads SQL data for the 2019 Forecast Output

    :returns:       Dictionary mapping each fact table name to a df containing that table's rows 
                    for datasource_id=44 and yr_id=2019, with the jurisdiction of each mgra_id 
                    merged on as an extra column
    """
    # Every fact table we want data for
    fact_table_names = ["age", "age_sex_ethnicity", "ethnicity", "household_income", "housing", 
        "jobs", "land_use", "population", "sex"]

    # Create the SQL connection used for every query
    connection = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')

    # Make sure the connection is always closed, even if one of the queries fails
    try:
        # Get only the data we want (Series 14; mgra_id to jurisdiction table)
        query = """SELECT mgra_id, jurisdiction FROM demographic_warehouse.dim.mgra_denormalize
        WHERE series=14"""
        jurisdiction = pd.read_sql_query(query, connection)

        # Get the data for every single fact table. The table names come from the fixed list
        # above (not from user input), so building the query with an f-string is safe here
        fact_data = {}
        for table_name in fact_table_names:
            fact_table = pd.read_sql_query(f"""
            SELECT * FROM demographic_warehouse.fact.{table_name} 
            WHERE datasource_id=44
            AND yr_id=2019""", connection)

            # Add on the jurisdiction information using the mgra_id to jurisdiction table. 
            # This is pandas' default inner merge, so fact rows whose mgra_id is missing from 
            # the jurisdiction table are dropped
            fact_data[table_name] = fact_table.merge(jurisdiction, on="mgra_id")
    finally:
        connection.close()

    return fact_data

# Get the SQL data: a dictionary of fact table dfs keyed by fact table name (for example 
# fact_tables["population"])
fact_tables = download_SQL_data()

### Tests

#### Tests for Population Variables

In [4]:
# Population variables checked in this section:
#   pop      = total population
#   hhp      = total household population (excludes gq population)
#   gq_total = total gq population, derived from the sum of civilian gq and military gq
population_variables = ["pop", "hhp", "gq_total"]

# Add the derived gq_total column to each of the csv dfs
for _ind_df in (mgra, region, jurisdiction):
    _ind_df["gq_total"] = _ind_df["gq_civ"] + _ind_df["Group Quarters - Military (gq_mil)"]

In [5]:
# Sum the mgra df into one total per population variable, then compare those totals against 
# the values in the region df
mgra_population_totals = mgra[population_variables].sum()
region_population_values = region[population_variables]
mgra_population_totals == region_population_values

Unnamed: 0,pop,hhp,gq_total
0,True,True,True


In [6]:
# Check the population variables match up between fact tables and jurisdiction. For each
# variable, the fact table population is summed by jurisdiction and compared against the matching
# column of the jurisdiction df
population_facts = fact_tables["population"]


def _fact_table_totals(rows, variable):
    """
    Sum the population column of the given fact table rows by jurisdiction

    :param rows:        Rows of the population fact table (with the jurisdiction column merged on)
    :param variable:    Name to give the resulting column so it lines up with the jurisdiction df

    :returns:           One-column df indexed by jurisdiction
    """
    # Select the population column before summing so only the column we need is aggregated
    return rows.groupby("jurisdiction")[["population"]].sum().rename(
        {"population": variable}, axis=1)


def _jurisdiction_totals(variable):
    """
    Pull one population variable out of the jurisdiction df

    :param variable:    Column of the jurisdiction df to pull out

    :returns:           One-column df indexed by jurisdiction
    """
    return jurisdiction[["jurisdiction", variable]].set_index("jurisdiction")


population_checks = []

# Check total population
fact_table_pop = _fact_table_totals(population_facts, "pop")
jurisdiction_pop = _jurisdiction_totals("pop")
population_checks.append(fact_table_pop == jurisdiction_pop)

# Check total household population
# housing_type_id = 1 is household population without gq
fact_table_hhp = _fact_table_totals(
    population_facts[population_facts["housing_type_id"] == 1], "hhp")
jurisdiction_hhp = _jurisdiction_totals("hhp")
population_checks.append(fact_table_hhp == jurisdiction_hhp)

# Check total gq population
# housing_type_id != 1 (aka housing_type_id = 2, 3, 4) is population in gq
fact_table_gq_total = _fact_table_totals(
    population_facts[population_facts["housing_type_id"] != 1], "gq_total")
jurisdiction_gq_total = _jurisdiction_totals("gq_total")
population_checks.append(fact_table_gq_total == jurisdiction_gq_total)

# Output nicely formatted results (one True/False column per population variable)
pd.concat(population_checks, axis=1)

Unnamed: 0_level_0,pop,hhp,gq_total
jurisdiction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Carlsbad,False,False,True
Chula Vista,False,False,False
Coronado,False,False,True
Del Mar,False,False,True
El Cajon,False,False,False
Encinitas,False,False,False
Escondido,False,False,False
Imperial Beach,False,False,False
La Mesa,False,False,False
Lemon Grove,False,False,False


In [7]:
# Check the population variables match up between the fact tables (summed over every row) and 
# Region
population_facts = fact_tables["population"]

# housing_type_id = 1 is household population; every other housing_type_id is gq population
non_gq_table = population_facts[population_facts["housing_type_id"] == 1]
gq_table = population_facts[population_facts["housing_type_id"] != 1]

# One check per population variable: fact table total compared against the region df
population_checks = {
    "pop": (population_facts["population"].sum() == region["pop"]),
    "hhp": (non_gq_table["population"].sum() == region["hhp"]),
    "gq_total": (gq_table["population"].sum() == region["gq_total"]),
}

# Output nicely formatted results
pd.DataFrame.from_dict(population_checks)

Unnamed: 0,pop,hhp,gq_total
0,True,True,True


#### Tests for Housing Variables

In [8]:
# Housing variables checked in this section, with what each column holds
housing_variables = [
    "hs",                         # housing units
    "Household Population (hh)",  # number of households
    "hhs",                        # household size
    "vacancy",                    # vacant units
    "vacancy_rate",               # vacancy rate
]

#### Tests for Employment Variables

#### Tests for School Enrollment

#### Tests for College Enrollment

#### Tests for Income

#### Tests for Ethnicity by Category

#### Tests for Age

#### Tests for Total Hotel Rooms