# 2022-58 Base Year Forecast V2 QC

## Library Imports

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

## Download Data


In [2]:
def download_data(user, data_folder=None):
    """
    This function loads the csv data for the 2019 Forecast Output

    :param user:        The Windows user name whose synced SharePoint folder holds the data. It
                        is substituted into the default folder path so that others can more
                        easily run this code
    :param data_folder: Optional folder (str or Path) containing the three csv files. When
                        given, it is used instead of the per-user default path and ``user`` is
                        ignored

    :returns:           List with [mgra data, persons data, households data]. The persons data
                        has the "mgra" and "unittype" columns of the matching household added
                        (left join on "hhid")
    """
    # Data is stored in this folder unless the caller points somewhere else
    if data_folder is None:
        data_folder_path = Path(f"C:/Users/{user}/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-58 2019 Base Year Forecast Output QC/data/MGRA13 Updated Data/")
    else:
        data_folder_path = Path(data_folder)

    # Define the files here. The order matters: callers unpack (mgra, persons, households)
    files = ["mgra13_update_mgra_ind_QA.csv", "persons_2019_01.csv", "households_2019_01.csv"]

    # Load each file into a dataframe
    dfs = [pd.read_csv(data_folder_path / file) for file in files]

    # Use the households file to add which mgra each person belongs to
    # Note, "unittype" is included for gq/non-gq population checks
    hhid_to_mgra = dfs[2][["hhid", "mgra", "unittype"]]
    dfs[1] = pd.merge(dfs[1], hhid_to_mgra, how="left", on="hhid")

    return dfs

# Load the three input tables once at notebook scope. Note that households_tests,
# persons_tests and hh_gq_comparison each call download_data again themselves rather than
# reading these variables.
mgra, persons, households = download_data("eli")

## Tests

In [3]:
def households_simple_tests(households, mgra) -> pd.DataFrame:
    """Run the simple households -> MGRA aggregation checks

    Each check aggregates one households column per MGRA and compares it with one MGRA column.
    Only one check is currently enabled:
    * Sum "persons" per MGRA, compare with MGRA "pop" (total population)

    Two further checks are present but commented out in the table below:
    * Count "hhid" per MGRA, compare with MGRA "hh" (total number of households)
    * Sum "hworkers" per MGRA, compare with MGRA "emp_total" (total employment)

    Args:
        households (pandas.DataFrame): A df containing data from "households_2019_01.csv". No
            modifications to the dataframe are necessary
        mgra (pandas.DataFrame): A df containing data from "mgra13_update_mgra_ind_QA.csv". No
            modifications to the dataframe are necessary

    Returns:
        pandas.DataFrame: One row per MGRA that fails at least one check, holding the MGRA value,
            the value aggregated from households, and their difference
        Also prints how many MGRAs fail each check
    """
    # households column -> [aggregation function, MGRA column to compare against]
    checks = {
        # "hhid": ["count", "hh"],
        # "hworkers": ["sum", "emp_total"],
        "persons": ["sum", "pop"]
    }

    # Failing rows of every check are outer-joined onto this frame
    failures = pd.DataFrame(columns=["mgra"])

    for source_col, (how, target_col) in checks.items():
        diff_col = f"{target_col}_diff"

        # Aggregate the households column per MGRA
        per_mgra = households[["mgra", source_col]].copy(deep=True)
        if how == "count":
            per_mgra = per_mgra.groupby("mgra").count()
        elif how == "sum":
            per_mgra = per_mgra.groupby("mgra").sum()
        per_mgra = per_mgra.reset_index(drop=False)

        # Keep every MGRA; MGRAs without any household get an aggregated value of zero
        table = pd.merge(mgra[["mgra", target_col]], per_mgra, how="left", on="mgra")
        table = table.fillna(value=0)

        # MGRA value minus the households-derived value
        table[diff_col] = table[target_col] - table[source_col]

        # Say where each number came from
        table = table.rename(columns={
            target_col: f"{target_col}_from_MGRA",
            source_col: f"{target_col}_aggregated_from_households",
        })

        # Keep only the MGRAs where the two sources disagree
        mismatched = table[table[diff_col] != 0]
        failures = pd.merge(failures, mismatched, how="outer", on="mgra")

        print(f"For the variable {target_col}, a total of {mismatched.shape[0]} MGRAs have different values")

    return failures

In [4]:
def households_complex_tests(households, mgra) -> pd.DataFrame:
    """Perform complex tests aggregating from households to MGRA

    The complex tests are as follows:
    1. "income categories": count households per "hinccat1" category per MGRA, compare with the
       MGRA income columns "i1".."i10" collapsed pairwise from 10 categories to 5
    2. "building sizes": count households per "bldgsz" code per MGRA, combine the codes into
       hh_mh, hh_mf and hh_sf, compare with the MGRA columns of the same names
    3. "population excluding GQ": sum "persons" over households with unittype 0 per MGRA,
       compare with MGRA hhp
    4. "GQ/non GQ/total households": count households with unittype 0 / unittype 1 / both per
       MGRA, compare with MGRA hh / gq_civ + gq_mil / hh + gq_civ + gq_mil

    Args:
        households (pandas.DataFrame): A df containing data from "households_2019_01.csv". No
            modifications to the dataframe are necessary. All transformations will be done in this
            function
        mgra (pandas.DataFrame): A df containing data from "mgra13_update_mgra_ind_QA.csv". No
            modifications to the dataframe are necessary. All transformations will be done in this
            function

    Returns:
        pandas.DataFrame: A df containing every row where households and mgra do not match up,
            for every test defined above. Columns of a test an MGRA passes are left as NaN
        Also prints out some information on how many MGRAs fail each test

    Raises:
        ValueError: If the transformation table below contains an unknown instruction or
            aggregation name
    """
    # TODO: Move the two below variables into a config file of some kind

    # How to do transformations for each test 1-4. First comes transformations to households, then
    # comes transformations to MGRA. Within each transformation list, each list is its own 
    # transform. Note that for households, it is assumed that at the end we groupby MGRA
    # 1. For groupby transformation, we have ["groupby", groupby input, aggregation function]. Note, 
    #    this will always reset_index(drop=False) afterwards
    # 2. For pivot transformation, we have ["pivot", values, index, columns] which are input to 
    #    pd.pivot_table
    # 3. For sum transformation, we have ["sum", new column name, columns to sum]
    # 4. For diff transformation (MGRA only), we have ["diff", new column name, [first column,
    #    column to subtract]]
    transformations = {
        "income categories": [
            [
                ["groupby", ["mgra", "hinccat1"], "count"],
                ["pivot", "hhid", "mgra", "hinccat1"]
            ], [
                ["sum", 1, ["i1", "i2"]],
                ["sum", 2, ["i3", "i4"]],
                ["sum", 3, ["i5", "i6"]],
                ["sum", 4, ["i7", "i8"]],
                ["sum", 5, ["i9", "i10"]]
            ]
        ],
        "building sizes": [
            [
                ["groupby", ["mgra", "bldgsz"], "count"],
                ["pivot", "hhid", "mgra", "bldgsz"],
                ["sum", "hh_mh", [1]],
                ["sum", "hh_mf", [8, 9]],
                ["sum", "hh_sf", [2, 3]]
            ], []
        ],
        "population excluding GQ": [
            [
                ["groupby", ["mgra", "unittype"], "sum"],
                ["pivot", "persons", "mgra", "unittype"],
                ["sum", "hhp", [0]]
            ], []
        ],
        "GQ/non GQ/total households": [
            [
                ["groupby", ["mgra", "unittype"], "count"],
                ["pivot", "hhid", "mgra", "unittype"],
                ["sum", "non_gq_hh", [0]],
                ["sum", "gq_hh", [1]],
                ["sum", "total_hh", [0, 1]],
            ], [
                ["sum", "gq_hh", ["gq_civ", "gq_mil"]],
                ["sum", "total_hh", ["hh", "gq_hh"]],
                ["sum", "non_gq_hh", ["hh"]],
            ]
        ],
    }

    # How to do comparisons for each test 1-4. The first list defines any pre-comparison functions,
    # the second list defines which columns to print out and under what condition(s)
    # 1. For differences, we have ["diff", first column, minus the second column, name of diff
    #    column]
    # Column names ending in "_mgra" / "_households" come from the merge suffixes applied below
    comparisons = {
        "income categories": [[
                ["diff", "1_mgra", "1_households", "1_diff"],
                ["diff", "2_mgra", "2_households", "2_diff"],
                ["diff", "3_mgra", "3_households", "3_diff"],
                ["diff", "4_mgra", "4_households", "4_diff"],
                ["diff", "5_mgra", "5_households", "5_diff"],
            ], [
                ["mgra", 
                    "1_mgra", "1_households", "1_diff",
                    "2_mgra", "2_households", "2_diff",
                    "3_mgra", "3_households", "3_diff",
                    "4_mgra", "4_households", "4_diff",
                    "5_mgra", "5_households", "5_diff",],
                [
                    ["!= 0", ["1_diff", "2_diff", "3_diff", "4_diff", "5_diff"]]
                ]
            ]
        ],
        "building sizes": [[
                ["diff", "hh_mh_mgra", "hh_mh_households", "hh_mh_diff"],
                ["diff", "hh_mf_mgra", "hh_mf_households", "hh_mf_diff"],
                ["diff", "hh_sf_mgra", "hh_sf_households", "hh_sf_diff"],
            ], [
                ["mgra", 
                    "hh_mh_mgra", "hh_mh_households", "hh_mh_diff",
                    "hh_mf_mgra", "hh_mf_households", "hh_mf_diff",
                    "hh_sf_mgra", "hh_sf_households", "hh_sf_diff"],
                [
                    ["!= 0", ["hh_mh_diff", "hh_mf_diff", "hh_sf_diff"]]
                ]
            ]
        ],
        "population excluding GQ": [[
                ["diff", "hhp_mgra", "hhp_households", "hhp_diff"],
            ], [
                ["mgra", "hhp_mgra", "hhp_households", "hhp_diff"],
                [
                    ["!= 0", ["hhp_diff"]]
                ]
            ]
        ],
        "GQ/non GQ/total households": [[
                ["diff", "non_gq_hh_mgra", "non_gq_hh_households", "non_gq_hh_diff"],
                ["diff", "gq_hh_mgra", "gq_hh_households", "gq_hh_diff"],
                ["diff", "total_hh_mgra", "total_hh_households", "total_hh_diff"],
            ], [
                ["mgra", 
                    "non_gq_hh_mgra", "non_gq_hh_households", "non_gq_hh_diff",
                    "gq_hh_mgra", "gq_hh_households", "gq_hh_diff",
                    "total_hh_mgra", "total_hh_households", "total_hh_diff"],
                [
                    ["!= 0", ["non_gq_hh_diff", "gq_hh_diff", "total_hh_diff"]]
                ]
            ]
        ],
    }

    # Store results here
    results = pd.DataFrame(columns=["mgra"])

    # Do all the transformations, then compare for tests
    for name, trans in transformations.items():
        # Split up the transformations into better variable names
        households_instructions = trans[0]
        mgra_instructions = trans[1]

        # Do all the household transformations
        households_transformed = households.copy(deep=True)
        for instructions in households_instructions:
            kind = instructions[0]
            if kind == "groupby":
                # groupby and aggregate as requested
                grouped = households_transformed.groupby(instructions[1])
                if instructions[2] == "count":
                    households_transformed = grouped.count()
                elif instructions[2] == "sum":
                    households_transformed = grouped.sum()
                else:
                    raise ValueError(f"Unknown aggregation {instructions[2]!r} in test {name!r}")

                # Reset index to avoid multi-indexing and for (any potential) pivoting
                households_transformed = households_transformed.reset_index(drop=False)

            elif kind == "pivot":
                # Pivot as requested. After the groupby there is one row per (index, columns)
                # pair, so pivot_table's default aggregation leaves the values unchanged
                households_transformed = pd.pivot_table(households_transformed,
                    values=instructions[1],
                    index=instructions[2],
                    columns=instructions[3]).reset_index(drop=False)

            elif kind == "sum":
                # Row-wise sum; categories missing for an MGRA are NaN and are skipped by sum
                households_transformed[instructions[1]] = households_transformed[instructions[2]].sum(axis=1)

            else:
                raise ValueError(f"Unknown households transformation {kind!r} in test {name!r}")

        # Do all MGRA transformations
        mgra_transformed = mgra.copy(deep=True)
        for instructions in mgra_instructions:
            kind = instructions[0]
            if kind == "sum":
                mgra_transformed[instructions[1]] = mgra_transformed[instructions[2]].sum(axis=1)

            elif kind == "diff":
                mgra_transformed[instructions[1]] = mgra_transformed[instructions[2][0]] \
                    - mgra_transformed[instructions[2][1]]

            else:
                raise ValueError(f"Unknown MGRA transformation {kind!r} in test {name!r}")

        # Combine the household aggregated values and MGRA values into one table, while filling
        # missing household aggregated values with zero. Columns present on both sides get the
        # "_mgra" / "_households" suffixes that the comparisons table refers to
        combined = pd.merge(mgra_transformed, households_transformed, 
            how="left", 
            on="mgra",
            suffixes=["_mgra", "_households"])
        combined = combined.fillna(value=0)

        # Do pre-comparison operations
        for instruction in comparisons[name][0]:
            if instruction[0] == "diff":
                combined[instruction[3]] = combined[instruction[1]] - combined[instruction[2]]

        # Create the comparison conditional. Start from an all-False boolean Series aligned with
        # combined (same approach as persons_complex_tests) so that .sum() and the boolean
        # indexing below work even if no condition is applied
        conditional = pd.Series(False, index=combined.index)
        for comparison in comparisons[name][1][1]:
            if comparison[0] == "!= 0":
                for col in comparison[1]:
                    conditional = conditional | (combined[col] != 0)

        # Print out diagnostic print statement
        print(f"For the variable(s) {name}, a total of {conditional.sum()} MGRAs have different values")

        # Store results
        results = pd.merge(results, combined[comparisons[name][1][0]][conditional], how="outer", on="mgra")

    return results

In [5]:
def households_tests(user="eli", output_path="./results.csv") -> None:
    """Perform all tests on the households file. For details, see households_simple_tests and
    households_complex_tests

    Args:
        user (str): Passed to download_data to locate the input files. Defaults to "eli"
        output_path (str or pathlib.Path): Where to write the combined results csv. Defaults to
            "./results.csv". Note that persons_tests writes "./results.csv" as well, so pass a
            different path to keep both outputs

    Returns:
        None
        Also prints out some information on how many MGRAs fail each test
        Also saves the output file locally
    """
    # Get data and put the dfs into named containers
    mgra, _, households = download_data(user)

    # Run tests
    simple_results = households_simple_tests(households, mgra)
    complex_results = households_complex_tests(households, mgra)

    # Combine results into one large file for saving. The outer join keeps every MGRA that
    # failed at least one test in either set
    combined = pd.merge(simple_results, complex_results, how="outer", on="mgra").sort_values("mgra", axis=0)
    combined.to_csv(output_path, index=False)

# Run the households checks: prints one summary line per test and writes ./results.csv
households_tests()

For the variable pop, a total of 1382 MGRAs have different values
For the variable(s) income categories, a total of 45 MGRAs have different values
For the variable(s) building sizes, a total of 45 MGRAs have different values
For the variable(s) population excluding GQ, a total of 0 MGRAs have different values
For the variable(s) GQ/non GQ/total households, a total of 1382 MGRAs have different values


In [6]:
def persons_complex_tests(persons, mgra) -> pd.DataFrame:
    """Run the persons -> MGRA aggregation checks

    The checks are as follows:
    1. "total population": count persons ("perid") per MGRA, compare with MGRA pop
    2. "gq/non-gq population": count persons per unittype per MGRA, compare unittype 0 with MGRA
       hhp and unittype 1 with MGRA pop - hhp

    Args:
        persons (pandas.DataFrame): A df containing data from "persons_2019_01.csv" which also
            carries the "mgra" and "unittype" columns (download_data adds them from the
            households file). All transformations will be done in this function
        mgra (pandas.DataFrame): A df containing data from "mgra13_update_mgra_ind_QA.csv". No
            modifications to the dataframe are necessary. All transformations will be done in this
            function

    Returns:
        pandas.DataFrame: One row per MGRA that fails at least one check, with the MGRA value,
            the persons-derived value and their difference for each check
        Also prints how many MGRAs fail each check
    """
    # TODO: Move the two below variables into a config file of some kind

    # Recipe per check: [steps applied to persons, steps applied to MGRA]. Available steps:
    # * ["groupby", groupby input, aggregation function], always followed by
    #   reset_index(drop=False)
    # * ["pivot", values, index, columns], forwarded to pd.pivot_table
    # * ["sum", new column name, columns to sum]
    # * ["diff", new column name, [first column, column to subtract]] (MGRA side only)
    transformations = {
        "total population": [
            [
                ["groupby", ["mgra"], "count"],
                ["sum", "total_pop", ["perid"]]
            ], [
                ["sum", "total_pop", ["pop"]]
            ]
        ],
        "gq/non-gq population": [
            [
                ["groupby", ["mgra", "unittype"], "count"],
                ["pivot", "perid", "mgra", "unittype"],
                ["sum", "non_gq_pop", [0]],
                ["sum", "gq_pop", [1]]
            ], [
                ["sum", "non_gq_pop", ["hhp"]],
                ["diff", "gq_pop", ["pop", "non_gq_pop"]]
            ]
        ],
    }

    # Comparison per check: [pre-comparison steps, [columns to report, row conditions]].
    # * ["diff", first column, column to subtract, name of diff column]
    # * ["!= 0", columns]: a row is reported when any listed column is non-zero
    comparisons = {
        "total population": [[
                ["diff", "total_pop_mgra", "total_pop_persons", "total_pop_diff"],
            ], [
                ["mgra", "total_pop_mgra", "total_pop_persons", "total_pop_diff"],
                [
                    ["!= 0", ["total_pop_diff"]]
                ]
            ]
        ],
        "gq/non-gq population": [[
                ["diff", "non_gq_pop_mgra", "non_gq_pop_persons", "non_gq_pop_diff"],
                ["diff", "gq_pop_mgra", "gq_pop_persons", "gq_pop_diff"],
            ], [
                ["mgra", "non_gq_pop_mgra", "non_gq_pop_persons", "non_gq_pop_diff",
                    "gq_pop_mgra", "gq_pop_persons", "gq_pop_diff"],
                [
                    ["!= 0", ["non_gq_pop_diff", "gq_pop_diff"]]
                ]
            ]
        ],
    }

    # Failing rows of every check are outer-joined onto this frame
    failures = pd.DataFrame(columns=["mgra"])

    for name, (persons_steps, mgra_steps) in transformations.items():
        # Aggregate the persons table down to one row per MGRA
        persons_side = persons.copy(deep=True)
        for step in persons_steps:
            operation = step[0]
            if operation == "groupby":
                grouped = persons_side.groupby(step[1])
                if step[2] == "count":
                    persons_side = grouped.count()
                elif step[2] == "sum":
                    persons_side = grouped.sum()
                # Flatten the group keys back into ordinary columns
                persons_side = persons_side.reset_index(drop=False)
            elif operation == "pivot":
                wide = pd.pivot_table(
                    persons_side, values=step[1], index=step[2], columns=step[3])
                persons_side = wide.reset_index(drop=False)
            elif operation == "sum":
                persons_side[step[1]] = persons_side[step[2]].sum(axis=1)

        # Derive the MGRA columns the persons side is compared against
        mgra_side = mgra.copy(deep=True)
        for step in mgra_steps:
            operation = step[0]
            if operation == "sum":
                mgra_side[step[1]] = mgra_side[step[2]].sum(axis=1)
            elif operation == "diff":
                minuend, subtrahend = step[2][0], step[2][1]
                mgra_side[step[1]] = mgra_side[minuend] - mgra_side[subtrahend]

        # Keep every MGRA; columns shared by both sides are disambiguated by suffix and
        # MGRAs with no persons get zeros
        combined = pd.merge(
            mgra_side, persons_side, how="left", on="mgra", suffixes=["_mgra", "_persons"])
        combined = combined.fillna(value=0)

        pre_steps = comparisons[name][0]
        report_columns = comparisons[name][1][0]
        conditions = comparisons[name][1][1]

        # Pre-comparison operations (currently only differences)
        for step in pre_steps:
            if step[0] == "diff":
                combined[step[3]] = combined[step[1]] - combined[step[2]]

        # Build the row filter: start all-False and OR in every requested condition
        conditional = pd.Series([False] * combined.shape[0])
        for condition in conditions:
            if condition[0] != "!= 0":
                continue
            for col in condition[1]:
                conditional = conditional | (combined[col] != 0)

        print(f"For the variable(s) {name}, a total of {conditional.sum()} MGRAs have different values")

        # Append the failing rows of this check
        report = combined[report_columns]
        failures = pd.merge(failures, report[conditional], how="outer", on="mgra")

    return failures

In [7]:
def persons_tests(user="eli", output_path="./results.csv") -> None:
    """Perform all tests on the persons file. For details, see persons_complex_tests

    Args:
        user (str): Passed to download_data to locate the input files. Defaults to "eli"
        output_path (str or pathlib.Path): Where to write the results csv. Defaults to
            "./results.csv". Note that households_tests writes "./results.csv" as well, so with
            the defaults this call replaces that file; pass a different path to keep both

    Returns:
        None
        Also prints out some information on how many MGRAs fail each test
        Also saves the output file locally
    """
    # Get data and put the dfs into named containers
    mgra, persons, _ = download_data(user)

    # Run tests. There is only one set of persons tests, so nothing needs to be combined
    complex_results = persons_complex_tests(persons, mgra)

    # Save the results
    complex_results.to_csv(output_path, index=False)

# Run the persons checks: prints one summary line per test and writes ./results.csv, which
# replaces the file of the same name written by households_tests() above
persons_tests()

For the variable(s) total population, a total of 1382 MGRAs have different values
For the variable(s) gq/non-gq population, a total of 1382 MGRAs have different values


In [8]:
def hh_gq_comparison() -> None:
    """Build a comparison table of household and gq household counts, MGRA file vs households file

    The table has, per MGRA, the MGRA file's hh, gq_civ, gq_mil and their sum next to the number
    of households with unittype 0 and unittype 1 counted from the households file, plus an
    equality flag and a difference between MGRA hh and the unittype 0 count.

    Returns:
        None
        NOTE: the Excel export at the end is commented out, so the table is currently built and
        then discarded
    """
    # Get data
    mgra, _, households = download_data("eli")

    # MGRA side: the raw columns plus their total
    mgra_columns = ["hh", "gq_civ", "gq_mil"]
    hh_gq = mgra[["mgra"] + mgra_columns].copy(deep=True)
    hh_gq["hh+gq"] = hh_gq[mgra_columns].sum(axis=1)

    # Households side: count rows per (mgra, unittype), then spread unittype into columns
    counts = households.groupby(["mgra", "unittype"]).count()
    wide = pd.pivot_table(counts, values=["hhid"], columns=["unittype"], index=["mgra"])
    wide = wide.fillna(0).reset_index(drop=False)

    # The pivot leaves two-level column labels; keep the first level for the mgra column and
    # the unittype value for the two count columns
    flat_labels = [wide.columns[0][0]]
    for position in (1, 2):
        flat_labels.append(wide.columns[position][1])
    wide.columns = flat_labels
    households = wide.rename(columns={0: "non-GQ_hh households", 1: "GQ_hh households"})

    # Combine, treating MGRAs missing from either side as zero
    hh_gq = pd.merge(hh_gq, households, how="outer", on="mgra")
    hh_gq = hh_gq.fillna(0)

    # Tag the MGRA-file columns with their source
    source_tagged = {
        "hh": "hh mgra",
        "gq_civ": "gq_civ mgra",
        "gq_mil": "gq_mil mgra",
        "hh+gq": "hh+gq mgra",
    }
    hh_gq = hh_gq.rename(columns=source_tagged)

    # Compare MGRA hh with the non-GQ household count
    mgra_hh = hh_gq["hh mgra"]
    counted_hh = hh_gq["non-GQ_hh households"]
    hh_gq["hh_mgra = non-GQ_hh households"] = mgra_hh == counted_hh
    hh_gq["diff:hh_mgra - non-GQ_hh households"] = mgra_hh - counted_hh

    # hh_gq.to_excel("./hh_persons_diff.xlsx", index=False)