# 2022-47 Base Year Forecast Output QC

Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

### Library Imports

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

### Download Data


In [2]:
def download_data(user):
    """
    This function downloads csv data for the 2019 Forecast Output

    :param user:    The user trying to download the data. Mostly here so that others can more 
                    easily run my code

    :returns:       Tuple with (mgra data, person data)
    """
    # Data is stored in this folder
    data_folder_path = Path(f"C:/Users/{user}/San Diego Association of Governments/" \
        "SANDAG QA QC - Documents/Projects/2022/2022-47 Base Year Forecast Output QC/Data/")
    
    # Define the files here
    files = ["mgra_ind.csv", "persons_2019_01.csv", "households_2019_01.csv"]

    # Download the data from each file and load into a dataframe
    dfs = []
    for file in files:
        dfs.append(pd.read_csv(data_folder_path / file))

    # Use the households file to add which mgra each person belongs to
    hhid_to_mgra = dfs[2][["hhid", "mgra"]]
    dfs[1] = dfs[1].merge(hhid_to_mgra, left_on="hhid", right_on="hhid")

    return dfs

# Get data and put the dfs into named containers
mgra, persons, _ = download_data("eli")

### Tests

The following checks between "mgra_ind.csv" and "persons_2019_01.csv" will be done:

Ensure that...
1. the number of people per mgra, 
2. the number of male/female people per mgra, and
3. the number of each age category per mgra

is the same between both files

In [3]:
def test_nulls(combined_df, mgra_col, persons_col):
    """
    Check that the number of mgra values matches up in the two input columns of the combined df.
    In other words, make sure that for every mgra value in mgra, there is a corresponding one in
    persons

    :param combined_df: The data from the files "mgra_ind.csv" and "persons_2019_01.csv" combined
                        using how="outer" on the column mgra
    :param mgra_col:    The name of the column representing "mgra_ind.csv" values 
    :param persons_col: The name of the column representing "persons_2019_01.csv" values
    :returns:           None
    """
    files = ["mgra_ind.csv", "persons_2019_01.csv"]

    # Print out the number of rows where mgra_col is null
    print(f"Number of mgra values that appear in \"{files[1]}\" but not \"{files[0]}\":")
    print("\t", f"{combined_df[mgra_col].isna().sum()}\n")

    # Print out the number of rows where persons_col is null
    print(f"Number of mgra values that appear in \"{files[0]}\" but not \"{files[1]}\", ignoring " \
        "mgras with no population:")
    print("\t", f"{combined_df[persons_col].isna().sum()}\n")


In [4]:
def test_values(combined_df, mgra_col, persons_col):
    """
    Check that values matche up in the two input columns of the combined df. In other words, make 
    sure that for every mgra value for which there is data in both files, that the values match up

    :param combined_df: The data from the files "mgra_ind.csv" and "persons_2019_01.csv" combined
                        using how="outer" on the column mgra
    :param mgra_col:    The name of the column representing "mgra_ind.csv" values 
    :param persons_col: The name of the column representing "persons_2019_01.csv" values
    :returns:           None
    """
    files = ["mgra_ind.csv", "persons_2019_01.csv"]

    # Print out the number of rows where population does not match up
    # Make sure to remove null rows first
    combined_df_nn = combined_df.dropna(axis=0).copy(deep=True)
    combined_df_nn["diff"] = combined_df_nn[mgra_col] - combined_df_nn[persons_col]
    print(f"Number of rows in both \"{files[1]}\" and \"{files[0]}\" where population values do " \
        "not match:")
    # For God knows what reason, the f string goes crazy unless I use a dummy variable (x)
    x = combined_df_nn.loc[combined_df_nn["diff"] != 0].shape[0]
    print("\t", f"{x}\n")

#### Test 1

In [5]:
# Get the number of people per mgra from "persons_2019_01.csv"
pop_persons = persons.groupby("mgra").count()[["perid"]].reset_index()

# Get the number of people per mgra from "mgra_ind.csv", ignoring mgras that have no population
pop_mgra = mgra[mgra["pop"] != 0][["mgra", "pop"]]

# Combine the two dfs together
pop_combined = pop_mgra.merge(pop_persons, how="outer", left_on="mgra", right_on="mgra")

# The tests
test_nulls(pop_combined, "pop", "perid")
test_values(pop_combined, "pop", "perid")

Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 266

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" where population values do not match:
	 1116



#### Test 2

In [6]:
# Get the number of male/female per mgra from "persons_2019_01.csv"
male_persons = persons[persons["sex"] == 1].groupby("mgra").count()[["perid"]].reset_index()
female_persons = persons[persons["sex"] == 2].groupby("mgra").count()[["perid"]].reset_index()

# Get the number of number of male/female per mgra from "mgra_ind.csv", ignoring mgras that have 
# no population
populated_mgras = mgra[mgra["pop"] != 0]
male_mgra = populated_mgras[["mgra", "Male"]]
female_mgra = populated_mgras[["mgra", "Female"]]

# Combine the dfs together
male_combined = male_mgra.merge(male_persons, how="outer", left_on="mgra", right_on="mgra")
female_combined = female_mgra.merge(female_persons, how="outer", left_on="mgra", right_on="mgra")

# The tests, male version
print("*** MALE TESTS ***")
test_nulls(male_combined, "Male", "perid")
test_values(male_combined, "Male", "perid")

# The test, female version
print("*** FEMALE TESTS ***")
test_nulls(female_combined, "Female", "perid")
test_values(female_combined, "Female", "perid")

*** MALE TESTS ***
Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 395

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" where population values do not match:
	 1053

*** FEMALE TESTS ***
Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 365

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" where population values do not match:
	 935



#### Test 3

In [7]:
# Get the number of each ethnicity per mgra from "persons_2019_01.csv"
white_persons =             persons[persons["rac1p"] == 1].groupby("mgra").count()[["perid"]].reset_index()
black_persons =             persons[persons["rac1p"] == 2].groupby("mgra").count()[["perid"]].reset_index()
american_indian_persons =   persons[persons["rac1p"] == 5].groupby("mgra").count()[["perid"]].reset_index()
asian_persons =             persons[persons["rac1p"] == 6].groupby("mgra").count()[["perid"]].reset_index()
pacific_islander_persons =  persons[persons["rac1p"] == 7].groupby("mgra").count()[["perid"]].reset_index()
other_persons =             persons[persons["rac1p"] == 8].groupby("mgra").count()[["perid"]].reset_index()
two_plus_persons =          persons[persons["rac1p"] == 9].groupby("mgra").count()[["perid"]].reset_index()
hispanic_persons =          persons[persons["hisp"] == 2].groupby("mgra").count()[["perid"]].reset_index()

# Get the number of each ethnicity per mgra from "mgra_ind.csv", ignoring mgras that have 
# no population
populated_mgras =       mgra[mgra["pop"] != 0]
white_mgra =            populated_mgras[["mgra", "White"]]
black_mgra =            populated_mgras[["mgra", "Black"]]
american_indian_mgra =  populated_mgras[["mgra", "American Indian"]]
asian_mgra =            populated_mgras[["mgra", "Asian"]]
pacific_islander_mgra = populated_mgras[["mgra", "Pacific Islander"]]
other_mgra =            populated_mgras[["mgra", "Other"]]
two_plus_mgra =         populated_mgras[["mgra", "Two or More"]]
hispanic_mgra =         populated_mgras[["mgra", "Hispanic"]]

# Combine the dfs together
white_combined =            white_mgra.merge(white_persons, how="outer", left_on="mgra", right_on="mgra")
black_combined =            black_mgra.merge(black_persons, how="outer", left_on="mgra", right_on="mgra")
american_indian_combined =  american_indian_mgra.merge(american_indian_persons, how="outer", left_on="mgra", right_on="mgra")
asian_combined =            asian_mgra.merge(asian_persons, how="outer", left_on="mgra", right_on="mgra")
pacific_islander_combined = pacific_islander_mgra.merge(pacific_islander_persons, how="outer", left_on="mgra", right_on="mgra")
other_combined =            other_mgra.merge(other_persons, how="outer", left_on="mgra", right_on="mgra")
two_plus_combined =         two_plus_mgra.merge(two_plus_persons, how="outer", left_on="mgra", right_on="mgra")
hispanic_combined =         hispanic_mgra.merge(hispanic_persons, how="outer", left_on="mgra", right_on="mgra")

# The tests, white version
print("*** WHITE TESTS ***")
test_nulls(white_combined, "White", "perid")
test_values(white_combined, "White", "perid")

# The tests, black version
print("*** BLACK TESTS ***")
test_nulls(black_combined, "Black", "perid")
test_values(black_combined, "Black", "perid")

# The tests, amerian indian version
print("*** AMERICAN INDIAN TESTS ***")
test_nulls(american_indian_combined, "American Indian", "perid")
test_values(american_indian_combined, "American Indian", "perid")

# The tests, asian version
print("*** ASIAN TESTS ***")
test_nulls(asian_combined, "Asian", "perid")
test_values(asian_combined, "Asian", "perid")

# The tests, pacific islander version
print("*** PACIFIC ISLANDER TESTS ***")
test_nulls(pacific_islander_combined, "Pacific Islander", "perid")
test_values(pacific_islander_combined, "Pacific Islander", "perid")

# The tests, other version
print("*** OTHER TESTS ***")
test_nulls(other_combined, "Other", "perid")
test_values(other_combined, "Other", "perid")

# The tests, two plus version
print("*** TWO PLUS TESTS ***")
test_nulls(two_plus_combined, "Two or More", "perid")
test_values(two_plus_combined, "Two or More", "perid")

# The tests, hispanic version
print("*** HISPANIC TESTS ***")
test_nulls(hispanic_combined, "Hispanic", "perid")
test_values(hispanic_combined, "Hispanic", "perid")

*** WHITE TESTS ***
Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 370

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" where population values do not match:
	 15916

*** BLACK TESTS ***
Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 6279

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" where population values do not match:
	 5901

*** AMERICAN INDIAN TESTS ***
Number of mgra values that appear in "persons_2019_01.csv" but not "mgra_ind.csv":
	 0

Number of mgra values that appear in "mgra_ind.csv" but not "persons_2019_01.csv", ignoring mgras with no population:
	 8663

Number of rows in both "persons_2019_01.csv" and "mgra_ind.csv" wher