# 2022-47 Base Year Forecast Output QC
Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

Raw Data Dictionary: https://sandag.sharepoint.com/:x:/s/EDAMTeam/EfmFNcTItplOgXP43j_G-5cB9fJ91xv9GtLmS_bQxwsOhw?e=WfLAdD 

### Library Imports

In [78]:
import pandas as pd
import numpy as np

from pathlib import Path

### Download Data


In [79]:
def download_data(
    raw_folder=r"T:/socioec/Current_Projects/XPEF39/abm_csv/",
    files=(
        "mgra13_based_input2019_01.csv",
        "households_2019_01.csv",
        "persons_2019_01.csv",
    ),
):
    """
    This function loads the csv files necessary for the analysis into dataframes.
    By default it reads the following three files from the folder
    T:/socioec/Current_Projects/XPEF39/abm_csv/

    1. mgra13_based_input2019_01.csv
    2. households_2019_01.csv
    3. persons_2019_01.csv

    :param raw_folder:  Folder (string or Path) which contains the csv files
    :param files:       Iterable of csv file names, relative to raw_folder
    :returns:           List containing one dataframe for each of the files, in the
                        same order as the file names were given
    """
    # Accept both plain strings and Path objects for the folder
    raw_folder = Path(raw_folder)

    # Read each file into its own dataframe, preserving the order of the file names
    return [pd.read_csv(raw_folder / file) for file in files]

# Load the data and unpack the dataframes into named variables. The unpacking order
# follows the order of the file list in download_data(): MGRA, households, persons
mgra, households, persons = download_data()

In [135]:
# The MGRA table has 23002 rows
# NOTE(review): .shape counts rows, so this equals the number of unique MGRA values
# only if the "mgra" column has no duplicates -- confirm
mgra.shape

(23002, 104)

In [138]:
# There are 5182 rows in the MGRA table with a household size (hhs) of zero
mgra[mgra["hhs"] == 0].shape

(5182, 104)

In [136]:
# But the households table only references 17844 unique MGRA values...
households["mgra"].nunique()

17844

In [None]:
# 23002 MGRAs - 5182 MGRAs with a household size of zero = 17820 MGRAs which should have households
# But 17844 MGRAs have at least one household in the households file (24 more than expected)

### Tests

In [83]:
# Test number TODO

# Check for null values in the persons table: prints True when no cell is null
print(not persons.isnull().values.any())

# check for other bad values
# persons.describe()

True


In [92]:
# Test number TODO
# Verify that household sizes agree between "households_2019_01.csv" and
# "persons_2019_01.csv"

# Household size as reported directly in "households_2019_01.csv"
pph_households = households[["hhid", "persons"]]

# Household size derived from "persons_2019_01.csv" by counting the person ids
# recorded under each household id
pph_persons = (
    persons.groupby("hhid")["perid"]
    .count()
    .reset_index()
    .rename(columns={"perid": "persons"})
)

# Run the actual test. Note that DataFrame.equals compares the index, row order and
# dtypes as well as the values
pph_households.equals(pph_persons)

True

In [123]:
# Test number TODO
# Check that the number of people per MGRA is the same between "mgra13_based_input2019_01.csv" and
# "persons_2019_01.csv"

# Get the number of people per MGRA from "mgra13_based_input2019_01.csv"
ppm_MGRA = mgra[["mgra", "pop"]]

# Get the number of people per MGRA from "persons_2019_01.csv" and "households_2019_01.csv":
# count the persons in each household, look up each household's MGRA, then total per MGRA
ppm_persons = persons.groupby("hhid")["perid"].count().reset_index().rename(
    columns={"perid": "persons"})
hhid_to_mgra_map = dict(households[["hhid", "mgra"]].values)
ppm_persons["mgra"] = ppm_persons["hhid"].map(hhid_to_mgra_map)
ppm_persons = ppm_persons.groupby("mgra")["persons"].sum().reset_index(drop=False)

# Run the actual test
# The two frames cannot be compared with DataFrame.equals: the count columns have
# different names ("pop" vs "persons") and ppm_persons has no row for an MGRA without
# any persons. Instead, align the counted persons onto the MGRA table by MGRA id, treat
# MGRAs which are missing from ppm_persons as having zero persons, and compare values.
# NOTE(review): persons whose hhid is not in the households file, or whose MGRA is not
# in the MGRA table, are not covered by this comparison -- confirm that is acceptable
expected_pop = ppm_MGRA.set_index("mgra")["pop"]
counted_pop = ppm_persons.set_index("mgra")["persons"].reindex(
    expected_pop.index, fill_value=0)
(expected_pop == counted_pop).all()


False

In [124]:
# Inspect the per-MGRA person totals derived from the persons and households files
ppm_persons

Unnamed: 0,mgra,persons
0,1,44
1,2,78
2,3,139
3,4,71
4,5,71
...,...,...
17839,22995,85
17840,22996,283
17841,22998,241
17842,23000,319


In [140]:
# List the MGRAs whose population ("pop") in the MGRA input file is zero
ppm_MGRA[ppm_MGRA["pop"] == 0]

Unnamed: 0,mgra,pop
81,82,0
94,95,0
112,113,0
157,158,0
163,164,0
...,...,...
22967,22968,0
22971,22972,0
22977,22978,0
22998,22999,0


In [125]:
# Inspect the per-MGRA population taken from the MGRA input file
ppm_MGRA

Unnamed: 0,mgra,pop
0,1,44
1,2,78
2,3,139
3,4,71
4,5,71
...,...,...
22997,22998,241
22998,22999,0
22999,23000,319
23000,23001,208
