# 2022-47 Base Year Forecast Output QC

Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={f8b3d630-1290-445b-99a1-2fa9041ade92}&action=edit

Documentation: https://sandag.sharepoint.com/:w:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7B3AF20D75-0A22-4B9C-9CC4-85B3EEC294E6%7D&file=MGRABased_input_ABM_2019_process_notes.docx 

### Library Imports

In [1]:
import pandas as pd

from pathlib import Path

### Download Data

In [2]:
def download_XPEF35_data():
    """
    Load the XPEF35 MGRA-based input csv files for the years 2018 and 2020

    :returns:       List of two dataframes, ordered [2018 data, 2020 data]
    """
    # Network folder that holds the XPEF35 csv files
    source_dir = Path(r"T:\socioec\Current_Projects\XPEF35\abm_csv\New_mgra_based_input")

    # One file per year; the year is spliced into the shared file name pattern
    file_names = ["mgra13_based_input" + year + "_02.csv" for year in ("2018", "2020")]

    # Read every csv into its own dataframe, keeping the year order above
    return [pd.read_csv(source_dir / name) for name in file_names]

def download_XPEF39_data():
    """
    Load the XPEF39 MGRA-based input csv file for the year 2019

    :returns:       df with 2019 data
    """
    # Network folder that holds the XPEF39 csv files
    source_dir = Path(r"T:\socioec\Current_Projects\XPEF39\abm_csv")

    # Only a single file (2019) is needed from this forecast
    file_name = "mgra13_based_input2019_01.csv"

    # Read the csv straight into a dataframe
    return pd.read_csv(source_dir / file_name)

# Load the data once and bind each year's dataframe to a descriptive name.
# download_XPEF35_data returns the 2018 and 2020 dataframes in that order, and
# download_XPEF39_data returns the single 2019 dataframe.
xpef35_2018, xpef35_2020 = download_XPEF35_data()
xpef39_2019 = download_XPEF39_data()

### Transform Data

In [3]:
# For the most part, the region wide value for each variable can be computed using a simple sum.
# However, for some variables, such as density variables or other variables, more advanced
# calculations must be done.

# Here, I define which columns to drop as their data is meaningless in regional context
# The items in this list which are commented out are variables which exist in ABM wiki but for
# whatever reason do not exist in the downloaded files
drop = [
    "pseudomsa", # MSA does not matter when data is collected for the entire region
    "parkarea", # Cross MGRA value, no meaning in region wide context
    # "luzid", # ABM wiki has no description...
    "truckregiontype", # ABM wiki has no description...
    "district27", # ABM wiki has no description...
    "milestocoast", # Not meaningful in region wide context
    # "MicroAccessTime", # Micro-mobility access time (mins) is too MGRA specific to aggregate
    "mgra", # Ignore for simplicity
    "taz", # Ignore for simplicity
    "zip09", # Ignore for simplicity
    # "remoteAVParking", # Ignore for simplicity
]

# Here, I define additional columns that need to be computed
# Each entry has the form: new column name -> [operation, first column, second column]
# where operation is "sum" or "product" and is applied row by row before aggregating.
# Entries are processed in order, so a later entry may use a column created by an earlier one.
# The t_ prefix is used as a flag to delete these after processing
# (gq_total has no t_ prefix, so it is kept in the output)
additional_columns = {
    # The number of the different kinds of parking stalls
    "t_num_hourly_stalls": ["sum", "hstallsoth", "hstallssam"],
    "t_num_daily_stalls": ["sum", "dstallsoth", "dstallssam"],
    "t_num_monthly_stalls": ["sum", "mstallsoth", "mstallssam"],

    # Dummy variables (number of stalls * parking cost) used later to compute average parking rates
    "t_hourly_stalls_total_cost": ["product", "t_num_hourly_stalls", "hparkcost"],
    "t_daily_stalls_total_cost": ["product", "t_num_daily_stalls", "dparkcost"],
    "t_monthly_stalls_total_cost": ["product", "t_num_monthly_stalls", "mparkcost"],

    # Get the total group quarter population
    "gq_total": ["sum", "gq_civ", "gq_mil"],
}

# Here, I define which columns require special treatment
# Each entry has the form: column to overwrite -> [operation, numerator, denominator]
# These are applied to the region wide sums, replacing the (incorrect) simple sum of the column
complex_calculations = {
    # # Collect all the different mgra numbers in the region
    # "mgra": ["collect"],

    # # Collect all the different taz numbers in the region
    # "taz": ["collect"],

    # Calculate household size by dividing total population by number of households
    "hhs": ["divide", "pop", "hh"],

    # Calculate the average parking cost for [hourly/daily/monthly] stalls by dividing the
    # total cost of all [hourly/daily/monthly] stalls by the number of [hourly/daily/monthly] stalls
    "hparkcost": ["divide", "t_hourly_stalls_total_cost", "t_num_hourly_stalls"],
    "dparkcost": ["divide", "t_daily_stalls_total_cost", "t_num_daily_stalls"],
    "mparkcost": ["divide", "t_monthly_stalls_total_cost", "t_num_monthly_stalls"],

    # # Collect all the different zip codes in the region
    # "zip09": ["collect"],

    # # Count the number of MGRAs that have remote AV parking
    # "remoteAVParking": ["count"],
}

In [4]:
def _collate_regionwide(mgra_df, year):
    """
    Collapse one year of MGRA level data into a single set of region wide values

    The processing is driven by the drop, additional_columns and
    complex_calculations settings defined in the previous cell.

    :param mgra_df: Dataframe with one year of MGRA level data (not modified)
    :param year:    Year stored in the "year" entry of the result
    :returns:       Series with one region wide value per remaining variable
    :raises ValueError: If a setting names an operation that is not supported
    """
    # drop() returns a new dataframe, so the downloaded data stays untouched and
    # this cell can be re-run without accessing the T drive again
    collated = mgra_df.drop(columns=drop)

    # Add the additional columns requested, row by row and in the order given,
    # since later columns may be built from earlier ones
    for new_col_name, computation in additional_columns.items():
        if computation[0] == "sum":
            collated[new_col_name] = collated[computation[1]] + collated[computation[2]]
        elif computation[0] == "product":
            collated[new_col_name] = collated[computation[1]] * collated[computation[2]]
        else:
            # Fail loudly, a silently skipped column would corrupt the QC output
            raise ValueError(
                f"Unsupported operation {computation[0]!r} for column {new_col_name!r}"
            )

    # Do the complex calculations
    # First just take the simple sum of every column
    collated = collated.sum()

    # Overwrite certain summed variables with the correct calculation
    for incorrect_col, correction in complex_calculations.items():
        if correction[0] != "divide":
            raise ValueError(
                f"Unsupported operation {correction[0]!r} for column {incorrect_col!r}"
            )
        collated[incorrect_col] = collated[correction[1]] / collated[correction[2]]

    # Remove temp columns. Only names that START with t_ are temporary, a name
    # that merely contains "t_" somewhere else must be kept
    collated = collated.drop([x for x in additional_columns if x.startswith("t_")])

    # Ensure we don't lose year values
    collated["year"] = year
    return collated


# Collate each year with the exact same processing steps
collated_2018 = _collate_regionwide(xpef35_2018, 2018)
collated_2019 = _collate_regionwide(xpef39_2019, 2019)
collated_2020 = _collate_regionwide(xpef35_2020, 2020)

# Combine the collated files together, one row per year
regionwide = pd.DataFrame([collated_2018, collated_2019, collated_2020])

# Move the year column to be the first column. It is selected by name so that
# the result does not depend on where "year" ended up after combining
column_order = ["year"] + [x for x in regionwide.columns if x != "year"]
regionwide = regionwide[column_order]

In [5]:
# A later requirement: keep only the variables of interest, then reshape the
# table so that the additional diffs can be computed per variable

available_columns = list(regionwide.columns)

# Employment variables, anything that has "emp_"
employment_variables = [col for col in available_columns if "emp_" in col]

# School Enrollment variables, anything that has "enroll"
enrollment_variables = [col for col in available_columns if "enroll" in col]

# Hotel variables, anything that has "room"
hotel_variables = [col for col in available_columns if "room" in col]

# The variables we want, in output order
# Not present in the mgra files, so not selectable: "vacant", "vacancy_rate",
# and all Income, Ethnicity by Category and Age variables
variables = (
    ["year"]                                   # Of course we need to keep track of the year
    + ["pop", "hhp", "gq_total"]               # Population variables
    + ["hs", "hh", "hhs"]                      # Housing variables
    + employment_variables
    + enrollment_variables + ["adultschenrl"]
    + hotel_variables
)

# Select the variables and flip the table: one row per variable, one column per year
regionwide = regionwide[variables].transpose()

# The first row now holds the years: use it as the column labels, then remove it
year_row = regionwide.iloc[0]
regionwide.columns = year_row
regionwide = regionwide.drop(index=year_row.name)

In [6]:
# Compute numeric and percentage differences between all of the year combos
# Each pair is (from year, to year); the percentage is relative to the from year
year_pairs = [(2018, 2019), (2018, 2020), (2019, 2020)]

for from_year, to_year in year_pairs:
    label = f"{from_year} --> {to_year}"
    change = regionwide[to_year] - regionwide[from_year]
    regionwide[label + " numeric diff"] = change
    regionwide[label + " percent diff"] = 100 * change / regionwide[from_year]

In [7]:
# Export to csv file locally, will move the file manually to sharepoint later
# The output folder is built from `user`, so changing the name here is all that
# is needed to export from a different Windows account
user = "eli"
temp_folder = Path(rf"C:\Users\{user}\Documents")
filename = "Regionwide_Comparison_XPEF35(2018_and_2020)_XPEF39(2019).csv"
regionwide.to_csv(temp_folder / filename)