# 2022-58 2019 Base Year Forecast V2 QC

## 2020_06 estimates and csv file comparison

In [17]:
import pathlib
import json
import pandas as pd
import sqlalchemy as sql

import compare_variables as cv

# NOTE: estimates_automation is not an installed package; its source file had to be manually copied next to this notebook for this import to work
import estimates_automation as ea

### Estimates 2020_06 (2019 Only)

In [9]:
# Connection and configuration objects handed to the estimates_automation helpers
DDAM = sql.create_engine('mssql+pymssql://DDAMWSQL16')

# CONFIG is defined first, then filled from the JSON file in the sibling
# "2022-56 Estimates 2021 QC" folder
CONFIG = None
config_path = pathlib.Path("./../2022-56 Estimates 2021 QC/config.json")
with config_path.open("r") as config_file:
    CONFIG = json.load(config_file)

In [12]:
# Fetch each estimates table (vintage 2020_06, requested with "mgra") and keep
# only the rows whose yr_id is 2019
estimates_table_list = ["population", "housing", "household_income", "ethnicity"]
estimates_tables = {}
for table_name in estimates_table_list:
    full_table = ea.get_table_by_geography(
        DDAM, CONFIG, table_name, "mgra", est_vintage="2020_06", pivot=True
    )

    # Boolean mask on yr_id; only the 2019 rows are stored under this table's name
    is_2019 = full_table["yr_id"] == 2019
    estimates_tables[table_name] = full_table[is_2019]

In [32]:
# Combine the per-topic tables into one wide table keyed on mgra / yr_id.
#
# The merge chain starts from the first real table rather than from an empty
# pd.DataFrame(columns=["mgra", "yr_id"]) seed: the key columns of such a seed are
# object dtype, so an outer merge against it can leave "mgra" and "yr_id" as object
# dtype instead of the dtype the source tables use.
merge_keys = ["mgra", "yr_id"]
tables_to_merge = iter(estimates_tables.values())
# .copy() so the frame stored in estimates_tables is never modified by the column
# assignment further down (only matters if there is a single table and no merge runs)
combined_estimates = next(tables_to_merge).copy()
for table in tables_to_merge:
    combined_estimates = pd.merge(combined_estimates, table, how="outer", on=merge_keys)

# Derived column: civilian group quarters is the row-wise sum of the college and
# other group quarters columns
combined_estimates["Group Quarters - Civilian"] = \
    combined_estimates[["Group Quarters - College", "Group Quarters - Other"]].sum(axis=1)

# Select only the columns we want, in the order we want.
# The order matters: the difference cell pairs these with csv_columns by position.
combined_columns = ["mgra", "yr_id",
    # population variables
    "Total Population", "Household Population", "Group Quarters - Military",
    "Group Quarters - Civilian",

    # housing variables
    "units", "unoccupiable", "vacancy",

    # income variables
    "Less than $15,000", "$15,000 to $29,999", "$30,000 to $44,999", "$45,000 to $59,999",
    "$60,000 to $74,999", "$75,000 to $99,999", "$100,000 to $124,999", "$125,000 to $149,999",
    "$150,000 to $199,999", "$200,000 or more",

    # ethnicity variables
    "Hispanic", "Non-Hispanic, White", "Non-Hispanic, Black",
    "Non-Hispanic, American Indian or Alaska Native", "Non-Hispanic, Asian",
    "Non-Hispanic, Hawaiian or Pacific Islander", "Non-Hispanic, Other",
    "Non-Hispanic, Two or More Races",
]
combined_estimates = combined_estimates[combined_columns]


### csv files 

In [19]:
# Load the csv data through compare_variables from the locally synced project folder.
# `user` is the Windows account name that appears in the folder path.
user = "eli"
base_folder = pathlib.Path(
    f"C:/Users/{user}/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-58 2019 Base Year Forecast Output QC/data/MGRA13 Updated Data/"
)
# "_mgra_" is passed to get_data together with the folder
# NOTE(review): presumably a file-name filter - confirm in compare_variables
csv = cv.get_data(base_folder, "_mgra_")

In [33]:
# Columns kept from the csv data, grouped by topic. The resulting order is
# significant: the difference cell pairs csv_columns with combined_columns by position.
csv_key_columns = ["mgra", "year"]
csv_population_columns = ["pop", "hhp", "gq_mil", "gq_civ"]
csv_housing_columns = ["units", "unoccupiable", "vacancy"]
csv_income_columns = ["i1", "i2", "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10"]
csv_ethnicity_columns = [
    "Hispanic", "White", "Black", "American Indian", "Asian",
    "Pacific Islander", "Other", "Two or More",
]

csv_columns = (
    csv_key_columns
    + csv_population_columns
    + csv_housing_columns
    + csv_income_columns
    + csv_ethnicity_columns
)

# Restrict (and reorder) the csv frame to exactly those columns
csv = csv[csv_columns]

### Create a difference file

In [42]:
# Difference table: estimates minus csv, one row per mgra of combined_estimates.
#
# Both frames are indexed by "mgra" before subtracting so that pandas aligns the rows
# on the mgra value. Subtracting the raw columns would align on the row index instead,
# which silently gives wrong differences whenever the two frames are ordered
# differently or do not contain the same set of rows.
estimates_by_mgra = combined_estimates.set_index("mgra")
csv_by_mgra = csv.set_index("mgra")

# With duplicated mgra values a one-to-one comparison is not defined; fail loudly
if not estimates_by_mgra.index.is_unique:
    raise ValueError("combined_estimates has duplicate mgra values; cannot compute differences")
if not csv_by_mgra.index.is_unique:
    raise ValueError("csv has duplicate mgra values; cannot compute differences")

diff = pd.DataFrame(index=estimates_by_mgra.index)
diff["yr_id"] = 2019

# The first two entries of each column list are the key columns ("mgra" plus the
# year column); the remaining entries correspond to each other by position
for estimates_column, csv_column in zip(combined_columns[2:], csv_columns[2:]):
    # Assignment reindexes the result to diff's index, so mgras that are missing
    # from the csv data show up as NaN
    diff[csv_column] = estimates_by_mgra[estimates_column] - csv_by_mgra[csv_column]

# Turn "mgra" back into the first column: mgra, yr_id, then the value columns
diff = diff.reset_index()

### Combine the three tables into one Excel workbook

In [43]:
save_path = pathlib.Path("./2020_06_estimates_vs_csv.xlsx")

# One worksheet per table, written in this order: estimates, csv data, difference
sheets = {
    "2020_06 Estimates (2019 Only)": combined_estimates,
    "mgra13_update_mgra_ind_QA.csv": csv,
    "Estimates - MGRA csv": diff,
}

# All three frames go into the same Excel file, without the row index
with pd.ExcelWriter(save_path) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)