# age_sex_ethnicity table from DOF data

In [44]:
# Library imports
import pandas as pd
import sqlalchemy as sql
from pathlib import Path

In [36]:
# SQLAlchemy engine for the DDAMWSQL16 server (pymssql driver); the queries below all run through it
DDAM = sql.create_engine("mssql+pymssql://DDAMWSQL16")

## Actually creating the table

In [37]:
# Pull the raw DOF population projections out of DDAM

# County FIPS code 06073 is San Diego County
query = """
SELECT *
FROM [socioec_data].[ca_dof].[population_proj_2021_07_14]
WHERE county_fips_code=06073
"""

# The result is small enough to load entirely into pandas, so all further manipulation is done in
# Python rather than in SQL
table = pd.read_sql_query(query, con=DDAM)

# The query already filtered on the county, so the FIPS column is no longer needed
table = table.drop(columns="county_fips_code")

# Sort by "fiscal_yr" and renumber the rows from zero so that slices are easier to look through
table = table.sort_values("fiscal_yr", ignore_index=True)

In [38]:
# Swap the numeric values in race_code for the race/ethnicity strings they stand for

# The DOF data dictionary at
# https://dof.ca.gov/wp-content/uploads/Forecasting/Demographics/Documents/P3_Dictionary.txt
# defines the codes as:
#   1  White, Non-Hispanic
#   2  Black, Non-Hispanic
#   3  American Indian or Alaska Native, Non-Hispanic
#   4  Asian, Non-Hispanic
#   5  Native Hawaiian or Pacific Islander, Non-Hispanic
#   6  Multiracial (two or more of above races), Non-Hispanic
#   7  Hispanic (any race)

# The DOF wording differs from the race strings used by the estimates table, so the labels below
# are not a one-to-one copy of the dictionary text. They are listed in code order, starting at 1
num_to_race = {
    "race_code": dict(
        enumerate(
            [
                "Non-Hispanic, White",
                "Non-Hispanic, Black",
                "Non-Hispanic, American Indian or Alaska Native",
                "Non-Hispanic, Asian",
                "Non-Hispanic, Hawaiian or Pacific Islander",
                "Non-Hispanic, Two or More Races",
                "Hispanic",
            ],
            start=1,
        )
    )
}

# The outer "race_code" key limits the replacement to that one column
table = table.replace(to_replace=num_to_race)

In [39]:
# Replace the age column with an age group column

# Pull the group names and lower bounds from SQL
# NOTE: [demographic_warehouse].[dim].[age_group] defines the category name "85 and Older" as ages
# between 85 and 100 inclusive. But DOF tables contain ages larger than 100... For now, it is
# assumed that ages > 100 in DOF tables DO COUNT as being "85 and Older"
query = """
SELECT [name], [lower_bound]
FROM [demographic_warehouse].[dim].[age_group]
"""
age_groups = pd.read_sql_query(query, con=DDAM)

# The query has no ORDER BY, so the row order is not guaranteed, but pd.cut needs monotonically
# increasing bin edges. Sorting the whole frame keeps each name paired with its lower bound
age_groups = age_groups.sort_values("lower_bound")

# Apply the age groups. With right=False each bin is [lower_bound, next lower_bound), and the
# extra upper edge of 999 leaves the last group open-ended so ages above 100 still fall in it
table["age_group"] = pd.cut(
    table["age"],
    bins=list(age_groups["lower_bound"]) + [999],
    labels=list(age_groups["name"]),
    right=False,
)

# Remove the no longer needed age column
table = table.drop("age", axis=1)

# Since age_group has fewer categories than age, population values need to be summed across each
# age_group. age_group is categorical, so observed=False is written out explicitly: the result
# then does not depend on which default the installed pandas version uses for that argument
table = (
    table.groupby(["fiscal_yr", "age_group", "sex", "race_code"], observed=False)
    .sum()
    .reset_index()
)

In [40]:
# Pivot the table to match the format generated by estimates_automation.py: one row per
# (fiscal_yr, age_group, sex) and one population column per race_code value
#
# aggfunc is given explicitly because pivot_table defaults to "mean", which is the wrong way to
# combine population counts and also turns integer counts into floats. observed=True keeps this a
# plain reshape of the combinations already present in the table
table = table.pivot_table(
    values=["population"],
    index=["fiscal_yr", "age_group", "sex"],
    columns=["race_code"],
    aggfunc="sum",
    observed=True,
)

# Flatten the MultiIndex rows and columns. After reset_index() the former index columns are
# labelled (name, "") and the value columns ("population", race), so take the race label where
# there is one and the top-level name otherwise. This avoids hard-coding the number of index columns
table = table.reset_index()
table.columns = [race or top for top, race in table.columns]

In [41]:
# Final transformations to sync up formatting between the DOF table and the files generated by
# estimates_automation.py

# Add the region column as the first column. Inserting at position 0 produces the final layout
# (region, yr_id, name, sex, then the race columns) directly, so the race columns never have to be
# re-selected with a fixed positional slice that only works when there are exactly seven of them
table.insert(0, "region", "San Diego")

# Rename columns
table = table.rename(columns={"fiscal_yr": "yr_id", "age_group": "name"})

In [46]:
# Commented out to avoid running by accident

# # Save the file locally
# save_folder = Path("C:/Users/eli/OneDrive - San Diego Association of Governments/Desktop/Desktop")
# file_name = Path("DOF_2021_07_14_age_sex_ethnicity_region_QA.csv")
# table.to_csv(save_folder / file_name, index=False)