# Estimate Tests

In [1]:
# For config file
import json

import pandas as pd
import sqlalchemy as sql

In [2]:
# SQLAlchemy engine for the SQL Server instance queried by the functions below
DDAM = sql.create_engine('mssql+pymssql://DDAMWSQL16')

# Load the notebook configuration. JSON is UTF-8 by specification, so the encoding is stated
# explicitly rather than inherited from the platform default (which is not UTF-8 everywhere)
CONFIG = None
with open("./config.json", "r", encoding="utf-8") as f:
    CONFIG = json.load(f)

## Aggregation by Geography Function

In [3]:
def get_table_by_geography(connection, config, est_table, geo_level, pivot=False, debug=False):
    """
    Get the input estimates table grouped by the input geography level

    The function inspects the columns of the estimates table, joins it with mgra_denormalize and
    with one dim table per "_id" column, and aggregates the value columns listed in
    config["est"][est_table] for each combination of geography, year and category.

    :param connection:  The connection to the relevant SQL server (AFAIK always DDAMWSQL16)
    :param config:      The config file. See "./config.json" for details. The keys read here are
                        config["est"][est_table] (a list of [column, aggregate function] entries)
                        and config["dim"][id column] (with "dim_table" and "column(s)" entries)
    :param est_table:   The name of the estimates table. This is the part after "dw_"
    :param geo_level:   The geography level to group by. This is a string input corresponding to one
                        of the column names of [demographic_warehouse].[dim].[mgra_denormalize]. For
                        example, this variable could contain "sra", "college", or "jurisdiction"
    :param pivot:       By default, False. If True, change the format of the table from being tall
                        to wide. For more details, see the bottom of the function for exactly what
                        is going on
    :param debug:       By default, False. If True, then print out diagnostic statements including
                        the complete SQL query used
    :returns:           Dataframe containing the requested table grouped by the geography level
    """
    # This variable is used to deal with the unique behavior of households table. We ignore the
    # household_size_id column and just group by the geography level
    households = (est_table == "households")

    # The basic format of every estimates table we are looking at. To use, call
    # EST_BASE_TABLE.format(table_name)
    EST_BASE_TABLE = "[estimates].[est_2020_06].[dw_{0}]"

    # The basic format of every dim table we are looking at. To use, call
    # DIM_BASE_TABLE.format(table_name)
    DIM_BASE_TABLE = "[demographic_warehouse].[dim].[{0}]"

    # Fully qualified name of the estimates table being queried
    est_base_table = EST_BASE_TABLE.format(est_table)

    # We need the columns that exist in the estimates table. TOP(0) returns no rows, only the
    # column headers
    COLUMNS = pd.read_sql_query(f"""
        SELECT TOP(0) *
        FROM {est_base_table}
    """, con=connection).columns
    if debug:
        print(f"{'Columns in estimates table:' : <32}", list(COLUMNS))
        print()

    # From the list of columns, we can find exactly which columns we want to be joining on. These
    # are the columns which end with "_id" but are not "mgra_id" nor "yr_id". For the households
    # table the only such column (household_size_id) is deliberately ignored
    if households:
        ID_COLUMNS = []
    else:
        ID_COLUMNS = [
            col for col in COLUMNS
            if col.endswith("_id") and col not in ("mgra_id", "yr_id")
        ]

    # The columns in the dim tables that contain the long form representations of the ids. For
    # example, in the dim table age_group, age_group_id=1 corresponds to name="Under 5", so we
    # want the "name" column as it is the most descriptive
    dim_named_cols = [
        f"{config['dim'][id_col]['dim_table']}.{config['dim'][id_col]['column(s)'][0]}"
        for id_col in ID_COLUMNS
    ]

    # The columns of the estimates table we are aggregating on and the function used to
    # aggregate. This information is contained in config["est"]
    agg_list = config["est"][est_table]
    agg_cols = [
        "{function}({col}) as {col}".format(function=aggregation[1], col=aggregation[0])
        for aggregation in agg_list
    ]
    if debug:
        print(f"{'Aggregation instructions:' : <32}", agg_list)
        print()

    # The list of INNER JOINs that add on each dim table to the estimates table. This information
    # is contained in config["dim"]
    # TODO: Are there null mgra_id values? May need to change to LEFT JOIN
    # Note, we always want to join on mgra_id, so add that to the list
    JOIN_COLS = ["mgra_id"] + ID_COLUMNS
    join_clauses = []
    for join_col in JOIN_COLS:
        dim_table = config["dim"][join_col]["dim_table"]
        join_clauses.append(f"""
INNER JOIN {DIM_BASE_TABLE.format(dim_table)} as {dim_table} ON
    {dim_table}.{join_col} = tbl.{join_col}
""")
    joins = "".join(join_clauses)
    if debug:
        print(f"{'Columns to join on:' : <32}", list(JOIN_COLS))
        print()

    # The id columns of the estimates table are included in GROUP BY / ORDER BY in order to keep
    # categorical variables in id order. Every id column is used (not only the first), so
    # tables with several categorical columns are ordered deterministically and a table without
    # any extra id column simply contributes nothing here
    id_cols = [f"tbl.{id_col}" for id_col in ID_COLUMNS]

    # Build each comma separated list with join() so that an empty piece (as in the households
    # table) can never leave a dangling "," in the query
    select_list = ", ".join([geo_level, "yr_id", *dim_named_cols, *agg_cols])
    group_list = ", ".join([geo_level, "yr_id", *id_cols, *dim_named_cols])
    order_list = ", ".join([geo_level, "yr_id", *id_cols])

    # Only keep the rows of the table where the geography level we are interested in is not NULL
    geography_filter = f"{geo_level} IS NOT NULL"

    query = f"""
SELECT {select_list}
FROM {est_base_table} as tbl
{joins}
WHERE {geography_filter}
GROUP BY {group_list}
ORDER BY {order_list}
"""
    if debug:
        print("*** FULL QUERY BELOW ***")
        print(query)
        print("*** END FULL QUERY ***")

    # Get the table into pandas
    table = pd.read_sql_query(query, con=connection)

    # Pivot the table if requested
    if pivot:
        # For every table, there are 1-3 categorical columns, and 1-4 value columns. Each unique
        # combination of all categorical columns and one value column will form a new column

        # First, create the list of index columns, categorical column(s), and value column(s)
        IND_COLS = [geo_level, "yr_id"]
        CAT_COLS = [config["dim"][col]["column(s)"][0] for col in ID_COLUMNS]
        VAL_COLS = [aggregation[0] for aggregation in agg_list]

        # Custom behavior for the age_sex_ethnicity table
        if est_table == "age_sex_ethnicity":
            IND_COLS += ["name", "sex"]
            CAT_COLS = ["long_name"]

        # Pivot the table. The aggregation is named by string because passing the builtin sum
        # is deprecated in recent pandas; the computed values are the same
        table = table.pivot_table(
            index=IND_COLS,
            columns=CAT_COLS,
            values=VAL_COLS,
            aggfunc="sum")  # Not used except for age_sex_ethnicity table

    # Return the table
    return table

In [4]:
# Demo: the "age" estimates table aggregated to the jurisdiction level (tall format, no pivot)
get_table_by_geography(connection=DDAM, config=CONFIG, est_table="age", geo_level="jurisdiction")

Unnamed: 0,jurisdiction,yr_id,name,population
0,Carlsbad,2010,Under 5,6236
1,Carlsbad,2010,5 to 9,7225
2,Carlsbad,2010,10 to 14,7351
3,Carlsbad,2010,15 to 17,4374
4,Carlsbad,2010,18 and 19,2047
...,...,...,...,...
4175,Vista,2020,65 to 69,4508
4176,Vista,2020,70 to 74,3478
4177,Vista,2020,75 to 79,2366
4178,Vista,2020,80 to 84,1706


## Consolidation Function

In [5]:
def consolidate(connection, config, geo_list=None, est_table_list=None):
    """
    Consolidate the input estimates tables column wise for each input geography. This function
    returns one dataframe per geography level, as opposed to combining everything together

    :param connection:      The connection to the relevant SQL server (AFAIK always DDAMWSQL16)
    :param config:          The config file. See "./config.json" for details
    :param geo_list:        The geographies to consolidate along. If omitted (None), defaults to
                            ["region", "jurisdiction", "cpa"]
    :param est_table_list:  The names of the estimates tables to consolidate (the part after
                            "dw_"). If omitted (None), defaults to ["age", "ethnicity",
                            "household_income", "households", "housing", "population", "sex"]
    :returns:               A list of dataframes, one per entry of geo_list and in the same order.
                            Each dataframe holds the pivoted estimates tables for that geography
                            placed side by side
    """
    # Defaults are created per call instead of being shared mutable default arguments
    if geo_list is None:
        geo_list = ["region", "jurisdiction", "cpa"]
    if est_table_list is None:
        est_table_list = [
            "age", "ethnicity", "household_income", "households", "housing", "population", "sex"
        ]

    # Store each consolidated table by geography level here
    combined_tables = []

    # Loop over the geography levels we want to consolidate on
    for geo in geo_list:

        # Each estimate table will create one df each of which has the same number of rows (one row
        # per unique geography region and year). Store them here to merge after
        est_tables = []

        # Loop over every estimate table we want to consolidate
        for est_table_name in est_table_list:

            # Get the estimate table
            est_table = get_table_by_geography(connection, config, est_table_name, geo, pivot=True)

            # Do some transformations to align the format with what we want in the csv
            # Similar to in get_table_by_geography, we have different behavior for the households
            # table as we ignore the column household_size_id. As a result, the table returned by
            # get_table_by_geography is already in the correct format
            est_table = est_table.reset_index()
            if est_table_name != "households":
                # Keep the first two labels (geography, yr_id) from column level 0 and take the
                # remaining labels from column level 1
                est_table.columns = est_table.columns.get_level_values(0)[:2].append(
                    est_table.columns.get_level_values(1)[2:])

            # Add the transformed estimate table to our list of tables
            est_tables.append(est_table)

        # Combine all the transformed estimate tables into one large table
        # NOTE(review): the tables are lined up by row position, which assumes every estimates
        # table yields the same geography/year rows in the same order -- confirm
        combined_table = pd.concat(est_tables, axis=1)

        # Since each of the estimates table has its own version of geo, "yr_id", remove those
        # duplicate columns (only the first column of each name is kept)
        combined_table = combined_table.loc[:, ~combined_table.columns.duplicated()]

        # Store the combined table
        combined_tables.append(combined_table)

    # Return all the combined tables
    # TODO: Save the tables somewhere?
    return combined_tables

In [6]:
# Demo: geo_list and est_table_list are left out, so both fall back to their default values.
# consolidate returns one table per geography level; only the jurisdiction table (index [1])
# is shown here
consolidate(connection=DDAM, config=CONFIG)[1]

Unnamed: 0,jurisdiction,yr_id,10 to 14,15 to 17,18 and 19,20 to 24,25 to 29,30 to 34,35 to 39,40 to 44,...,Single Family - Detached,Single Family - Multiple Unit,Single-family Attached,Single-family Detached,Group Quarters - College,Group Quarters - Military,Group Quarters - Other,Household Population,Female,Male
0,Carlsbad,2010,7351,4374,2047,4806,5986,6348,7595,8248,...,22004,5934,0,0,0,0,915,104413,53736,51592
1,Carlsbad,2011,7372,4510,2464,5387,6025,6464,7269,8321,...,22096,5995,0,0,0,0,915,105853,54437,52331
2,Carlsbad,2012,7276,4650,2675,6018,6009,6626,6837,8451,...,22263,6044,0,0,0,0,915,107357,55193,53079
3,Carlsbad,2013,7229,4684,2721,6593,6102,6779,6731,8377,...,22517,6088,0,0,0,0,915,108740,55752,53903
4,Carlsbad,2014,7049,4724,2718,7068,6121,6876,6747,8080,...,22823,6151,0,0,0,0,915,110202,56456,54661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,Vista,2016,7259,4671,2952,7488,6683,7125,6081,5853,...,14712,2251,0,0,0,0,2051,97084,49492,49643
205,Vista,2017,7664,4782,2903,8165,6819,7068,6339,5810,...,14834,2259,0,0,0,0,2059,99820,50811,51068
206,Vista,2018,7860,4851,2893,7867,6819,6779,6570,5745,...,14917,2271,0,0,0,0,2123,99861,50789,51195
207,Vista,2019,8038,4810,2939,7933,6699,6513,6719,5756,...,14941,2274,0,0,0,0,2142,99956,50918,51180
