# Estimate Tests

In [1]:
# For config file
import json

import pandas as pd
import sqlalchemy as sql

In [2]:
# SQLAlchemy engine for the SQL Server instance that is queried for the estimates and dim tables
DDAM = sql.create_engine('mssql+pymssql://DDAMWSQL16')

# Parsed contents of ./config.json (path is relative to the notebook's working directory).
# The file is opened explicitly as UTF-8, the standard JSON encoding, so the result does not
# depend on the platform's default locale encoding.
with open("./config.json", "r", encoding="utf-8") as config_file:
    CONFIG = json.load(config_file)

Below is the function I wrote that returns estimates data aggregated to the specified geography level.

In [38]:
def get_table_by_geography(connection, config, est_table, geo_level, debug=False):
    """
    Get the input estimates table grouped by the input geography level

    The estimates table is inspected for its columns, joined to the dim tables named in the
    config, and its configured value columns are aggregated by geography level, year, and (when
    the table has one) its first categorical "_id" column.

    NOTE: est_table, geo_level, and the config values are interpolated directly into the SQL
    text, so they must come from a trusted source.

    :param connection:  The connection to the relevant SQL server (AFAIK always DDAMWSQL16)
    :param config:      The config file. See "./config.json" for details. This function reads
                        config["est"][est_table], a list of [column, SQL aggregate function]
                        pairs, and config["dim"][<id column>], which holds "dim_table" and
                        "column(s)"
    :param est_table:   The name of the estimates table. This is the part after "dw_"
    :param geo_level:   The geography level to group by. This is a string input corresponding to one
                        of the column names of [demographic_warehouse].[dim].[mgra_denormalize]. For
                        example, this variable could contain "sra", "college", or "jurisdiction"
    :param debug:       By default, False. If True, then print out diagnostic statements
    :returns:           Dataframe containing the requested table grouped by the geography level
    :raises ValueError: If config["est"][est_table] contains no aggregation instructions
    """
    # The basic format of every table we are looking at. To use, call
    # EST_BASE_TABLE.format(<TABLE NAME>)
    EST_BASE_TABLE = "[estimates].[est_2020_06].[dw_{0}]"

    # The basic format of every dim table we are looking at. To use, call:
    # DIM_BASE_TABLE.format(<TABLE NAME>)
    DIM_BASE_TABLE = "[demographic_warehouse].[dim].[{0}]"

    # Create the basic structure of the SQL query. The three column lists are assembled with
    # ", ".join(...) below, so they can never start or end with a stray ","
    query = """
SELECT {select_list}
FROM {est_base_table} as tbl
{joins}
WHERE {geography_filter}
GROUP BY {group_list}
ORDER BY {order_list}
"""

    # The field {est_base_table} is asking for the fully qualified name of the estimates table
    est_base_table = EST_BASE_TABLE.format(est_table)

    # We additionally need the columns that exist in the estimates table. TOP(0) returns the
    # column names without fetching any rows
    COLUMNS = pd.read_sql_query(f"""
        SELECT TOP(0) *
        FROM {est_base_table}
    """, con=connection).columns
    if debug:
        print(f"{'Columns in estimates table:' : <32}", list(COLUMNS))
        print()

    # From the list of columns, we can find exactly which columns we want to be joining on. These
    # are the columns which end with "_id" but are not "mgra_id" nor "yr_id"
    ID_COLUMNS = [
        col for col in COLUMNS
        if col.endswith("_id") and col not in ("mgra_id", "yr_id")
    ]

    # The aggregation instructions: which columns of the estimates table to aggregate and the
    # SQL function used to aggregate each of them. This information is contained in config["est"]
    agg_list = config["est"][est_table]
    if not agg_list:
        # Without at least one aggregate the SELECT list would be malformed, so fail early with
        # a clear message instead of sending broken SQL to the server
        raise ValueError(f"No aggregation instructions configured for estimates table '{est_table}'")
    agg_exprs = [
        "{function}({col}) as {col}".format(function=aggregation[1], col=aggregation[0])
        for aggregation in agg_list
    ]
    if debug:
        print(f"{'Aggregation instructions:' : <32}", config["est"][est_table])
        print()

    # Every query is grouped and ordered by the geography column ("sra", "college",
    # "jurisdiction", etc.) and the year
    select_parts = [geo_level, "yr_id"]
    group_parts = [geo_level, "yr_id"]
    order_parts = [geo_level, "yr_id"]

    # If the estimates table has a categorical id column, also select the long form
    # representation of that id from its dim table. For example, in the dim table age_group,
    # age_group_id=1 corresponds to name="Under 5", so we want the "name" column as it is the most
    # descriptive. Grouping/ordering by the id keeps categorical variables in a stable order.
    # Tables without any such id column are simply aggregated by geography and year.
    # TODO: What if the dim table has multiple columns we are looking for?
    # TODO: This assumes there is only one categorical id column; only the first one is used
    if ID_COLUMNS:
        primary_id = ID_COLUMNS[0]
        dim_named_col = config["dim"][primary_id]["column(s)"][0]
        select_parts.append(dim_named_col)
        group_parts.extend([f"tbl.{primary_id}", dim_named_col])
        order_parts.append(f"tbl.{primary_id}")
    select_parts.extend(agg_exprs)

    # The field {joins} is asking for formatted list of INNER JOINs that add on each dim table to
    # the estimates table. This information is contained in config["dim"]
    # TODO: Are there null mgra_id values? May need to change to LEFT JOIN
    # Note, we always want to join on mgra_id, so add that to the list
    JOIN_COLS = ["mgra_id"] + ID_COLUMNS
    join_clauses = []
    for join_col in JOIN_COLS:
        dim_table = config["dim"][join_col]["dim_table"]
        join_clauses.append(f"""
INNER JOIN {DIM_BASE_TABLE.format(dim_table)} as {dim_table} ON
    {dim_table}.{join_col} = tbl.{join_col}
""")
    joins = "".join(join_clauses)
    if debug:
        print(f"{'Columns to join on:' : <32}", list(JOIN_COLS))
        print()

    # The field {geography_filter} is asking for the conditional where we only get the rows of the
    # table where the geography level we are interested in is not NULL
    geography_filter = f"{geo_level} IS NOT NULL"

    # Fill in the blanks of the query
    query = query.format(
        select_list=", ".join(select_parts),
        est_base_table=est_base_table,
        joins=joins,
        geography_filter=geography_filter,
        group_list=", ".join(group_parts),
        order_list=", ".join(order_parts),
    )
    if debug:
        print("*** FULL QUERY BELOW ***")
        print(query)
        print("*** END FULL QUERY ***")

    # Get the table into pandas
    table = pd.read_sql_query(query, con=connection)
    return table

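Here is some testing I did of the function. The call below runs it on the housing estimates table at the jurisdiction level with debug output enabled, so the diagnostics and the generated SQL query are printed before the resulting table.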

In [39]:
# Housing estimates rolled up to jurisdictions, with diagnostics printed
get_table_by_geography(
    DDAM,
    CONFIG,
    est_table="housing",
    geo_level="jurisdiction",
    debug=True,
)

Columns in estimates table:      ['mgra_id', 'yr_id', 'structure_type_id', 'units', 'unoccupiable', 'occupied', 'vacancy']

Aggregation instructions:        [['units', 'SUM'], ['unoccupiable', 'SUM'], ['occupied', 'SUM'], ['vacancy', 'SUM']]

Columns to join on:              ['mgra_id', 'structure_type_id']

*** FULL QUERY BELOW ***

SELECT jurisdiction, yr_id, long_name, SUM(units) as units, SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
FROM [estimates].[est_2020_06].[dw_housing] as tbl

INNER JOIN [demographic_warehouse].[dim].[mgra_denormalize] as mgra_denormalize ON
    mgra_denormalize.mgra_id = tbl.mgra_id

INNER JOIN [demographic_warehouse].[dim].[structure_type] as structure_type ON
    structure_type.structure_type_id = tbl.structure_type_id

WHERE jurisdiction IS NOT NULL
GROUP BY jurisdiction, yr_id, tbl.structure_type_id, long_name
ORDER BY jurisdiction, yr_id, tbl.structure_type_id

*** END FULL QUERY ***


Unnamed: 0,jurisdiction,yr_id,long_name,units,unoccupiable,occupied,vacancy
0,Carlsbad,2010,Single Family - Detached,23359,1034,22004,1355
1,Carlsbad,2010,Single Family - Multiple Unit,6350,290,5934,416
2,Carlsbad,2010,Multifamily,13221,703,12249,972
3,Carlsbad,2010,Mobile Home,1299,120,1158,141
4,Carlsbad,2010,Single-family Detached,0,0,0,0
...,...,...,...,...,...,...,...
1249,Vista,2020,Single Family - Multiple Unit,2363,43,2298,65
1250,Vista,2020,Multifamily,12550,236,12226,324
1251,Vista,2020,Mobile Home,1998,47,1948,50
1252,Vista,2020,Single-family Detached,0,0,0,0
