# MGRA Series 13 Specific Outputs

In [1]:
# Record the wall-clock time at which the notebook started running. The final cell subtracts
# this value from the current time to print the total runtime
import time
start = time.time()

In [2]:
import textwrap
import pathlib

import pandas as pd
import sqlalchemy as sql

## Parameters and Queries

In [3]:
# SQLAlchemy engine used by every query in this notebook. The URL selects the mssql dialect with
# the pymssql driver and the host DDAMWSQL16; it carries no credentials and no default database,
# which is why each query below spells out [demographic_warehouse] explicitly
DDAM = sql.create_engine('mssql+pymssql://DDAMWSQL16/')

# Folder that receives every CSV written by this notebook, and the file name pattern used for the
# per-datasource, per-table downloads
SAVE_FOLDER = pathlib.Path("./mgra_13_outputs/")
FILE_TEMPLATE = "forecast_{dsid}_{table}.csv"

# Geography columns selected from [dim].[mgra_denormalize] for every table. The order matters:
# these become the leading columns of each saved file, and the aggregation step treats the first
# len(GEOGRAPHIES) columns of the consolidated file as geography columns
GEOGRAPHIES = ["mgra_id", "mgra", "cpa", "jurisdiction", "region"]

# SQL column list substituted for {geos} in the fact queries. Every column is qualified with the
# "mgra" table alias those queries give to mgra_denormalize. Previously only the first entry was
# qualified, which only worked because "mgra_id" (the one column that also exists on the fact
# table) happened to be first in GEOGRAPHIES
FORMATTED_GEOS = ", ".join(f"mgra.{geo}" for geo in GEOGRAPHIES)

# Value matched against mgra_denormalize.series, and the datasource_ids (forecasts) to export
# NOTE(review): the notebook title and SAVE_FOLDER both say "13" while MGRA_SERIES is 14 --
# confirm which series is intended
MGRA_SERIES = 14
DATASOURCE_IDS = [35, 38, 41, 42]

def _category_query(column, dim_table, tail=None):
    """Return the SQL that lists the distinct values of ``column`` in a [dim] table.

    ``column`` is the label column to read and ``dim_table`` the table name inside
    [demographic_warehouse].[dim]. ``tail`` is the list of clause lines placed after the FROM
    line; when it is omitted the values are ordered by ``column``.
    """
    if tail is None:
        tail = [f"ORDER BY {column}"]
    head = [
        f"SELECT DISTINCT {column}",
        f"FROM [demographic_warehouse].[dim].[{dim_table}]",
    ]
    return "\n".join(head + tail) + "\n"


# For each pivoted table, the query returning the category labels that become the pivot columns.
# The keys match the keys of the fact queries (the housing units table has no entry because it
# is not pivoted)
category_queries = {
    "age": _category_query("name", "age_group"),
    "ethnicity": _category_query("long_name", "ethnicity"),
    # NOTE(review): unlike the other queries this one has no ORDER BY, so the order of the
    # household income columns is whatever the server returns -- confirm that is intended
    "household_income": _category_query(
        "name",
        "income_group",
        tail=[
            "WHERE categorization = 10",
            "    AND constant_dollars_year = 2010",
        ],
    ),
    "housing": _category_query("long_name", "structure_type"),
    "jobs": _category_query("full_name", "employment_type"),
    "population": _category_query("long_name", "housing_type"),
    "sex": _category_query("sex", "sex"),
}

def _pivot_query(fact_table, dim_table, label_column, value_column):
    """Return the query template that pivots one fact table into one column per category.

    The inner SELECT joins [fact].[fact_table] to mgra_denormalize (aliased ``mgra``) and to
    [dim].[dim_table] on ``<dim_table>_id``; the PIVOT then sums ``value_column`` for each value
    of ``label_column``. The returned text still contains the placeholders {geos},
    {mgra_series}, {dsid} and {categories}, which are filled in later with ``str.format``.
    """
    lines = [
        "SELECT * FROM (",
        "    SELECT {geos}, yr_id, " + f"{dim_table}.[{label_column}], [{value_column}] ",
        f"    FROM [demographic_warehouse].[fact].[{fact_table}] as tbl",
        "    INNER JOIN [demographic_warehouse].[dim].[mgra_denormalize] AS mgra",
        "        ON mgra.mgra_id = tbl.mgra_id",
        "        AND mgra.series = {mgra_series}",
        f"    INNER JOIN [demographic_warehouse].[dim].[{dim_table}] as {dim_table}",
        f"        ON {dim_table}.{dim_table}_id = tbl.{dim_table}_id",
        "    WHERE tbl.datasource_id = {dsid}) as p",
        "PIVOT (",
        f"    SUM([{value_column}])",
        f"    FOR [{label_column}] IN (",
        "        {categories}",
        "    )",
        ") as pivot_table",
        "ORDER BY mgra_id, yr_id",
    ]
    return "\n".join(lines) + "\n"


# The housing units totals are plain sums per geography and year, so this query needs no PIVOT
# and has no {categories} placeholder
_HOUSING_UNITS_QUERY = "\n".join([
    "SELECT {geos}, yr_id, ",
    "    SUM([units]) as units, ",
    "    SUM([unoccupiable]) as unoccupiable, ",
    "    SUM([occupied]) as occupied, ",
    "    SUM([vacancy]) as vacancy",
    "FROM [demographic_warehouse].[fact].[housing] as tbl",
    "INNER JOIN [demographic_warehouse].[dim].[mgra_denormalize] AS mgra",
    "    ON mgra.mgra_id = tbl.mgra_id",
    "    AND mgra.series = {mgra_series}",
    "WHERE tbl.datasource_id = {dsid}",
    "GROUP BY {geos}, yr_id",
    "ORDER BY {geos}, yr_id",
]) + "\n"

# Query template for every output table, keyed by the table name used in the output file names.
# The dictionary order is the order in which the tables are downloaded
fact_queries = {
    "age": _pivot_query("age", "age_group", "name", "population"),
    "ethnicity": _pivot_query("ethnicity", "ethnicity", "long_name", "population"),
    "household_income": _pivot_query("household_income", "income_group", "name", "households"),
    "housing": _pivot_query("housing", "structure_type", "long_name", "units"),
    "housing_units": _HOUSING_UNITS_QUERY,
    "jobs": _pivot_query("jobs", "employment_type", "full_name", "jobs"),
    "population": _pivot_query("population", "housing_type", "long_name", "population"),
    "sex": _pivot_query("sex", "sex", "sex", "population"),
}

## Getting the Data

In [4]:
# Download every (datasource, table) combination that has not been saved yet.

# DataFrame.to_csv does not create missing directories, so make sure the output folder exists
# before the first download is written
SAVE_FOLDER.mkdir(parents=True, exist_ok=True)

for datasource_id in DATASOURCE_IDS:
    for table_name, query in fact_queries.items():

        # Get and save the file
        print(f"Getting datasource_id={datasource_id}, table={table_name}")

        # Skip the file if it exists already, which lets the notebook be re-run cheaply
        file_name = SAVE_FOLDER / FILE_TEMPLATE.format(dsid=datasource_id, table=table_name)
        if file_name.is_file():
            print("File already exists, skipping...")

        # If the file does not exist then download and save
        else:
            print("Getting table from DDAMWSQL16")

            # Placeholders shared by every fact query
            query_args = {
                "geos": FORMATTED_GEOS,
                "mgra_series": MGRA_SERIES,
                "dsid": datasource_id,
            }

            # The housing units table is already in pivot table format, no need to get the
            # categorical variables. Every other table needs its category labels, which become
            # the pivoted column names
            if table_name != "housing_units":
                category_names = pd.read_sql_query(
                    category_queries[table_name], con=DDAM).iloc[:, 0]

                # Bracket-quote each label as a SQL Server identifier. A literal "]" inside a
                # label has to be doubled, otherwise it would end the identifier early
                query_args["categories"] = ", ".join(
                    "[" + str(name).replace("]", "]]") + "]" for name in category_names)

            # Actually get the table
            table = pd.read_sql_query(query.format(**query_args), con=DDAM)

            # The jobs table contains a bunch of extra years, remove them. The years to keep are
            # the ones present in the age table for the datasource being downloaded (this was
            # previously hard-coded to datasource 41 for every datasource)
            if table_name == "jobs":
                years = pd.read_sql_query(textwrap.dedent(f"""\
                    SELECT DISTINCT yr_id
                    FROM [demographic_warehouse].[fact].[age]
                    WHERE datasource_id = {datasource_id}
                    """), con=DDAM)
                table = table[table["yr_id"].isin(years["yr_id"])]

            table.to_csv(file_name, index=False)
            print("Completed")

        print()

Getting datasource_id=35, table=age
File already exists, skipping...

Getting datasource_id=35, table=ethnicity
File already exists, skipping...

Getting datasource_id=35, table=household_income
File already exists, skipping...

Getting datasource_id=35, table=housing
File already exists, skipping...

Getting datasource_id=35, table=housing_units
File already exists, skipping...

Getting datasource_id=35, table=jobs
File already exists, skipping...

Getting datasource_id=35, table=population
File already exists, skipping...

Getting datasource_id=35, table=sex
File already exists, skipping...

Getting datasource_id=38, table=age
File already exists, skipping...

Getting datasource_id=38, table=ethnicity
File already exists, skipping...

Getting datasource_id=38, table=household_income
File already exists, skipping...

Getting datasource_id=38, table=housing
File already exists, skipping...

Getting datasource_id=38, table=housing_units
File already exists, skipping...

Getting datasour

## Verifying the Data

In [5]:
# Get all the downloaded CSV files in the save folder (searched recursively).
#
# It is possible that consolidated/aggregated files already exist in the folder; their names
# contain the string "ind", so those are filtered out. The test is made against the file name
# only, so that a parent folder whose name happens to contain "ind" cannot hide every file.
#
# The list is sorted because the consolidation step concatenates the tables in this order, which
# makes the column order of the consolidated file the same on every run and platform
files = sorted(
    f for f in SAVE_FOLDER.glob("**/*.csv")
    if f.is_file() and "ind" not in f.name
)

In [6]:
# Validate the shape of every downloaded file: each one must hold exactly one row for every
# combination of mgra_id and yr_id
for file in files:
    print(f"Checking {file}")
    table = pd.read_csv(file)

    # Messages for the checks that failed, kept in the order in which the checks run
    problems = []

    # Number of distinct keys. Missing values count as a key of their own, exactly as they do
    # when taking the length of Series.unique()
    num_mgra_id = table["mgra_id"].nunique(dropna=False)
    num_yr_id = table["yr_id"].nunique(dropna=False)

    # Check 1: the number of rows is the number of distinct mgra_ids multiplied by the number of
    # distinct yr_ids
    if num_mgra_id * num_yr_id != table.shape[0]:
        problems.append(textwrap.dedent(f"""\
            {file} has {num_mgra_id} distinct mgra_ids and {num_yr_id} distinct num_yr_ids, so it 
            should have {num_mgra_id} x {num_yr_id} = {num_mgra_id * num_yr_id} rows of data. 
            However, {file} only has {table.shape[0]} rows of data.
            """).replace("\n", "").replace("\r", ""))

    # Check 2: every mgra_id occurs in exactly num_yr_id rows
    count = table["mgra_id"].value_counts().eq(num_yr_id).sum()
    if count != num_mgra_id:
        problems.append(textwrap.dedent(f"""\
            Each unique mgra_id should appear once for each distinct year, or {num_yr_id} times. 
            However, this only occurs for {count} mgra_ids instead of {num_mgra_id} mgra_ids.
            """).replace("\n", ""))

    # Check 3: every yr_id occurs in exactly num_mgra_id rows
    count = table["yr_id"].value_counts().eq(num_mgra_id).sum()
    if count != num_yr_id:
        problems.append(textwrap.dedent(f"""\
            Each unique yr_id should appear once for each distinct mgra_id, or {num_mgra_id} times. 
            However, this only occurs for {count} yr_ids instead of {num_yr_id} yr_ids.
            """).replace("\n", ""))

    # Report every failure, or say explicitly that the file is clean
    for message in problems:
        print(message)
    if not problems:
        print("No Errors")
    print()
    

Checking mgra_13_outputs\forecast_35_age.csv
No Errors

Checking mgra_13_outputs\forecast_35_ethnicity.csv
No Errors

Checking mgra_13_outputs\forecast_35_household_income.csv
No Errors

Checking mgra_13_outputs\forecast_35_housing.csv
No Errors

Checking mgra_13_outputs\forecast_35_housing_units.csv
No Errors

Checking mgra_13_outputs\forecast_35_jobs.csv
No Errors

Checking mgra_13_outputs\forecast_35_population.csv
No Errors

Checking mgra_13_outputs\forecast_35_sex.csv
No Errors

Checking mgra_13_outputs\forecast_38_age.csv
No Errors

Checking mgra_13_outputs\forecast_38_ethnicity.csv
No Errors

Checking mgra_13_outputs\forecast_38_household_income.csv
No Errors

Checking mgra_13_outputs\forecast_38_housing.csv
No Errors

Checking mgra_13_outputs\forecast_38_housing_units.csv
No Errors

Checking mgra_13_outputs\forecast_38_jobs.csv
No Errors

Checking mgra_13_outputs\forecast_38_population.csv
No Errors

Checking mgra_13_outputs\forecast_38_sex.csv
No Errors

Checking mgra_13_outpu

## Consolidating the files

In [7]:
# Merge the per-table files of each datasource into one wide file with one row per
# (mgra_id, yr_id)

# Columns that identify a row in every per-table file. They are kept once, from the first table
key_columns = GEOGRAPHIES + ["yr_id"]

for datasource_id in DATASOURCE_IDS:

    # Select this datasource's files by their exact expected names. Testing whether the id is a
    # substring of the path would also match other ids (35 inside 135) and any digits in the
    # folder name (13 inside "mgra_13_outputs")
    expected_names = {
        FILE_TEMPLATE.format(dsid=datasource_id, table=table_name)
        for table_name in fact_queries
    }
    ds_files = [f for f in files if f.name in expected_names]
    if not ds_files:
        raise FileNotFoundError(
            f"No downloaded files found in {SAVE_FOLDER} for datasource_id={datasource_id}")

    # Get the tables
    tables = [pd.read_csv(file) for file in ds_files]

    # The tables are glued together side by side purely by row position, so every table must
    # have exactly the same key columns in exactly the same row order. Fail loudly instead of
    # writing a silently misaligned file
    reference_keys = tables[0][key_columns]
    for file, table in zip(ds_files[1:], tables[1:]):
        if not table[key_columns].equals(reference_keys):
            raise ValueError(
                f"{file} does not have the same {', '.join(key_columns)} rows, in the same "
                f"order, as {ds_files[0]}; the files cannot be consolidated by position")

    # Keep the key columns of the first table and append only the data columns of the others
    consolidated = pd.concat(
        [tables[0]] + [table.drop(columns=key_columns) for table in tables[1:]],
        axis=1)

    consolidated.to_csv(SAVE_FOLDER / f"forecast_{datasource_id}_mgra_id_ind.csv", index=False)

## Aggregating from mgra_id to mgra, cpa, jurisdiction, region

In [8]:
# Roll each consolidated mgra_id-level file up to every coarser geography

# Aggregate up to every level except mgra_id, which is the level the data is already at
aggregate_geo_list = list(GEOGRAPHIES)
aggregate_geo_list.remove("mgra_id")

for datasource_id in DATASOURCE_IDS:
    consolidated = pd.read_csv(SAVE_FOLDER / f"forecast_{datasource_id}_mgra_id_ind.csv")

    print(f"Aggregating forecast {datasource_id} from \"mgra_id\" to {', '.join(aggregate_geo_list)}")

    # Everything after the leading geography columns is data to be summed
    # NOTE: This includes "yr_id", which is used as a grouping key below rather than summed
    data_cols = list(consolidated.columns[len(GEOGRAPHIES):])

    for agg_geo in aggregate_geo_list:

        # Keep only the target geography and the data columns, then total the data for each
        # geography value and year
        selected = consolidated[[agg_geo] + data_cols]
        aggregated = selected.groupby([agg_geo, "yr_id"]).sum().reset_index()

        # Save the aggregated file with an appropriate name
        aggregated.to_csv(SAVE_FOLDER / f"forecast_{datasource_id}_{agg_geo}_ind.csv", index=False)

    print("Completed")
    print()

Aggregating forecast 35 from "mgra_id" to mgra, cpa, jurisdiction, region
Completed

Aggregating forecast 38 from "mgra_id" to mgra, cpa, jurisdiction, region
Completed

Aggregating forecast 41 from "mgra_id" to mgra, cpa, jurisdiction, region
Completed

Aggregating forecast 42 from "mgra_id" to mgra, cpa, jurisdiction, region
Completed



In [9]:
# Report how long the notebook took, measured from the start time recorded in the first cell
runtime = time.time() - start
minutes = runtime // 60
seconds = runtime % 60
print(f'Runtime: {int(minutes)} minutes, {(int(seconds))} seconds')

Runtime: 1 minutes, 3 seconds
