# Example Usage

In [1]:
# Record the wall-clock start time of the notebook run. The "Runtime" cell at the end of this
# notebook subtracts this value from the current time to print the total elapsed time.
import time
start = time.time()

In [2]:
# Library imports
import pathlib

import numpy as np
import pandas as pd

# Local modules
import functions as f

In [3]:
# Geography levels to run on. Any additional geography can be enabled simply by adding its name
# to the list below.
# NOTE: Including "mgra" may result in extremely long processing times. As far as the original
# author could tell, this is due to SQL Server taking an extremely long time to process the
# age_sex_ethnicity table at the mgra level. On one laptop (Eric's Surface), simply getting/saving
# the age_ethnicity and age_sex_ethnicity tables at the mgra level took about 20 minutes each.
GEOGRAPHIES = [
    "region",
    "jurisdiction",
    "cpa",
    "mgra",
]

# Estimates tables to run on, split across three variables:
# * EST_TABLES: tables whose key columns are ONLY geography name and year
# * AGE_SEX_ETHNICITY: tables whose key columns carry additional information, such as an age
#   category or an ethnicity category
# * CONSOLIDATED: the name of the consolidated file
EST_TABLES = [
    "age",
    "ethnicity",
    "household_income",
    "households",
    "housing",
    "population",
    "sex",
]
AGE_SEX_ETHNICITY = [
    "age_ethnicity",
    "age_sex_ethnicity",
]
CONSOLIDATED = [
    "consolidated",
]

# Estimates/DOF vintages to run on.
# * NEW_VINTAGE: the vintage that checks are run on by default
# * OLD_VINTAGE: only used when creating diff files, which contain NEW_VINTAGE - OLD_VINTAGE
# * DOF_VINTAGE: used for downloading DOF data and for comparing region level population values
#   in Check 6
NEW_VINTAGE = "2021_01"
OLD_VINTAGE = "2020_06"
DOF_VINTAGE = "2021_07_14"

# Default locations that files are saved to and loaded from.
# NOTE: The save function creates these folders, so there is no need to create them by hand.
RAW_FOLDER = pathlib.Path("./data/raw_data/")
DIFF_FOLDER = pathlib.Path("./data/diff/")
PROP_FOLDER = pathlib.Path("./data/proportion/")
OUTPUTS_FOLDER = pathlib.Path("./data/outputs/")

In [4]:
# Import the module that allows us to create tables
import generate_tables as gt

### Creating Estimates Tables (generate_tables.EstimatesTables)

In [5]:
# Create the individual files for NEW_VINTAGE: one file per unique combination of geography and
# Estimates table.
# The result is bound to "_" purely to suppress the cell output.
tables_to_pull = EST_TABLES + AGE_SEX_ETHNICITY
_ = gt.EstimatesTables().individual(
    est_vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=tables_to_pull,
    save=True,
    save_folder=RAW_FOLDER,
    overwrite=False,
)

In [6]:
# Create the consolidated files for NEW_VINTAGE: one file per geography, each containing all of
# the requested tables.
# NOTE: get_from_file=True allows the function to pull data from the already downloaded Estimates
# tables, instead of re-downloading them and holding them in memory.
consolidate_options = {
    "est_vintage": NEW_VINTAGE,
    "geo_list": GEOGRAPHIES,
    "est_table_list": EST_TABLES,
    "get_from_file": True,
    "raw_folder": RAW_FOLDER,
    "save": True,
    "save_folder": RAW_FOLDER,
}
# Bound to "_" to suppress the cell output
_ = gt.EstimatesTables().consolidate(**consolidate_options)

### Creating DOF Data Files (generate_tables.CA_DOF)

In [7]:
# Create the DOF data files in RAW_FOLDER. Per the original notes, these are:
# 1. Region level population in each year
# 2. Region level population in each year broken down by age/sex/ethnicity
dof_tables = gt.CA_DOF()
dof_tables.get_CA_DOF_region_pop(dof_vintage=DOF_VINTAGE, save_folder=RAW_FOLDER)

### Creating Diff Files (generate_tables.DiffFiles)

In [8]:
# Diff files also need the OLD_VINTAGE data, so first create the individual and consolidated
# files again using the same parameters as before, with OLD_VINTAGE in place of NEW_VINTAGE.
# Both results are bound to "_" to suppress the cell output.
old_vintage_tables = EST_TABLES + AGE_SEX_ETHNICITY
_ = gt.EstimatesTables().individual(
    est_vintage=OLD_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=old_vintage_tables,
    save=True,
    save_folder=RAW_FOLDER,
    overwrite=False,
)
_ = gt.EstimatesTables().consolidate(
    est_vintage=OLD_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    get_from_file=True,
    raw_folder=RAW_FOLDER,
    save=True,
    save_folder=RAW_FOLDER,
)

In [9]:
# Create diff files
# NOTE: This will output xlsx files with sheets containing OLD_VINTAGE data, NEW_VINTAGE data, and
# NEW_VINTAGE-OLD_VINTAGE data. This is NOT percent difference, but numeric difference
# NOTE: Because of the way that the diff files are created, python is required to hold essentially
# three full tables in memory at the same time. For the "mgra" level and large tables such as
# consolidated or age_sex_ethnicity, these result in a lack of memory and impossibly long run times.
# "mgra" is thus removed from diff files
# The geography list is filtered rather than using list.remove(), so this cell still runs when
# "mgra" has been left out of GEOGRAPHIES (list.remove() raises ValueError for a missing item).
NO_MGRA = [geo for geo in GEOGRAPHIES if geo != "mgra"]
# "_ =" to suppress output
_ = gt.DiffFiles().create_diff_tables(
        old_vintage=OLD_VINTAGE,
        new_vintage=NEW_VINTAGE,
        geo_list=NO_MGRA,
        est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
        raw_data_folder=RAW_FOLDER,
        save=True,
        save_folder=DIFF_FOLDER)

### Creating Proportion Files (generate_tables.ProportionFiles)

In [10]:
# Create the proportion files for the Estimates tables. Only the region level is requested here.
proportion_tables = [
    "age",
    "sex",
    "ethnicity",
    "household_income",
    "age_ethnicity",
    "age_sex_ethnicity",
]
est_proportions = gt.ProportionFiles()
est_proportions.create_est_proportion_tables(
    est_vintage=NEW_VINTAGE,
    geo_list=["region"],
    est_table_list=proportion_tables,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=PROP_FOLDER,
)

In [11]:
# Create the proportion files for the DOF tables, reading from RAW_FOLDER and saving the result
# into PROP_FOLDER
dof_proportions = gt.ProportionFiles()
dof_proportions.create_DOF_proportion_table(
    DOF_vintage=DOF_VINTAGE,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=PROP_FOLDER,
)

## Run Checks

In [12]:
# Import the module that allows us to run checks
import perform_checks as pc

### Run Check 1

In [13]:
# Run internal consistency checks by geography level
# NOTE: Due to a limitation of the function, "region" cannot be included in the input geo_list
# CAVEAT: If new geographies were added to GEOGRAPHIES, make sure that aggregation instructions are
# updated in the variable InternalConsistency()._geography_aggregation. For more details on how,
# see the class docstring
# The geography list is filtered rather than using list.remove(), so this cell still runs when
# "region" has been left out of GEOGRAPHIES (list.remove() raises ValueError for a missing item).
NO_REGION = [geo for geo in GEOGRAPHIES if geo != "region"]
pc.InternalConsistency().check_geography_aggregations(
    vintage=NEW_VINTAGE,
    geo_list=NO_REGION,
    est_table="consolidated",
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 1: Check aggregated values between geography levels


ProgrammingError: (pymssql._pymssql.ProgrammingError) (207, b"Invalid column name 'luz'.DB-Lib error message 20018, severity 16:\nGeneral SQL Server error: Check messages from the SQL Server\n")
[SQL: 
SELECT mgra, luz, cpa, jurisdiction, region
FROM [demographic_warehouse].[dim].[mgra_denormalize]
WHERE series=14
]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [14]:
# Run the internal consistency checks between Estimates tables, for every geography in
# GEOGRAPHIES. The meaning of "est_table_types" is documented in the function signature.
table_types_to_check = ["population", "households"]
consistency_checker = pc.InternalConsistency()
consistency_checker.check_internal_aggregations(
    vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_types=table_types_to_check,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 1: Check aggregated values between Estimates tables
Checking Estimates tables with population values at the region level
No errors

Checking Estimates tables with population values at the jurisdiction level
No errors

Checking Estimates tables with population values at the cpa level
No errors

Checking Estimates tables with population values at the mgra level


KeyError: "['Total Population'] not found in axis"

### Run Check 2

In [15]:
# Run the spot nulls check.
# NOTE: CONSOLIDATED is not passed in explicitly, because every table that is combined into
# CONSOLIDATED is already represented in EST_TABLES.
tables_to_scan = EST_TABLES + AGE_SEX_ETHNICITY
null_checker = pc.NullValues()
null_checker.spot_nulls(
    vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=tables_to_scan,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 2: Spot Nulls
Checking QA_2021_01_region_age.
No errors

Checking QA_2021_01_region_ethnicity.
No errors

Checking QA_2021_01_region_household_income.
No errors

Checking QA_2021_01_region_households.
No errors

Checking QA_2021_01_region_housing.
No errors

Checking QA_2021_01_region_population.
No errors

Checking QA_2021_01_region_sex.
No errors

Checking QA_2021_01_region_consolidated.
No errors

Checking QA_2021_01_region_age_ethnicity.
No errors

Checking QA_2021_01_region_age_sex_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_age.
No errors

Checking QA_2021_01_jurisdiction_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_household_income.
No errors

Checking QA_2021_01_jurisdiction_households.
No errors

Checking QA_2021_01_jurisdiction_housing.
No errors

Checking QA_2021_01_jurisdiction_population.
No errors

Checking QA_2021_01_jurisdiction_sex.
No errors

Checking QA_2021_01_jurisdiction_consolidated.
No errors

Checking QA_2021_01_jurisdiction_a

In [16]:
# Run spot missing geographies/years
# This test should always cover the "mgra" geography, so "mgra" is added on ONLY when it is
# missing from the geography list. Appending it unconditionally would list "mgra" twice whenever
# GEOGRAPHIES already contains it.
geos_with_mgra = GEOGRAPHIES if "mgra" in GEOGRAPHIES else GEOGRAPHIES + ["mgra"]
pc.NullValues().spot_missing_values(
    vintage=NEW_VINTAGE,
    geo_list=geos_with_mgra,
    est_table_list=EST_TABLES,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 2: Spot Missing Geographies/Years
Checking QA_2021_01_region_age.
No Errors

Checking QA_2021_01_region_ethnicity.
No Errors

Checking QA_2021_01_region_household_income.
No Errors

Checking QA_2021_01_region_households.
No Errors

Checking QA_2021_01_region_housing.
No Errors

Checking QA_2021_01_region_population.
No Errors

Checking QA_2021_01_region_sex.
No Errors

Checking QA_2021_01_jurisdiction_age.
No Errors

Checking QA_2021_01_jurisdiction_ethnicity.
No Errors

Checking QA_2021_01_jurisdiction_household_income.
No Errors

Checking QA_2021_01_jurisdiction_households.
No Errors

Checking QA_2021_01_jurisdiction_housing.
No Errors

Checking QA_2021_01_jurisdiction_population.
No Errors

Checking QA_2021_01_jurisdiction_sex.
No Errors

Checking QA_2021_01_cpa_age.
No Errors

Checking QA_2021_01_cpa_ethnicity.
No Errors

Checking QA_2021_01_cpa_household_income.
No Errors

Checking QA_2021_01_cpa_households.
No Errors

Checking QA_2021_01_cpa_housing.
No Errors

Chec

### Run Check 3

In [17]:
# N/A, done already by generate_tables.DiffFiles

### Run Check 4

In [18]:
# Check that the Estimates values do not change by too much from one year to the next.
# The check is restricted to the region level, so that small populations with high variability
# do not distort the results. If "region" is not in GEOGRAPHIES, only a reminder is printed.
if "region" not in GEOGRAPHIES:
    print("This check should only be run at the region level.")
else:
    threshold_checker = pc.ThresholdAnalysis()
    threshold_checker.check_thresholds(
        threshold=5,
        vintage=NEW_VINTAGE,
        geo_list=["region"],
        est_table_list=EST_TABLES,
        raw_folder=RAW_FOLDER,
        save=True,
        save_location=OUTPUTS_FOLDER,
    )

Running check 4: Threshold Analysis
Checking file QA_2021_01_region_age.
Errors have occurred on the following rows:
       region  yr_id  Under 5  |% Diff| Under 5  5 to 9  |% Diff| 5 to 9  \
0   San Diego   2010   203423               NaN  194029              NaN   
1   San Diego   2011   209939          3.203178  197889         1.989393   
2   San Diego   2012   215677          2.733175  203754         2.963783   
3   San Diego   2013   219753          1.889863  210910         3.512078   
4   San Diego   2014   222064          1.051635  216798         2.791712   
5   San Diego   2015   223794          0.779055  221228         2.043377   
6   San Diego   2016   223203          0.264082  224837         1.631349   
7   San Diego   2017   221450          0.785384  227947         1.383224   
8   San Diego   2018   218753          1.217882  229178         0.540038   
9   San Diego   2019   213481          2.410024  230127         0.414089   
10  San Diego   2020   205904          3.549262

In [19]:
# Check that the age_ethnicity and age_sex_ethnicity Estimates values do not change by too much
# from one year to the next.
# As with the previous threshold check, this should only be run at the region level. Even so,
# breaking the data down by age/sex/ethnicity or just age/ethnicity greatly lowers the population
# in each category and increases volatility. As such, the threshold has been increased from 5 to 15.
if "region" not in GEOGRAPHIES:
    print("This check should only be run at the region level.")
else:
    threshold_checker = pc.ThresholdAnalysis()
    threshold_checker.check_thresholds(
        threshold=15,
        vintage=NEW_VINTAGE,
        geo_list=["region"],
        est_table_list=AGE_SEX_ETHNICITY,
        raw_folder=RAW_FOLDER,
        save=True,
        save_location=OUTPUTS_FOLDER,
    )

Running check 4: Threshold Analysis
Checking file QA_2021_01_region_age_ethnicity.
Errors have occurred on the following rows:
        region  yr_id          name  Hispanic  |% Diff| Hispanic  \
12   San Diego   2010      15 to 17     57838                NaN   
13   San Diego   2011      15 to 17     53190           8.036239   
24   San Diego   2010     18 and 19     39688                NaN   
25   San Diego   2011     18 and 19     33682          15.133038   
148  San Diego   2014     60 and 61     13966           7.480376   
149  San Diego   2015     60 and 61     15183           8.714020   
150  San Diego   2016     60 and 61     16359           7.745505   
151  San Diego   2017     60 and 61     17202           5.153127   
152  San Diego   2018     60 and 61     17893           4.016975   
153  San Diego   2019     60 and 61     18649           4.225116   
154  San Diego   2020     60 and 61     19491           4.514987   
156  San Diego   2010      62 to 64     14273            

### Run Check 5

In [20]:
# N/A, done in Power BI

### Run Check 6

In [21]:
# Check that the region level Estimates population values are within +/- 1.5% of the CA DOF
# population values
dof_population_checker = pc.DOFPopulation()
dof_population_checker.region_DOF_population_comparison(
    threshold=1.5,
    est_vintage=NEW_VINTAGE,
    DOF_vintage=DOF_VINTAGE,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 6: Estimates vs DOF Population Values
No errors



### Run Check 7

In [22]:
# Check that the Estimates categorical distributions are within a certain range of the CA DOF
# categorical distributions. For example: is the percent of population in households vs group
# quarters roughly the same between Estimates and CA DOF?
dof_proportion_checker = pc.DOFProportion()
dof_proportion_checker.check_DOF_proportion(
    threshold=1.5,
    est_vintage=NEW_VINTAGE,
    DOF_vintage=DOF_VINTAGE,
    prop_folder=PROP_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 7: DOF Categorical Proportion Check
Checking row_prop files
Errors have occurred on the following rows:
      region  yr_id       name     sex  Hispanic  Non-Hispanic, White  \
2  San Diego   2010   15 to 17  Female  4.222597             3.591888   
3  San Diego   2010   15 to 17    Male  4.447925             4.271212   
4  San Diego   2010  18 and 19  Female  2.831824             2.734724   
5  San Diego   2010  18 and 19    Male  8.626628             8.833974   
6  San Diego   2010   20 to 24  Female  3.568784             2.754082   
7  San Diego   2010   20 to 24    Male  5.180193             4.725284   

   Non-Hispanic, Asian  Non-Hispanic, Hawaiian or Pacific Islander  \
2             1.801953                                    0.047799   
3             1.398805                                    0.030476   
4             0.067719                                    0.018460   
5             0.940497                                    0.044858   
6             0.8124

## Runtime

In [23]:
# Report the wall-clock time elapsed since `start` was recorded, as whole minutes and seconds
runtime = time.time() - start
whole_minutes = int(runtime // 60)
leftover_seconds = int(runtime % 60)
print(f'Runtime: {whole_minutes} minutes, {leftover_seconds} seconds')

Runtime: 3 minutes, 10 seconds
