# Example Usage

In [2]:
# Keep track of runtime which will be printed at the end of this notebook
import time
start = time.time()

In [3]:
# Library imports
import pathlib

import numpy as np
import pandas as pd

# Local modules
import functions as f

In [4]:
# The list of geographies to run on. To add any additional geography such as "mgra", simply add it 
# to the list below. 
# NOTE: Including "mgra" may result in extremely long processing times. As far as I can tell, this 
# is due to SQL Server taking an extremely long time to process the age_sex_ethnicity table at the
# mgra level. When running on Eric's Surface laptop, simply getting/saving the age_ethnicity and
# age_sex_ethnicity tables at the mgra level took about 20 minutes each.
GEOGRAPHIES = ["region", "jurisdiction", "cpa", "mgra"]

# The estimates tables to run on. The tables are split up into three different variables
# EST_TABLES contains Estimates tables where the key columns are ONLY geography name and year
# AGE_SEX_ETHNICITY contains Estimates tables where key columns contain additional information such
# as age category or ethnicity category
# CONSOLIDATED is the name of the consolidated file
EST_TABLES = ['age', 'ethnicity', 'household_income', 'households', 'housing', 'population', 'sex']
AGE_SEX_ETHNICITY = ["age_ethnicity", "age_sex_ethnicity"]
CONSOLIDATED = ["consolidated"]

# The Estimates/DOF vintages to run on. Checks are run on the NEW_VINTAGE by default. OLD_VINTAGE is
# only used in the creation of diff files, which contain NEW_VINTAGE - OLD_VINTAGE. DOF_VINTAGE is
# used for downloading DOF data, and comparing region level population values in Check 6
NEW_VINTAGE = "2021_01"
OLD_VINTAGE = "2020_06"
DOF_VINTAGE = "2021_07_14"

# Default locations where files should be saved to and loaded from.
# NOTE: These folders are created by the save function, no need to do so yourself.
RAW_FOLDER = pathlib.Path("./data/raw_data/")
DIFF_FOLDER = pathlib.Path("./data/diff/")
PROP_FOLDER = pathlib.Path("./data/proportion/")
OUTPUTS_FOLDER = pathlib.Path("./data/outputs/")

In [5]:
# Import the module that allows us to create tables
import generate_tables as gt

### Creating Estimates Tables (generate_tables.EstimatesTables)

In [17]:
# Creating individual files (one file per unique geography and Estimate table)
# I stopped after 37 minutes
# "_ =" to suppress output
_ = gt.EstimatesTables().individual(
        est_vintage=NEW_VINTAGE,
        geo_list=GEOGRAPHIES,
        est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
        save=True,
        save_folder=RAW_FOLDER,
        overwrite=False)

Exception during reset or similar
Traceback (most recent call last):
  File "c:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2022\Estimates_QC_Automation\generate_tables.py", line 470, in individual
    f.load(save_folder, est_vintage, geo, est_table_name)
  File "c:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2022\Estimates_QC_Automation\functions.py", line 124, in load
    raise FileNotFoundError(textwrap.dedent(f"""\
FileNotFoundError: No files found for the glob string "QA_2021_01_mgra_age_sex_ethnicity.*" in the folder data\raw_data

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "src\pymssql\_pymssql.pyx", line 559, in pymssql._pymssql.Cursor.fetchall
  File "src\pymssql\_pymssql.pyx", line 510, in pymssql._pymssql.Cursor.getrow
  File "src\pymssql\_mssql.pyx", line 468, in pymssql._mssql.MSSQLRowIterator.__next__
  File "src\pymssql\_mssql.pyx", line 1199, in py

DBAPIError: (pymssql._mssql.MSSQLDriverException) Not connected to any MS SQL server
(Background on this error at: https://sqlalche.me/e/14/dbapi)

In [6]:
# Creating consolidated files (one file per geography, contains all requested tables)
# NOTE: The additional parameter get_from_file=True allows the function to pull data from already
# downloaded Estimates tables, instead of re-downloading and holding in memory
_ = gt.EstimatesTables().consolidate(
        est_vintage=NEW_VINTAGE, 
        geo_list=GEOGRAPHIES,
        est_table_list=EST_TABLES, 
        get_from_file=True,
        raw_folder=RAW_FOLDER,
        save=True,
        save_folder=RAW_FOLDER)

### Creating DOF Data Files (generate_tables.CA_DOF)

In [8]:
# Creating DOF data files:
# 1. Region level population in each year
# 2. Region level population in each year broken down by age/sex/ethnicity
gt.CA_DOF().get_CA_DOF_region_pop(
    dof_vintage=DOF_VINTAGE, 
    save_folder=RAW_FOLDER)

### Creating Diff Files (generate_tables.DiffFiles)

In [9]:
# First create consolidated and individual files using the same parameters as before but with 
# the OLD_VINTAGE instead of the NEW_VINTAGE
_ = gt.EstimatesTables().individual(
        est_vintage=OLD_VINTAGE,
        geo_list=GEOGRAPHIES,
        est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
        save=True,
        save_folder=RAW_FOLDER,
        overwrite=False)
_ = gt.EstimatesTables().consolidate(
        est_vintage=OLD_VINTAGE, 
        geo_list=GEOGRAPHIES,
        est_table_list=EST_TABLES, 
        get_from_file=True,
        raw_folder=RAW_FOLDER,
        save=True,
        save_folder=RAW_FOLDER)

In [10]:
# Create diff files 
# NOTE: This will output xlsx files with sheets containing OLD_VINTAGE data, NEW_VINTAGE data, and
# NEW_VINTAGE-OLD_VINTAGE data. This is NOT percent difference, but numeric difference
# NOTE: Because of the way that the diff files are created, python is required to hold essentially
# three full tables in memory at the same time. For the "mgra" level and large tables such as 
# consolidated or age_sex_ethnicity, these result in a lack of memory and impossibly long run times.
# "mgra" is thus removed from diff files
NO_MGRA = GEOGRAPHIES.copy()
NO_MGRA.remove("mgra")
_ = gt.DiffFiles().create_diff_tables(
        old_vintage=OLD_VINTAGE, 
        new_vintage=NEW_VINTAGE, 
        geo_list=NO_MGRA,
        est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
        raw_data_folder=RAW_FOLDER,
        save=True,
        save_folder=DIFF_FOLDER)

### Creating Proportion Files (generate_tables.ProportionFiles)

In [11]:
# Create proportion files for estimates tables
gt.ProportionFiles().create_est_proportion_tables( 
    est_vintage=NEW_VINTAGE, 
    geo_list=['region'],
    est_table_list=['age', "sex", 'ethnicity', 'household_income', 'age_ethnicity', 'age_sex_ethnicity'],
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=PROP_FOLDER)

In [12]:
# Create proportion files for DOF tables
gt.ProportionFiles().create_DOF_proportion_table(
    DOF_vintage=DOF_VINTAGE,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=PROP_FOLDER)

## Run Checks

In [13]:
# Import the module that allows us to run checks
import perform_checks as pc

### Run Check 1

In [14]:
# Run internal consistency checks by geography level
# NOTE: Due to a limitation of the function, "region" cannot be included in the input geo_list
# BUG: If new geographies were added to GEOGRAPHIES, make sure that aggregation instructions are
# updated in the variable InternalConsistency()._geography_aggregation. For more details on how,
# see the class docstring
NO_REGION = GEOGRAPHIES.copy()
NO_REGION.remove("region")
pc.InternalConsistency().check_geography_aggregations(
    vintage=NEW_VINTAGE,
    geo_list=NO_REGION,
    est_table=CONSOLIDATED[0],
    raw_folder=RAW_FOLDER,  
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 1: Check aggregated values between geography levels
Aggregating jurisdiction level data to region and comparing with region csv file
No errors

Aggregating cpa level data to jurisdiction and comparing with jurisdiction csv file
CPA cannot be aggregated

Aggregating cpa level data to region and comparing with region csv file
CPA cannot be aggregated

Aggregating mgra level data to cpa and comparing with cpa csv file
CPA cannot be aggregated

Aggregating mgra level data to jurisdiction and comparing with jurisdiction csv file
MGRA cannot be aggregated to jurisdiction due to errors in MGRA 13
If Estimates 2021_01 is not using MGRA 13, code needs to be updated

Aggregating mgra level data to region and comparing with region csv file
No errors



In [15]:
# Run internal consistency checks between Estimates tables
# See function signature for the meaning of "est_table_types"
pc.InternalConsistency().check_internal_aggregations(
        vintage=NEW_VINTAGE, 
        geo_list=GEOGRAPHIES,
        est_table_types=["population", "households"],
        raw_folder=RAW_FOLDER,
        save=True,
        save_location=OUTPUTS_FOLDER)

Running Check 1: Check aggregated values between Estimates tables
Checking Estimates tables with population values at the region level
No errors

Checking Estimates tables with population values at the jurisdiction level
No errors

Checking Estimates tables with population values at the cpa level
No errors

Checking Estimates tables with population values at the mgra level
No errors

Checking Estimates tables with households values at the region level
No errors

Checking Estimates tables with households values at the jurisdiction level
No errors

Checking Estimates tables with households values at the cpa level
No errors

Checking Estimates tables with households values at the mgra level
No errors



### Run Check 2

In [16]:
# Run spot nulls check
# NOTE: Every table in that is combined into CONSOLIDATED is already represented in EST_TABLES
pc.NullValues().spot_nulls(
    vintage=NEW_VINTAGE, 
    geo_list=GEOGRAPHIES, 
    est_table_list=EST_TABLES + AGE_SEX_ETHNICITY, 
    raw_folder=RAW_FOLDER, 
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 2: Spot Nulls
Checking QA_2021_01_region_age.
No errors

Checking QA_2021_01_region_ethnicity.
No errors

Checking QA_2021_01_region_household_income.
No errors

Checking QA_2021_01_region_households.
No errors

Checking QA_2021_01_region_housing.
No errors

Checking QA_2021_01_region_population.
No errors

Checking QA_2021_01_region_sex.
No errors

Checking QA_2021_01_region_age_ethnicity.
No errors

Checking QA_2021_01_region_age_sex_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_age.
No errors

Checking QA_2021_01_jurisdiction_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_household_income.
No errors

Checking QA_2021_01_jurisdiction_households.
No errors

Checking QA_2021_01_jurisdiction_housing.
No errors

Checking QA_2021_01_jurisdiction_population.
No errors

Checking QA_2021_01_jurisdiction_sex.
No errors

Checking QA_2021_01_jurisdiction_age_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_age_sex_ethnicity.
No errors

Checking QA_2021_01_cp

FileNotFoundError: No files found for the glob string "QA_2021_01_mgra_age_sex_ethnicity.*" in the folder data\raw_data

In [None]:
# Run spot missing geographies/years
# Add on "mgra" geography to this test if missing from the geography list
pc.NullValues().spot_missing_values(
    vintage=NEW_VINTAGE, 
    geo_list=GEOGRAPHIES + ["mgra"],
    est_table_list=EST_TABLES,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 2: Spot Missing Geographies/Years
Checking QA_2021_01_region_age.
No Errors

Checking QA_2021_01_region_ethnicity.
No Errors

Checking QA_2021_01_region_household_income.
No Errors

Checking QA_2021_01_region_households.
No Errors

Checking QA_2021_01_region_housing.
No Errors

Checking QA_2021_01_region_population.
No Errors

Checking QA_2021_01_region_sex.
No Errors

Checking QA_2021_01_jurisdiction_age.
No Errors

Checking QA_2021_01_jurisdiction_ethnicity.
No Errors

Checking QA_2021_01_jurisdiction_household_income.
No Errors

Checking QA_2021_01_jurisdiction_households.
No Errors

Checking QA_2021_01_jurisdiction_housing.
No Errors

Checking QA_2021_01_jurisdiction_population.
No Errors

Checking QA_2021_01_jurisdiction_sex.
No Errors

Checking QA_2021_01_cpa_age.
No Errors

Checking QA_2021_01_cpa_ethnicity.
No Errors

Checking QA_2021_01_cpa_household_income.
No Errors

Checking QA_2021_01_cpa_households.
No Errors

Checking QA_2021_01_cpa_housing.
No Errors

Chec

### Run Check 3

In [None]:
# Run checks that between Estimates vintages, values do not change by too much
# NOTE: different geography levels have different parameters, as smaller populations tend to have
# more variability.
if("region" in GEOGRAPHIES):
    pc.VintageComparisons().vintage_change(
        p_threshold=2,
        n_threshold=500,
        old_vintage=OLD_VINTAGE,
        new_vintage=NEW_VINTAGE,
        geo_list=["region"],
        est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
        diff_data_folder=DIFF_FOLDER,
        save=True,
        outputs_folder=OUTPUTS_FOLDER)

Running check 3: Vintage Comparison
Checking between 2020_06 and 2021_01

Checking the age Estimates table at the region level
No errors

Checking the ethnicity Estimates table at the region level
No errors

Checking the household_income Estimates table at the region level
No errors

Checking the households Estimates table at the region level
No errors

Checking the housing Estimates table at the region level
Errors have occurred on the following rows:
Output is too large to print, see output file.

Checking the population Estimates table at the region level
No errors

Checking the sex Estimates table at the region level
No errors

Checking the age_ethnicity Estimates table at the region level
No errors

Checking the age_sex_ethnicity Estimates table at the region level
No errors



In [None]:
if("jurisdiction" in GEOGRAPHIES):
    pc.VintageComparisons().vintage_change(
        p_threshold=5,
        n_threshold=500,
        old_vintage=OLD_VINTAGE,
        new_vintage=NEW_VINTAGE,
        geo_list=["jurisdiction"],
        est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
        diff_data_folder=DIFF_FOLDER,
        save=True,
        outputs_folder=OUTPUTS_FOLDER)

Running check 3: Vintage Comparison
Checking between 2020_06 and 2021_01

Checking the age Estimates table at the jurisdiction level
No errors

Checking the ethnicity Estimates table at the jurisdiction level
Errors have occurred on the following rows:
Output is too large to print, see output file.

Checking the household_income Estimates table at the jurisdiction level
No errors

Checking the households Estimates table at the jurisdiction level
No errors

Checking the housing Estimates table at the jurisdiction level
Errors have occurred on the following rows:
Output is too large to print, see output file.

Checking the population Estimates table at the jurisdiction level
No errors

Checking the sex Estimates table at the jurisdiction level
No errors

Checking the age_ethnicity Estimates table at the jurisdiction level
Errors have occurred on the following rows:
Output is too large to print, see output file.

Checking the age_sex_ethnicity Estimates table at the jurisdiction level
No 

### Run Check 4

In [None]:
# Run checks that year over year Estimates values do not change by too much
# This check should only be run at the region level to prevent small populations with high 
# variability from distorting results
if("region" in GEOGRAPHIES):
    pc.ThresholdAnalysis().check_thresholds(
        p_threshold=5,
        n_threshold=500,
        vintage=NEW_VINTAGE, 
        geo_list=["region"],
        est_table_list=EST_TABLES,
        raw_folder=RAW_FOLDER, 
        save=True,
        save_location=OUTPUTS_FOLDER)
else:
    print("This check should only be run at the region level.")

Running check 4: Threshold Analysis
Checking file QA_2021_01_region_age.
Errors have occurred on the following rows:
       region  yr_id  Under 5  5 to 9  10 to 14  15 to 17  18 and 19  \
0   San Diego   2010   203423  194029    198716    128000      97095   
1   San Diego   2011   209939  197889    198979    128906      96042   
2   San Diego   2012   215677  203754    198403    131596      97211   
3   San Diego   2013   219753  210910    199295    133282      98736   
4   San Diego   2014   222064  216798    200875    132669     100101   
5   San Diego   2015   223794  221228    202767    131345     101507   
6   San Diego   2016   223203  224837    205110    130974      99967   
7   San Diego   2017   221450  227947    209845    131182      98214   
8   San Diego   2018   218753  229178    215375    131475      98417   
9   San Diego   2019   213481  230127    220121    132226      99228   
10  San Diego   2020   205904  230989    223104    133490      98931   

    20 to 24  25 t

In [None]:
# Run checks that year over year age_sex and age_sex_ethnicity Estimates values do not change by 
# too much
# Similar to the previous notebook cell, this check should only be run at the region level. Still,
# breaking down into age/sex/ethnicity or just age/ethnicity will greater lower populations and 
# increase volatility. As such, the threshold has been increased from 5 to 10
if("region" in GEOGRAPHIES):
    pc.ThresholdAnalysis().check_thresholds(
        p_threshold=5,
        n_threshold=500,
        vintage=NEW_VINTAGE, 
        geo_list=["region"],
        est_table_list=AGE_SEX_ETHNICITY,
        raw_folder=RAW_FOLDER, 
        save=True,
        save_location=OUTPUTS_FOLDER)
else:
    print("This check should only be run at the region level.")

Running check 4: Threshold Analysis
Checking file QA_2021_01_region_age_ethnicity.
Errors have occurred on the following rows:
        region  yr_id       name  Hispanic  \
12   San Diego   2010   15 to 17     57838   
13   San Diego   2011   15 to 17     53190   
24   San Diego   2010  18 and 19     39688   
25   San Diego   2011  18 and 19     33682   
36   San Diego   2010   20 to 24     93539   
..         ...    ...        ...       ...   
232  San Diego   2014    Under 5    100513   
233  San Diego   2015    Under 5    101399   
237  San Diego   2019    Under 5     92535   
238  San Diego   2020    Under 5     86555   
239  San Diego   2021    Under 5     81111   

     Non-Hispanic, American Indian or Alaska Native  Non-Hispanic, Asian  \
12                                              605                11065   
13                                              642                12987   
24                                              501                 9922   
25              

### Run Check 5

In [None]:
# N/A, done in Power BI

### Run Check 6

In [None]:
# Run checks that region level Estimates population values are within +/- 1.5% of CA DOF population
# values
pc.DOFPopulation().region_DOF_population_comparison(
    threshold=1.5,
    est_vintage=NEW_VINTAGE, 
    DOF_vintage=DOF_VINTAGE,
    raw_folder=RAW_FOLDER, 
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 6: Estimates vs DOF Population Values
No errors



### Run Check 7

In [None]:
# Run checks that Estimates categorical distributions are within a certain range of CA DOF 
# categorical distributions. For example, is the percent of population in households vs group
# quarters roughly the same between Estimates and CA DOF?
pc.DOFProportion().check_DOF_proportion(
    threshold=1.5,
    est_vintage=NEW_VINTAGE,
    DOF_vintage=DOF_VINTAGE,
    prop_folder=PROP_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 7: DOF Categorical Proportion Check
Checking row_prop files
Errors have occurred on the following rows:
      region  yr_id       name     sex  Hispanic  Non-Hispanic, White  \
2  San Diego   2010   15 to 17  Female  4.222597             3.591888   
3  San Diego   2010   15 to 17    Male  4.447925             4.271212   
4  San Diego   2010  18 and 19  Female  2.831824             2.734724   
5  San Diego   2010  18 and 19    Male  8.626628             8.833974   
6  San Diego   2010   20 to 24  Female  3.568784             2.754082   
7  San Diego   2010   20 to 24    Male  5.180193             4.725284   

   Non-Hispanic, Asian  Non-Hispanic, Hawaiian or Pacific Islander  \
2             1.801953                                    0.047799   
3             1.398805                                    0.030476   
4             0.067719                                    0.018460   
5             0.940497                                    0.044858   
6             0.8124

## Runtime

In [None]:
runtime = time.time() - start
minutes, seconds = divmod(runtime, 60)
print(f'Runtime: {int(minutes)} minutes, {(int(seconds))} seconds')

Runtime: 1 minutes, 45 seconds
