# Example Usage

In [1]:
# Record the wall-clock time at which the notebook started running. The final cell subtracts this
# value from the current time and prints the total runtime.
import time
start = time.time()

In [2]:
# Library imports
import pathlib

import numpy as np
import pandas as pd

# Local modules
import functions as f

In [3]:
# Geography levels processed by default.
# NOTE: Additional geographies such as "mgra" or "cpa" can be enabled by adding them to this list.
# NOTE: Including "mgra" results in huge memory usage, possibly 10GB+. If mgra level data is
# needed, close as many other programs as possible first; otherwise the hard drive may end up
# being used as additional memory, which is EXTREMELY slow.
GEOGRAPHIES = [
    "region",
    "jurisdiction",
]

# Estimates tables to run on, grouped into three variables:
#   EST_TABLES        - Estimates tables whose key columns are ONLY geography name and year
#   AGE_SEX_ETHNICITY - Estimates tables whose key columns contain additional information, such
#                       as age category or ethnicity category
#   CONSOLIDATED      - the name of the consolidated file
EST_TABLES = [
    "age",
    "ethnicity",
    "household_income",
    "households",
    "housing",
    "population",
    "sex",
]
AGE_SEX_ETHNICITY = [
    "age_ethnicity",
    "age_sex_ethnicity",
]
CONSOLIDATED = [
    "consolidated",
]

# Estimates/DOF vintages to run on:
#   NEW_VINTAGE - the vintage that checks are run on by default
#   OLD_VINTAGE - only used when creating diff files, which contain NEW_VINTAGE - OLD_VINTAGE
#   DOF_VINTAGE - used for downloading DOF data and for comparing region level population values
#                 in Check 6
NEW_VINTAGE = "2021_01"
OLD_VINTAGE = "2020_06"
DOF_VINTAGE = "2021_07_14"

# Default locations that files are saved to and loaded from.
# NOTE: The save function creates these folders, so there is no need to create them by hand.
RAW_FOLDER = pathlib.Path("./data/raw_data/")
DIFF_FOLDER = pathlib.Path("./data/diff/")
PROP_FOLDER = pathlib.Path("./data/proportion/")
OUTPUTS_FOLDER = pathlib.Path("./data/outputs/")

## Create Tables

In [4]:
# Import the module that allows us to create tables
import generate_tables as gt

### Creating Estimates Tables (generate_tables.EstimatesTables)

In [5]:
# Create the individual files for NEW_VINTAGE: one file per unique combination of geography and
# Estimates table, saved into RAW_FOLDER.
# The result is assigned to "_" so that the notebook does not display it.
_ = gt.EstimatesTables().individual(
    est_vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
    save=True,
    save_folder=RAW_FOLDER,
)

In [6]:
# Create the consolidated files for NEW_VINTAGE: one file per geography, each containing all of
# the requested tables.
# NOTE: get_from_file=True lets the function read the Estimates tables that were already
# downloaded into raw_folder, instead of re-downloading them and holding them in memory.
_ = gt.EstimatesTables().consolidate(
    est_vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    get_from_file=True,
    raw_folder=RAW_FOLDER,
    save=True,
    save_folder=RAW_FOLDER,
)

### Creating DOF Data Files (generate_tables.CA_DOF)

In [7]:
# Create the CA DOF data files for DOF_VINTAGE and store them alongside the raw Estimates files.
_ = gt.CA_DOF().get_CA_DOF_data(
    dof_vintage=DOF_VINTAGE,
    save_folder=RAW_FOLDER,
)

SELECT dof.county_fips_code, dof.fiscal_yr, SUM(dof.population) as population
FROM [socioec_data].[ca_dof].[population_proj_2021_07_14] as dof
WHERE county_fips_code='06073'
GROUP BY dof.county_fips_code, dof.fiscal_yr
ORDER BY dof.county_fips_code, dof.fiscal_yr ASC


### Creating Diff Files (generate_tables.DiffFiles)

In [8]:
# Diff files need both vintages on disk, so first repeat the individual and consolidated file
# creation with exactly the same parameters as before, swapping NEW_VINTAGE for OLD_VINTAGE.

# Individual files for OLD_VINTAGE
_ = gt.EstimatesTables().individual(
    est_vintage=OLD_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
    save=True,
    save_folder=RAW_FOLDER,
)

# Consolidated files for OLD_VINTAGE, built from the individual files written just above
_ = gt.EstimatesTables().consolidate(
    est_vintage=OLD_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    get_from_file=True,
    raw_folder=RAW_FOLDER,
    save=True,
    save_folder=RAW_FOLDER,
)

In [9]:
# Create the diff files.
# NOTE: The output is a set of xlsx files whose sheets contain the OLD_VINTAGE data, the
# NEW_VINTAGE data, and NEW_VINTAGE-OLD_VINTAGE. The difference is a plain numeric difference,
# NOT a percent difference.
_ = gt.DiffFiles().create_diff_tables(
    old_vintage=OLD_VINTAGE,
    new_vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=DIFF_FOLDER,
)

### Creating Proportion Files (generate_tables.ProportionFiles)

In [10]:
# Create the proportion files for NEW_VINTAGE. Only the region geography and the tables listed
# below are requested; results are saved into PROP_FOLDER.
gt.ProportionFiles().create_proportion_tables(
    est_vintage=NEW_VINTAGE,
    geo_list=["region"],
    est_table_list=[
        "age",
        "sex",
        "ethnicity",
        "household_income",
        "age_ethnicity",
        "age_sex_ethnicity",
    ],
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=PROP_FOLDER,
)

## Run Checks

In [11]:
# Import the module that allows us to run checks
import perform_checks as pc

### Run Check 1

In [12]:
# Run internal consistency checks by geography level
# NOTE: Due to a limitation of the function, "region" cannot be included in the input geo_list,
# so it is filtered out of GEOGRAPHIES here. Filtering (instead of list.remove) also works when
# "region" is not present in GEOGRAPHIES, where list.remove would raise a ValueError.
# WARNING: If new geographies were added to GEOGRAPHIES, make sure that aggregation instructions
# are updated in the variable InternalConsistency()._geography_aggregation. For more details on
# how, see the class docstring
NO_REGION = [geo for geo in GEOGRAPHIES if geo != "region"]
pc.InternalConsistency().check_geography_aggregations(
    vintage=NEW_VINTAGE,
    geo_list=NO_REGION,
    est_table="consolidated",
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER)

Running Check 1: Check aggregated values between geography levels
Aggregating jurisdiction level data to region and comparing with region csv file
No errors



In [13]:
# Run the internal consistency checks that compare aggregated values between Estimates tables.
# The meaning of "est_table_types" is described in the function signature.
pc.InternalConsistency().check_internal_aggregations(
    vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_types=["population", "households"],
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 1: Check aggregated values between Estimates tables
Checking Estimates tables with population values at the region level
No errors

Checking Estimates tables with population values at the jurisdiction level
No errors

Checking Estimates tables with households values at the region level
No errors

Checking Estimates tables with households values at the jurisdiction level
No errors



### Run Check 2

In [14]:
# Run the spot nulls check on every individual, consolidated, and age/sex/ethnicity table of
# NEW_VINTAGE, for each geography in GEOGRAPHIES.
pc.NullValues().spot_nulls(
    vintage=NEW_VINTAGE,
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 2: Spot Nulls
Checking QA_2021_01_region_age.
No errors

Checking QA_2021_01_region_ethnicity.
No errors

Checking QA_2021_01_region_household_income.
No errors

Checking QA_2021_01_region_households.
No errors

Checking QA_2021_01_region_housing.
No errors

Checking QA_2021_01_region_population.
No errors

Checking QA_2021_01_region_sex.
No errors

Checking QA_2021_01_region_consolidated.
No errors

Checking QA_2021_01_region_age_ethnicity.
No errors

Checking QA_2021_01_region_age_sex_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_age.
No errors

Checking QA_2021_01_jurisdiction_ethnicity.
No errors

Checking QA_2021_01_jurisdiction_household_income.
No errors

Checking QA_2021_01_jurisdiction_households.
No errors

Checking QA_2021_01_jurisdiction_housing.
No errors

Checking QA_2021_01_jurisdiction_population.
No errors

Checking QA_2021_01_jurisdiction_sex.
No errors

Checking QA_2021_01_jurisdiction_consolidated.
No errors

Checking QA_2021_01_jurisdiction_a

### Run Check 3

In [15]:
# N/A, done already by generate_tables.DiffFiles

### Run Check 4

In [16]:
# N/A. Although code has been written for this check, the outputs are primarily for QA/QC. 
# Uncomment the following lines of code if you want to run the check anyways

# # Run checks that year over year Estimates values do not change by too much
# pc.ThresholdAnalysis().check_thresholds(
#     threshold=5,
#     vintage="2020_06", 
#     geo_list=["region"],
#     est_table_list=EST_TABLES,
#     raw_folder=RAW_FOLDER, 
#     save=True,
#     save_location=OUTPUTS_FOLDER)

In [17]:
# N/A. Although code has been written for this check, the outputs are primarily for QA/QC.
# Uncomment the following lines of code if you want to run the check anyways

# # Run checks that year over year age_ethnicity and age_sex_ethnicity Estimates values do not
# # change by too much
# # NOTE: The threshold used is 10 for the region level and 15 for the jurisdiction level, as smaller
# # population values result in more extreme changes
# pc.ThresholdAnalysis().check_thresholds(
#     threshold=10,
#     vintage="2020_06", 
#     geo_list=["region"],
#     est_table_list=AGE_SEX_ETHNICITY,
#     raw_folder=RAW_FOLDER, 
#     save=True,
#     save_location=OUTPUTS_FOLDER)
# pc.ThresholdAnalysis().check_thresholds(
#     threshold=15,
#     vintage="2020_06", 
#     geo_list=["jurisdiction"],
#     est_table_list=AGE_SEX_ETHNICITY,
#     raw_folder=RAW_FOLDER, 
#     save=True,
#     save_location=OUTPUTS_FOLDER)

### Run Check 5

In [18]:
# N/A, done in Power BI

### Run Check 6

In [19]:
# Check that the region level Estimates population values fall within +/- 1.5% of the CA DOF
# population values.
pc.DOFPopulation().region_DOF_population_comparison(
    threshold=1.5,
    est_vintage=NEW_VINTAGE,
    DOF_vintage=DOF_VINTAGE,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 6: Estimates vs DOF Population Values
No errors



### Run Check 7

In [20]:
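# N/A. Although code has been written for this check, the outputs are primarily for QA/QC.
# Uncomment the following lines of code if you want to run the check anyway.
# NOTE: DOF_FOLDER is not defined in this notebook, so it must be defined before uncommenting
# (the DOF data files created earlier in this notebook were saved to RAW_FOLDER).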

# # Run checks that Estimates categorical distributions are within a certain range of CA DOF 
# # categorical distributions. For example, is the percent of population in households vs group
# # quarters roughly the same between Estimates and CA DOF?
# pc.DOFProportion().check_DOF_proportion(
#     threshold=4,
#     vintage="2020_06", 
#     geo_list=["region", "jurisdiction"],
#     raw_folder=RAW_FOLDER,
#     DOF_folder=DOF_FOLDER,
#     save=True,
#     save_location=OUTPUTS_FOLDER)

## Runtime

In [21]:
# Elapsed wall-clock time since the first cell ran, split into whole minutes and the remaining
# seconds before being printed.
runtime = time.time() - start
minutes = runtime // 60
seconds = runtime % 60
print(f'Runtime: {int(minutes)} minutes, {(int(seconds))} seconds')

Runtime: 3 minutes, 16 seconds
