# Example Usage

In [1]:
# Record the wall-clock start time so the final cell can report the notebook's total runtime
import time
start = time.time()

In [2]:
# Library imports
import pathlib
import random

import numpy as np
import pandas as pd

# Local modules
import functions as f

In [3]:
# Standard geography levels processed by this notebook.
# NOTE: "mgra" is deliberately left out of GEOGRAPHIES to save on runtime
GEOGRAPHIES = ["region", "jurisdiction"]

# Standard Estimates tables, followed by the extra table groups that later cells append
# to this list when a step also needs them
EST_TABLES = [
    "age",
    "ethnicity",
    "household_income",
    "households",
    "housing",
    "population",
    "sex",
]
CONSOLIDATED = ["consolidated"]
AGE_SEX_ETHNICITY = ["age_ethnicity", "age_sex_ethnicity"]

In [4]:
# Where files should be saved/loaded by the cells below.
# All paths are relative, so they resolve against the current working directory.
RAW_FOLDER = pathlib.Path("./data/raw_data/")  # Estimates tables are saved here and read back by the checks
DIFF_FOLDER = pathlib.Path("./data/diff/")  # diff files comparing two vintages are saved here
DOF_FOLDER = pathlib.Path("./data/CA_DOF/")  # CA DOF data files are saved here
OUTPUTS_FOLDER = pathlib.Path("./data/outputs/")  # save location passed to every check

## Create Tables

In [5]:
# Import the module that allows us to create tables
import generate_tables as gt

### Creating Estimates Tables (generate_tables.EstimatesTables)

In [6]:
# Build the consolidated files: one file per geography, each containing all requested tables.
# The result is assigned to "_" purely to suppress the cell's output.
_ = gt.EstimatesTables().consolidate(
    est_vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)

In [7]:
# Build the individual files: one file for every (geography, Estimates table) combination.
# The age/sex/ethnicity tables are requested here in addition to the standard ones.
_ = gt.EstimatesTables().individual(
    est_vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
    save=True,
    save_folder=RAW_FOLDER,
)

### Creating DOF Data Files (generate_tables.CA_DOF)

In [8]:
# Creating DOF Data file.
# NOTE: See class documentation for some manual steps you need to do before running this code.
# The raw data and save locations are passed explicitly instead of relying on the defaults.
# GEOGRAPHIES is used (rather than a hard-coded copy of the same list) so the DOF files always
# cover the geography levels that the DOF population check (Check 6) is later run against.
_ = gt.CA_DOF().get_CA_DOF_data(
    raw_folder=RAW_FOLDER,
    save_folder=DOF_FOLDER,
    years=range(2010, 2022),  # 2010 through 2021; the end of a range is exclusive
    geo_list=GEOGRAPHIES,
)

### Creating Diff Files (generate_tables.DiffFiles)

In [9]:
# The diff step needs a second vintage to compare against, so first create the consolidated
# and individual files again with the same parameters as before, but for vintage 2021_01
_ = gt.EstimatesTables().consolidate(
    est_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)
_ = gt.EstimatesTables().individual(
    est_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + AGE_SEX_ETHNICITY,
    save=True,
    save_folder=RAW_FOLDER,
)

In [10]:
# Create the diff files between the two vintages generated above, for every table type
# (standard, consolidated and age/sex/ethnicity) at every geography level
_ = gt.DiffFiles().create_diff_tables(
    old_vintage="2020_06",
    new_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=DIFF_FOLDER,
)

## Run Checks

In [11]:
# Import the module that allows us to run checks
import perform_checks as pc

### Run Check 1

In [12]:
# Internal consistency by geography level: the jurisdiction level data is aggregated up to
# the region and compared with the region file
pc.InternalConsistency().check_geography_aggregations(
    vintage="2020_06",
    geo_list=["jurisdiction"],
    est_table="consolidated",
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 1: Check aggregated values between geography levels
Aggregating jurisdiction level data to region and comparing with region csv file
No errors



In [13]:
# Internal consistency between Estimates tables: compare aggregated values across tables
# at every geography level.
# See the function signature for the meaning of "est_table_types"
pc.InternalConsistency().check_internal_aggregations(
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_types=["population", "households"],
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 1: Check aggregated values between Estimates tables
Checking Estimates tables with population values at the region level
No errors

Checking Estimates tables with population values at the jurisdiction level
No errors

Checking Estimates tables with households values at the region level
No errors

Checking Estimates tables with households values at the jurisdiction level
No errors



### Run Check 2

In [14]:
# Spot nulls check, run on every table type (standard, consolidated and age/sex/ethnicity)
# at every geography level
pc.NullValues().spot_nulls(
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES + CONSOLIDATED + AGE_SEX_ETHNICITY,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 2: Spot Nulls
Checking QA_2020_06_region_age.
No errors

Checking QA_2020_06_region_ethnicity.
No errors

Checking QA_2020_06_region_household_income.
No errors

Checking QA_2020_06_region_households.
No errors

Checking QA_2020_06_region_housing.
No errors

Checking QA_2020_06_region_population.
No errors

Checking QA_2020_06_region_sex.
No errors

Checking QA_2020_06_region_consolidated.
No errors

Checking QA_2020_06_region_age_ethnicity.
No errors

Checking QA_2020_06_region_age_sex_ethnicity.
No errors

Checking QA_2020_06_jurisdiction_age.
No errors

Checking QA_2020_06_jurisdiction_ethnicity.
No errors

Checking QA_2020_06_jurisdiction_household_income.
No errors

Checking QA_2020_06_jurisdiction_households.
No errors

Checking QA_2020_06_jurisdiction_housing.
No errors

Checking QA_2020_06_jurisdiction_population.
No errors

Checking QA_2020_06_jurisdiction_sex.
No errors

Checking QA_2020_06_jurisdiction_consolidated.
No errors

Checking QA_2020_06_jurisdiction_a

### Run Check 3

In [15]:
# N/A: Check 3 was already done when the diff files were created with generate_tables.DiffFiles

### Run Check 4

In [16]:
# Threshold analysis on the standard Estimates tables at the region level: flag year over
# year changes that are too large (the threshold is presumably a percentage, going by the
# "|% Diff|" columns in the output)
pc.ThresholdAnalysis().check_thresholds(
    threshold=5,
    vintage="2020_06",
    geo_list=["region"],
    est_table_list=EST_TABLES,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running check 4: Threshold Analysis
Checking file QA_2020_06_region_age.
Errors have occurred on the following rows:
       region  yr_id  Under 5  |% Diff| Under 5  5 to 9  |% Diff| 5 to 9  \
0   San Diego   2010   203423               NaN  194029              NaN   
1   San Diego   2011   209818          3.143696  197773         1.929608   
2   San Diego   2012   215677          2.792420  203754         3.024174   
3   San Diego   2013   219753          1.889863  210910         3.512078   
4   San Diego   2014   222064          1.051635  216798         2.791712   
5   San Diego   2015   223824          0.792564  221252         2.054447   
6   San Diego   2016   223355          0.209540  224985         1.687216   
7   San Diego   2017   221684          0.748136  228181         1.420539   
8   San Diego   2018   219100          1.165623  229534         0.592950   
9   San Diego   2019   213921          2.363761  230600         0.464419   
10  San Diego   2020   206650          3.398918

In [17]:
# Run checks that year over year age_ethnicity and age_sex_ethnicity Estimates values do not
# change by too much
# NOTE: The threshold used is 10 for the region level and 15 for the jurisdiction level, as smaller
# population values result in more extreme changes
# The two runs differ only in geography level and threshold, so they share a single call
# inside a loop; the region level is still checked first, then the jurisdiction level.
for geo_level, level_threshold in (("region", 10), ("jurisdiction", 15)):
    pc.ThresholdAnalysis().check_thresholds(
        threshold=level_threshold,
        vintage="2020_06",
        geo_list=[geo_level],
        est_table_list=AGE_SEX_ETHNICITY,
        raw_folder=RAW_FOLDER,
        save=True,
        save_location=OUTPUTS_FOLDER,
    )

Running check 4: Threshold Analysis
Checking file QA_2020_06_region_age_ethnicity.
Errors have occurred on the following rows:
        region  yr_id          name  Hispanic  |% Diff| Hispanic  \
11   San Diego   2010      15 to 17     57838                NaN   
12   San Diego   2011      15 to 17     53157           8.093295   
22   San Diego   2010     18 and 19     39688                NaN   
23   San Diego   2011     18 and 19     33660          15.188470   
24   San Diego   2012     18 and 19     34632           2.887701   
..         ...    ...           ...       ...                ...   
200  San Diego   2012  85 and Older      6779           6.487590   
201  San Diego   2013  85 and Older      7160           5.620298   
202  San Diego   2014  85 and Older      7556           5.530726   
203  San Diego   2015  85 and Older      8021           6.154050   
204  San Diego   2016  85 and Older      8550           6.595188   

     Non-Hispanic, American Indian or Alaska Native  \
1

### Run Check 5

In [18]:
# N/A, done in Power BI

### Run Check 6

In [19]:
# Compare Estimates population values with the CA DOF population values at every geography
# level; the Estimates values are expected to be within +/- 1.5% of the DOF values
pc.DOFPopulation().check_DOF_population(
    threshold=1.5,
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    raw_folder=RAW_FOLDER,
    DOF_folder=DOF_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 6: DOF Total Population Comparison
Checking at the region level
No errors

Checking at the jurisdiction level
Errors have occurred on the following rows:
               City  Year  Est Total Population  Est Household Population  \
10         Carlsbad  2020                114463                    113548   
21      Chula Vista  2020                272202                    270545   
32         Coronado  2020                 21381                     17558   
43          Del Mar  2020                  4268                      4268   
54         El Cajon  2020                104393                    101911   
65        Encinitas  2020                 62183                     61655   
76        Escondido  2020                153008                    150634   
87   Imperial Beach  2020                 28055                     27568   
98          La Mesa  2020                 59966                     59309   
109     Lemon Grove  2020                 26526               

### Run Check 7

In [20]:
# TODO

## Runtime

In [21]:
# Report the total wall-clock time elapsed since `start` was recorded in the first cell
runtime = time.time() - start
minutes, seconds = divmod(runtime, 60)
print(f'Runtime: {int(minutes)} minutes, {int(seconds)} seconds')

Runtime: 4 minutes, 1 seconds
