# Example Usage

In [16]:
# Record the wall-clock time at the start of the notebook so that the final
# "Runtime" cell can report how long the whole run took (it computes time.time() - start).
import time
start = time.time()

In [1]:
# Library imports
import pathlib
import random

import numpy as np
import pandas as pd

# Local modules
import functions as f

In [2]:
# Geography levels and Estimates tables that the rest of this notebook iterates over.
# NOTE: "mgra" is intentionally left out of GEOGRAPHIES to keep the runtime down.
GEOGRAPHIES = [
    "region",
    "jurisdiction",
]
EST_TABLES = [
    "age",
    "ethnicity",
    "household_income",
    "households",
    "housing",
    "population",
    "sex",
]

In [3]:
# Folders (relative to the notebook's working directory) that files are written to
# and read back from.
RAW_FOLDER = pathlib.Path(".", "data", "raw_data")      # raw Estimates tables
DIFF_FOLDER = pathlib.Path(".", "data", "diff")         # vintage-to-vintage diff files
DOF_FOLDER = pathlib.Path(".", "data", "CA_DOF")        # CA DOF data files
OUTPUTS_FOLDER = pathlib.Path(".", "data", "outputs")   # saved check results

## Create Tables

In [4]:
# Import the module that allows us to create tables
import generate_tables as gt

### Creating Estimates Tables (generate_tables.EstimatesTables)

In [5]:
# Build the consolidated files for vintage 2020_06: one file per geography, each holding
# every requested Estimates table.
# The return value is bound to "_" only to suppress the cell's output.
_ = gt.EstimatesTables().consolidate(
    est_vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)

In [6]:
# Build the individual files for vintage 2020_06: one file for every
# (geography, Estimates table) combination.
# The return value is bound to "_" only to suppress the cell's output.
_ = gt.EstimatesTables().individual(
    est_vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)

### Creating DOF Data Files (generate_tables.CA_DOF)

In [7]:
# Create the CA DOF data file(s).
# NOTE: See the class documentation for some manual steps you need to do before running
# this code.
# The raw-data and save folders are passed explicitly, and the geography list comes from
# GEOGRAPHIES so that the DOF files cover the same geographies that the DOF population
# check (Check 6) later reads.
# range(2010, 2022) requests the years 2010 through 2021 inclusive.
_ = gt.CA_DOF().get_CA_DOF_data(
    raw_folder=RAW_FOLDER,
    save_folder=DOF_FOLDER,
    years=range(2010, 2022),
    geo_list=GEOGRAPHIES,
)

### Creating Diff Files (generate_tables.DiffFiles)

In [8]:
# Diff files need two vintages on disk, so repeat the consolidated and individual file
# creation with identical parameters except for the vintage, which is now 2021_01.

# Consolidated files (one per geography)
_ = gt.EstimatesTables().consolidate(
    est_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)

# Individual files (one per geography and Estimates table)
_ = gt.EstimatesTables().individual(
    est_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    save=True,
    save_folder=RAW_FOLDER,
)

In [9]:
# Create the diff files between vintage 2020_06 (old) and 2021_01 (new), for the
# consolidated table as well as every individual Estimates table.
_ = gt.DiffFiles().create_diff_tables(
    old_vintage="2020_06",
    new_vintage="2021_01",
    geo_list=GEOGRAPHIES,
    est_table_list=["consolidated"] + EST_TABLES,
    raw_data_folder=RAW_FOLDER,
    save=True,
    save_folder=DIFF_FOLDER,
)

## Run Checks

In [4]:
# Import the module that allows us to run checks
import perform_checks as pc

### Run Check 1: Internal Consistency (aggregated values between geography levels)

In [11]:
# Internal consistency check on the consolidated 2020_06 table, run for the
# jurisdiction geography level; results are saved to OUTPUTS_FOLDER.
pc.InternalConsistency().check_geography_aggregations(
    vintage="2020_06",
    geo_list=["jurisdiction"],
    est_table="consolidated",
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 1: Check aggregated values between geography levels
Aggregating jurisdiction level data to region and comparing with region csv file
No errors



### Run Check 2: Spot Nulls

In [5]:
# Spot-nulls check across every geography, covering the consolidated table plus each
# individual Estimates table of vintage 2020_06; results are saved to OUTPUTS_FOLDER.
pc.NullValues().spot_nulls(
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=["consolidated"] + EST_TABLES,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 2: Spot Nulls
Checking QA_2020_06_region_consolidated
No errors

Checking QA_2020_06_region_age
No errors

Checking QA_2020_06_region_ethnicity
No errors

Checking QA_2020_06_region_household_income
No errors

Checking QA_2020_06_region_households
No errors

Checking QA_2020_06_region_housing
No errors

Checking QA_2020_06_region_population
No errors

Checking QA_2020_06_region_sex
No errors

Checking QA_2020_06_jurisdiction_consolidated
No errors

Checking QA_2020_06_jurisdiction_age
No errors

Checking QA_2020_06_jurisdiction_ethnicity
No errors

Checking QA_2020_06_jurisdiction_household_income
No errors

Checking QA_2020_06_jurisdiction_households
No errors

Checking QA_2020_06_jurisdiction_housing
No errors

Checking QA_2020_06_jurisdiction_population
No errors

Checking QA_2020_06_jurisdiction_sex
No errors



### Run Check 3

In [13]:
# N/A, done already by generate_tables.DiffFiles

### Run Check 4: Threshold Analysis

In [6]:
# Threshold analysis: verify that Estimates values do not change by too much from one
# year to the next, for every geography and individual Estimates table of vintage 2020_06.
# NOTE(review): threshold=5 is presumably a percentage, judging by the "|% Diff|" columns
# in the output - confirm against the ThresholdAnalysis documentation.
pc.ThresholdAnalysis().check_thresholds(
    threshold=5,
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    est_table_list=EST_TABLES,
    raw_folder=RAW_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running check 4: Threshold Analysis
Checking file QA_2020_06_region_age
Errors have occured on the following rows:
       region  yr_id  Under 5  |% Diff| Under 5  5 to 9  |% Diff| 5 to 9  \
0   San Diego   2010   203423               NaN  194029              NaN   
1   San Diego   2011   209818          3.143696  197773         1.929608   
2   San Diego   2012   215677          2.792420  203754         3.024174   
3   San Diego   2013   219753          1.889863  210910         3.512078   
4   San Diego   2014   222064          1.051635  216798         2.791712   
5   San Diego   2015   223824          0.792564  221252         2.054447   
6   San Diego   2016   223355          0.209540  224985         1.687216   
7   San Diego   2017   221684          0.748136  228181         1.420539   
8   San Diego   2018   219100          1.165623  229534         0.592950   
9   San Diego   2019   213921          2.363761  230600         0.464419   
10  San Diego   2020   206650          3.398918  

### Run Check 5

In [None]:
# N/A, done in Power BI

### Run Check 6: DOF Total Population Comparison

In [15]:
# Compare Estimates population values against CA DOF population values and report any
# that fall outside +/- 1.5% of the DOF figure, for every geography of vintage 2020_06.
pc.DOFPopulation().check_DOF_population(
    threshold=1.5,
    vintage="2020_06",
    geo_list=GEOGRAPHIES,
    raw_folder=RAW_FOLDER,
    DOF_folder=DOF_FOLDER,
    save=True,
    save_location=OUTPUTS_FOLDER,
)

Running Check 6: DOF Total Population Comparison
Checking at the region level
Errors have occured on the following rows:
    Year  Est Total Population  Est Household Population  Est Group Quarters  \
0   2010               3095314                   2993348              101966   
11  2020               3343355                   3230945              112410   

    DOF Total Population  DOF Household Population  DOF Group Quarters  \
0                3095313                   2991515              103798   
11               3298634                   3172741              125893   

    |% Diff| Total Population  |% Diff| Household Population  \
0                    0.000032                       0.061273   
11                   1.355743                       1.834502   

    |% Diff| Group Quarters  
0                  1.764967  
11                10.709889  

Checking at the jurisdiction level
Errors have occured on the following rows:
      Year  Est Total Population  Est Household Popul

### Run Check 7

In [None]:
# TODO

## Runtime

In [24]:
# Elapsed wall-clock seconds since `start` was recorded in the timing cell at the top
# of the notebook.
runtime = time.time() - start

# Split the elapsed seconds into hours / minutes / seconds. `minutes` is first the total
# number of whole minutes and is then reduced to the remainder after whole hours.
minutes, seconds = divmod(runtime, 60)
hours, minutes = divmod(minutes, 60)

# divmod on a float returns floats, so each part is cast to int before formatting.
print(f'Runtime: {int(hours):0d} hours, {int(minutes):02d} minutes, {int(seconds):02d} seconds')

Runtime: 0 hours, 04 minutes, 21 seconds
