In [None]:
### READ FIRST: ########################################################
### REPLACE ALL SECTIONS IN THIS NOTEBOOK THAT ARE PREFIXED WITH "TO_DO" #
########################################################################

# Allows us to use the BigQuery "magic" (%%bigquery)
%load_ext google.cloud.bigquery

# Reload imported Python modules before each cell runs so that local code edits take effect
%load_ext autoreload
%autoreload 2

In [None]:
# Defines a function that can be used to materialize a table for use in later queries.
# The `bigquery` annotation is quoted because `bigquery` is only imported in a later
# cell; an unquoted annotation would be evaluated when this cell runs and raise a
# NameError on a fresh kernel.
def materialize_query_with_name(
    dataset_ref: "bigquery.DatasetReference",
    view_tag: str,
    query_name: str,
    query: str,
) -> None:
    """Runs |query| and writes its results to a table for use in later queries.

    The destination table id is "{view_tag}_{query_name}" inside the dataset
    identified by |dataset_ref|. The table is requested with overwrite=True.

    This function reads the notebook-level `bq_client` variable, so a cell that
    assigns `bq_client` must run before this function is called.

    Args:
        dataset_ref: Reference to the dataset the table is written to.
        view_tag: Prefix of the destination table id.
        query_name: Suffix of the destination table id; also used in log output.
        query: The SQL query whose results are materialized.
    """
    table_id = f"{view_tag}_{query_name}"
    print(
        f"Writing {query_name} query to [{dataset_ref.project}.{dataset_ref.dataset_id}.{table_id}]..."
    )

    create_job = bq_client.create_table_from_query_async(
        dataset_id=dataset_ref.dataset_id,
        table_id=table_id,
        query=query,
        overwrite=True,
    )
    # Wait on the returned job before reporting completion.
    create_job.result()
    print(f"Finished writing {query_name} query.")

In [None]:
import os
import sys
import pandas as pd

# Split the current working directory into its path components so the repository
# root can be located by directory name.
parts = os.path.normpath(os.getcwd()).split(os.sep)

#### TO_DO: REPLACE THIS WITH YOUR LOCAL REPO DIRECTORY NAME #####
repo_dir_index = parts.index("pulse-data")
relative_path_parts = parts[: repo_dir_index + 1]
abs_path = os.path.join("/", *relative_path_parts)

# Make the repo root the first sys.path entry (unless it already is) so the
# project-local imports that follow can be resolved.
if not sys.path[0] == abs_path:
    sys.path.insert(0, abs_path)

from datetime import datetime
from google.cloud import bigquery
from typing import Optional

from recidiviz.big_query.big_query_client import BigQueryClientImpl
from recidiviz.big_query.view_update_manager import (
    TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS,
)
from recidiviz.ingest.direct.views.direct_ingest_big_query_view_types import (
    DirectIngestPreProcessedIngestView,
    RawTableViewType,
)
from recidiviz.utils import regions
from recidiviz.utils.environment import GCP_PROJECT_STAGING
from recidiviz.utils.metadata import local_project_id_override, project_id
from google.cloud.bigquery import magics

# Clear the progress bar type on the BigQuery magics context — presumably this
# disables the progress bar shown by %%bigquery cells (confirm against the
# google-cloud-bigquery magics documentation).
magics.context.progress_bar_type = None

# TO_DO: Replace "US_XX" with the region code of the state you are validating.
region_code = "US_XX"

# The state-specific dataset names are derived from region_code so that a single
# edit keeps them consistent (e.g. "US_XX" -> "us_xx_scratch"). Assign explicit
# names here instead if your datasets do not follow this pattern.
state_specific_scratch_dataset = f"{region_code.lower()}_scratch"
state_specific_validation_dataset = f"{region_code.lower()}_validation"


# QUERY: CONFIRM THAT INGEST VIEW IS DETERMINISTIC
# The query below returns the distinct rows of the INGEST_VIEW_V1 table that do not
# appear in the INGEST_VIEW_V1_DUPLICATE table; both names are looked up in the
# state-specific scratch dataset.
# NOTE(review): the comparison is one-directional — rows that exist only in the
# duplicate table are not reported.
# TO_DO: Replace with ingest view queries that are the same
INGEST_VIEW_V1 = ""
INGEST_VIEW_V1_DUPLICATE = ""

check_ingest_view_determinism = f"""
    SELECT * # EXCEPT (TO_DO: <add column names here to narrow down what is changing>)
    FROM `recidiviz-staging.{state_specific_scratch_dataset}.{INGEST_VIEW_V1}`
    EXCEPT DISTINCT
    SELECT * # EXCEPT (TO_DO: <add column names here to narrow down what is changing>)
    FROM `recidiviz-staging.{state_specific_scratch_dataset}.{INGEST_VIEW_V1_DUPLICATE}`
"""


# QUERY: COMPARE VALIDATION AND INGEST VIEW POPULATIONS BY FACILITY/LOCATION
# TO_DO: Replace POPULATION_COMPARISON_VALIDATION_QUERY with a state-specific query to transform validation data for population comparisons
# External validation data should be uploaded to us_xx_validation dataset.
# The query must produce the columns date_of_stay_or_supervision,
# facility_or_location and validation_population, which the combined comparison
# query reads.
# Example Query:
POPULATION_COMPARISON_VALIDATION_QUERY = f"""
    -- Make any transformations you need for the validation data here
    SELECT
        date_of_stay_or_supervision,
        facility_or_location,
        COUNT(DISTINCT person_external_id) AS validation_population
    FROM `recidiviz-staging.{state_specific_validation_dataset}.TO_DO_replace_with_validation_view`
    GROUP BY date_of_stay_or_supervision, facility_or_location
"""

# TO_DO: Replace POPULATION_COMPARISON_INGEST_QUERY with the columns from the ingest view you are validating.
# The query must produce the columns date_of_stay_or_supervision,
# facility_or_location and ingest_population, which the combined comparison query
# reads. Each ingest view row is cross-joined with the generated dates, so the WHERE
# clause is required to count a person only on dates that fall inside their period;
# without it every person would be counted on every date.
# Example query:
POPULATION_COMPARISON_INGEST_QUERY = f"""
    SELECT
        period_date AS date_of_stay_or_supervision,
        facility_or_location,
        COUNT(DISTINCT ingest_view_person_id) AS ingest_population,
    FROM `recidiviz-staging.{state_specific_scratch_dataset}.TO_DO_replace_with_ingest_view`
    ,

    -- TO_DO: REPLACE WITH THE DATE RANGE OF THE VALIDATION DATA
    UNNEST(GENERATE_DATE_ARRAY(Date(2010, 1, 1), Date(2021, 9, 1), INTERVAL 1 MONTH)) period_date

    WHERE (
        -- TO_DO: REPLACE WITH QUERY START/END DATE COLUMN NAMES
        (period_date BETWEEN start_date AND end_date AND period_date != end_date)
        OR
        (period_date >= start_date AND end_date IS NULL)
    )

    GROUP BY date_of_stay_or_supervision, facility_or_location
    ORDER BY date_of_stay_or_supervision

"""

# Joins the validation and ingest population counts on date and facility/location
# and reports the absolute and percentage difference for each pair.
# The CTE names must match the names used in FROM / JOIN / ON / WHERE below. They are
# also deliberately different from the `validation_population` column so the bare
# column reference in the SELECT list cannot be confused with a table name.
population_comparison_by_date_and_location = f"""
    WITH validation_query AS (
        {POPULATION_COMPARISON_VALIDATION_QUERY}
    )
    ,population_query AS (
        {POPULATION_COMPARISON_INGEST_QUERY}
    )
    SELECT
        population_query.date_of_stay_or_supervision AS date_of_stay_or_supervision,
        validation_query.facility_or_location,
        ingest_population,
        validation_population,
        ingest_population - validation_population AS total_population_diff,
        ABS(ROUND(100 * SAFE_DIVIDE(
            ingest_population - validation_population, validation_population), 2)
        ) AS percentage_off
    FROM
        validation_query
    INNER JOIN
        population_query
    ON
        population_query.date_of_stay_or_supervision = validation_query.date_of_stay_or_supervision
    WHERE
        population_query.facility_or_location = validation_query.facility_or_location
    ORDER BY
        percentage_off DESC
"""


# QUERY: SEE A LIST OF INDIVIDUAL IDS THAT DIFFER BETWEEN INGEST VIEW AND VALIDATION

# TO_DO: Replace with ingest view query that unnests for each date that is available in the validation data
# Example query:
def get_ingest_view_individual_ids_query(ingest_view_name: str) -> str:
    """Builds a query listing one row per person, facility/location and period date.

    The query reads the table named |ingest_view_name| from the state-specific
    scratch dataset, cross-joins it with a generated array of monthly dates, and
    keeps only the dates that fall inside each row's [start_date, end_date) period
    (open-ended when end_date is NULL) and that match the dates available in the
    validation data.

    Args:
        ingest_view_name: Table id of the ingest view results in the scratch dataset.

    Returns:
        The SQL query string, selecting date_of_stay_or_supervision,
        person_external_id and facility_or_location, in that order.
    """
    # The date filters reference period_date rather than the
    # date_of_stay_or_supervision alias, because the alias is only defined in the
    # SELECT list and there is no date_of_stay column.
    return f"""
    SELECT
        period_date AS date_of_stay_or_supervision,
        person_external_id,
        facility_or_location,
    FROM `recidiviz-staging.{state_specific_scratch_dataset}.{ingest_view_name}`
    ,
    -- TO_DO: REPLACE WITH THE DATE RANGE OF THE VALIDATION DATA
    UNNEST(GENERATE_DATE_ARRAY(Date(2010, 1, 1), Date(2021, 9, 1), INTERVAL 1 MONTH)) period_date

    WHERE (
        -- TO_DO: REPLACE WITH QUERY START/END DATE COLUMN NAMES
        (period_date BETWEEN start_date AND end_date and period_date != end_date)
        OR
        (period_date >= start_date AND end_date IS NULL)
    )

    -- TO_DO: REPLACE WITH DATES AVAILABLE IN VALIDATION DATA
    AND (
        EXTRACT(MONTH FROM period_date) IN (1, 4, 7, 10)
        AND EXTRACT(YEAR FROM period_date) > 2010
    )
    ORDER BY date_of_stay_or_supervision
"""


# TO_DO: Replace with query for validation data
# The query should expose person_external_id, date_of_stay_or_supervision and
# facility_or_location for each person in the validation data.
# NOTE(review): the column order here (person id first) differs from the example
# ingest view query (date first). Set operators such as EXCEPT DISTINCT match columns
# by position, so keep the two orders in sync or select columns explicitly when
# comparing the two result sets.
# Example query:
VALIDATION_INDIVIDUAL_IDS_QUERY = f"""
    SELECT 
        person_external_id,
        date_of_stay_or_supervision,
        facility_or_location
    FROM `recidiviz-staging.{state_specific_validation_dataset}.TO_DO_replace_with_validation_view`
"""

# Lists the (person, date, facility/location) rows that appear in only one of the
# ingest view and the validation data, labelled by which side they are missing from.
# EXCEPT DISTINCT and UNION ALL match columns by position, not by name, so every
# operand selects the same explicit column list instead of `SELECT *`. This keeps the
# comparison correct even when the ingest and validation queries list their columns
# in different orders.
ingest_validation_person_level_comparison = f"""
    WITH ingest_ids AS (
        {get_ingest_view_individual_ids_query("TO_DO: REPLACE WITH INGEST VIEW NAME")}
    ),
    validation_ids AS (
        {VALIDATION_INDIVIDUAL_IDS_QUERY}
    ),

    validation_ids_missing_from_ingest_view AS (
        WITH missing AS (
            SELECT
                person_external_id,
                date_of_stay_or_supervision,
                facility_or_location
            FROM validation_ids

            EXCEPT DISTINCT

            SELECT
                person_external_id,
                date_of_stay_or_supervision,
                facility_or_location
            FROM ingest_ids
        )
        SELECT
            *,
            'validation_ids_missing_from_ingest_view' as type
        FROM missing
    ),

    ingest_view_ids_missing_from_validation AS (
        WITH missing as (
            SELECT
                person_external_id,
                date_of_stay_or_supervision,
                facility_or_location
            FROM ingest_ids

            EXCEPT DISTINCT

            SELECT
                person_external_id,
                date_of_stay_or_supervision,
                facility_or_location
            FROM validation_ids
        )
        SELECT
            *,
            'ingest_view_ids_missing_from_validation' as type
        FROM missing
    )

    SELECT * FROM validation_ids_missing_from_ingest_view

    UNION ALL

    SELECT * FROM ingest_view_ids_missing_from_validation

    ORDER BY person_external_id, date_of_stay_or_supervision
"""


# QUERY: COMPARE DIFFERENT VERSIONS OF AN INGEST VIEW
# Use this query when you have made changes to an ingest view and want to compare the diffs between the two
# The result lists the rows present in only one of the two versions, labelled by
# which version they are missing from.

# TO_DO: Replace with ingest view names to compare
# NOTE: this rebinds INGEST_VIEW_V1, which is also assigned earlier for the
# determinism check. That query string has already been built by this point, so it is
# not affected by the value set here.
INGEST_VIEW_V1 = ""
INGEST_VIEW_V2 = ""

compare_ingest_views = f"""
    WITH ingest_view_v1 AS (
        {get_ingest_view_individual_ids_query(INGEST_VIEW_V1)}
    ),
    ingest_view_v2 AS (
        {get_ingest_view_individual_ids_query(INGEST_VIEW_V2)}
    ),
    v1_ids_missing_from_v2 AS (
        WITH missing AS (
            SELECT
                *
            FROM ingest_view_v1
            EXCEPT DISTINCT
            SELECT *
            FROM ingest_view_v2
        )
        SELECT
            *,
            'v1_ids_missing_from_v2' as type
        FROM missing
    ),
    v2_ids_missing_from_v1 AS (
        WITH missing AS (
            SELECT
                *
            FROM ingest_view_v2
            EXCEPT DISTINCT
            SELECT *
            FROM ingest_view_v1
        )
        SELECT
            *,
            'v2_ids_missing_from_v1' as type
        FROM missing
    )
    SELECT * FROM v1_ids_missing_from_v2

    UNION ALL

    SELECT * FROM v2_ids_missing_from_v1

"""


# Allow up to 999 rows to be shown when a DataFrame is displayed.
pd.set_option("display.max_rows", 999)

In [None]:
# Create the scratch and validation datasets if they do not exist, and set the
# scratch_dataset and validation_dataset variables used by later cells.
with local_project_id_override(GCP_PROJECT_STAGING):
    bq_client = BigQueryClientImpl()

    validation_dataset = bq_client.dataset_ref_for_id(state_specific_validation_dataset)
    scratch_dataset = bq_client.dataset_ref_for_id(state_specific_scratch_dataset)

    # Both datasets go through the same steps, scratch first and then validation.
    for _dataset_label, _dataset_ref in (
        ("Scratch", scratch_dataset),
        ("Validation", validation_dataset),
    ):
        _qualified_name = f"{_dataset_ref.project}.{_dataset_ref.dataset_id}"
        print(f"{_dataset_label} dataset: {_dataset_ref}")
        print(f"Creating dataset [{_qualified_name}] ...")
        bq_client.create_dataset_if_necessary(
            _dataset_ref,
        )
        print(f"Done creating dataset [{_qualified_name}] ...")

In [None]:
with local_project_id_override(GCP_PROJECT_STAGING):
    bq_client = BigQueryClientImpl()

    # (view_tag, query) pairs, materialized into the scratch dataset in this order.
    # Each result table is named "<view_tag>_materialized".
    _queries_to_materialize = (
        ("check_ingest_view_determinism", check_ingest_view_determinism),
        (
            "population_comparison_by_date_and_location",
            population_comparison_by_date_and_location,
        ),
        (
            "ingest_validation_person_level_comparison",
            ingest_validation_person_level_comparison,
        ),
        ("compare_ingest_views", compare_ingest_views),
    )

    for _view_tag, _query_sql in _queries_to_materialize:
        materialize_query_with_name(
            dataset_ref=scratch_dataset,
            view_tag=_view_tag,
            query_name="materialized",
            query=_query_sql,
        )

    print("Load complete")