In [None]:
# Allows us to use the BigQuery "magic" (%%bigquery)
%load_ext google.cloud.bigquery

# Reload imported Python modules before each cell is executed, so that edits made to
# local source files take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# Locate the root of the `pulse-data` checkout from the current working directory and put
# it at the front of sys.path so that the `recidiviz` imports below can be resolved.
repo_dir_name = 'pulse-data'
parts = os.path.normpath(os.getcwd()).split(os.path.sep)
if repo_dir_name not in parts:
    # Fail with an actionable message instead of the bare "is not in list" from list.index()
    raise ValueError(
        f'This notebook must be run from inside a [{repo_dir_name}] checkout, '
        f'but the current working directory is [{os.getcwd()}].'
    )
relative_path_parts = parts[:parts.index(repo_dir_name) + 1]
# Re-join on the platform separator: the first element of `parts` already carries the root
# ('' on POSIX, the drive on Windows), so the result is the absolute path to the checkout.
abs_path = os.path.sep.join(relative_path_parts)

if sys.path[0] != abs_path:
    sys.path.insert(0, abs_path)

from datetime import datetime
from google.cloud import bigquery
from typing import Optional

from recidiviz.big_query.big_query_client import BigQueryClientImpl
from recidiviz.big_query.view_update_manager import TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
from recidiviz.ingest.direct.controllers.direct_ingest_view_collector import DirectIngestPreProcessedIngestViewCollector
from recidiviz.ingest.direct.views.direct_ingest_big_query_view_types import (
    DirectIngestPreProcessedIngestView,
    DirectIngestPreProcessedIngestViewBuilder,
    RawTableViewType,
)
from recidiviz.utils import regions
from recidiviz.utils.environment import GCP_PROJECT_STAGING
from recidiviz.utils.metadata import local_project_id_override, project_id
from google.cloud.bigquery import magics
# Set the progress bar type used by the %%bigquery magic to None
# NOTE(review): presumably this turns the progress bar off for every %%bigquery cell - confirm
magics.context.progress_bar_type = None

# --- Notebook parameters ---
# NOTE: the %%bigquery cells further down hard-code the dataset / table names derived from
# `dataset_prefix` and `view_tag`; if either value changes, those cells must be edited too.

# Region whose direct ingest views are collected
region_code = 'US_ID'
# File tag of the ingest view being validated
view_tag = 'movement_facility_location_offstat_incarceration_periods'
# The two datetimes the view is queried with when checking stability over time
lower_bound_dt = datetime(year=2020, month=6, day=11)
upper_bound_dt = datetime(year=2020, month=9, day=21)
# Prefix of the sandbox dataset that the materialized tables are written to
dataset_prefix = 'nbhargava'


In [None]:
# Get the view
with local_project_id_override(GCP_PROJECT_STAGING):
    region = regions.get_region(region_code, is_direct_ingest=True)
    collector = DirectIngestPreProcessedIngestViewCollector(region, [])

    # Index the builders (rather than built views) by file tag so that only the single
    # view under validation has to be built.
    builders_by_tag = {
        builder.file_tag: builder
        for builder in collector.collect_view_builders()}

    if view_tag not in builders_by_tag:
        # Same exception type as a plain dict lookup, but says what went wrong and what
        # the valid choices are.
        raise KeyError(
            f'No ingest view with tag [{view_tag}] found for region [{region_code}]. '
            f'Available tags: {sorted(builders_by_tag)}'
        )

    view = builders_by_tag[view_tag].build()

In [None]:
# Create (if it does not exist yet) the sandbox dataset that holds the materialized tables.
# The dataset is created with a default table expiration, so the tables written below are
# temporary.
validation_sandbox_dataset_id = f'{dataset_prefix}_{view_tag}_validation'

with local_project_id_override(GCP_PROJECT_STAGING):
    bq_client = BigQueryClientImpl()
    validation_dataset = bq_client.dataset_ref_for_id(validation_sandbox_dataset_id)

    # Fully-qualified name, used only for the progress messages
    dataset_display_name = f'{validation_dataset.project}.{validation_dataset.dataset_id}'

    print(f'Creating dataset [{dataset_display_name}] ...')
    bq_client.create_dataset_if_necessary(
        validation_dataset,
        default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS,
    )
    print(f'Done creating dataset [{dataset_display_name}] ...')

In [None]:
# Helper used by later cells to materialize a query's results into a table
def materialize_query_with_name(
    dataset_ref: bigquery.DatasetReference,
    view_tag: str,
    query_name: str,
    query: str,
    query_dt: Optional[datetime] = None
):
    """Runs |query| and writes the results to the table `{view_tag}_{query_name}`.

    Uses the notebook-level `bq_client`, so the cell that creates it must have been run
    first. The table is written with overwrite=True, and the call waits on the job's
    result before returning.

    Args:
        dataset_ref: Dataset to write to. Only its dataset_id is passed to the client;
            the project is used for the progress message.
        view_tag: First half of the destination table name.
        query_name: Second half of the destination table name.
        query: The query text to run.
        query_dt: If provided, bound to the query as the DATETIME query parameter
            `update_timestamp`. If omitted, the query runs without parameters.
    """
    destination_table_id = f'{view_tag}_{query_name}'
    destination = f'{dataset_ref.project}.{dataset_ref.dataset_id}.{destination_table_id}'
    print(f'Writing {query_name} query to [{destination}]...')

    if query_dt:
        query_parameters = [
            bigquery.ScalarQueryParameter('update_timestamp', 'DATETIME', query_dt)
        ]
    else:
        query_parameters = None

    bq_client.create_table_from_query_async(
        dataset_id=dataset_ref.dataset_id,
        table_id=destination_table_id,
        query=query,
        query_parameters=query_parameters,
        overwrite=True,
    ).result()
    print(f'Finished writing {query_name} query.')

In [None]:
# Run the exact same query twice and materialize each run to its own table, so that the
# two result sets can be compared to analyze query determinism
with local_project_id_override(GCP_PROJECT_STAGING):
    latest_query_config = DirectIngestPreProcessedIngestView.QueryStructureConfig(
        raw_table_view_type=RawTableViewType.PARAMETERIZED,
        param_name_override='update_timestamp',
    )
    # Inline CURRENT_DATE() in place of the query parameter, so no parameter value has to
    # be supplied when running the query
    latest_query = view.expanded_view_query(config=latest_query_config).replace(
        '@update_timestamp', 'CURRENT_DATE()')

    for query_name in ('latest', 'latest_duplicate'):
        materialize_query_with_name(
            dataset_ref=validation_dataset,
            view_tag=view_tag,
            query_name=query_name,
            query=latest_query,
        )

    print('Load complete')

In [None]:
%%bigquery
# Row count of the first materialization (`latest`); compare with the `latest_duplicate` count.
# The table name is hard-coded and must match `dataset_prefix` / `view_tag` set at the top.
SELECT COUNT(*)
FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest`

In [None]:
%%bigquery
# Row count of the second materialization (`latest_duplicate`); compare with the `latest` count.
# The table name is hard-coded and must match `dataset_prefix` / `view_tag` set at the top.
SELECT COUNT(*)
FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest_duplicate`

In [None]:
%%bigquery
# Checks that two executions of the query are deterministic (should return 0 rows).
# The difference is taken in BOTH directions so that a row present in only one of the two
# materialized tables is reported regardless of which table it is in; `diff_side` says which.
# The prev_* / next_* facility columns are left out of the comparison.
SELECT 'only_in_latest' AS diff_side, *
FROM (
    SELECT * EXCEPT (prev_fac_cd, prev_fac_typ, next_fac_cd, next_fac_typ)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest`
    EXCEPT DISTINCT
    SELECT * EXCEPT (prev_fac_cd, prev_fac_typ, next_fac_cd, next_fac_typ)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest_duplicate`
)
UNION ALL
SELECT 'only_in_latest_duplicate' AS diff_side, *
FROM (
    SELECT * EXCEPT (prev_fac_cd, prev_fac_typ, next_fac_cd, next_fac_typ)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest_duplicate`
    EXCEPT DISTINCT
    SELECT * EXCEPT (prev_fac_cd, prev_fac_typ, next_fac_cd, next_fac_typ)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest`
)


In [None]:
%%bigquery --params {"docno": "64474"}
# This query can be used to debug what has changed between two runs of the query.
# It lists the rows for a single docno (set in the --params above) from both materialized
# tables; column `t` is '2NEW' for rows from `latest` and '1OLD' for rows from
# `latest_duplicate`.
SELECT
    *
FROM (
    SELECT '2NEW' AS t, *
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest`
    UNION ALL
    SELECT '1OLD' AS t, *
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_latest_duplicate`
)
WHERE docno = @docno
ORDER BY incrno, period_id, start_date, end_date

In [None]:
# Query and materialize the view at two different date bounds so we can analyze view
# stability over time
with local_project_id_override(GCP_PROJECT_STAGING):
    # No param_name_override here: the query keeps its default parameter name.
    # NOTE(review): materialize_query_with_name binds the bound as `update_timestamp`, so
    # this assumes the default parameter name is `update_timestamp` - confirm.
    parameterized_query_config = DirectIngestPreProcessedIngestView.QueryStructureConfig(
        raw_table_view_type=RawTableViewType.PARAMETERIZED,
    )
    parameterized_query = view.expanded_view_query(config=parameterized_query_config)

    for bound_name, bound_dt in (
        ('lower_bound', lower_bound_dt),
        ('upper_bound', upper_bound_dt),
    ):
        materialize_query_with_name(
            dataset_ref=validation_dataset,
            view_tag=view_tag,
            query_name=bound_name,
            query=parameterized_query,
            query_dt=bound_dt,
        )

    print('Load complete')

In [None]:
%%bigquery
# Row count of the view as materialized with `lower_bound_dt`.
# The table name is hard-coded and must match `dataset_prefix` / `view_tag` set at the top.
SELECT COUNT(*)
FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_lower_bound`

In [None]:
%%bigquery
# Row count of the view as materialized with `upper_bound_dt`.
# The table name is hard-coded and must match `dataset_prefix` / `view_tag` set at the top.
SELECT COUNT(*)
FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_upper_bound`

In [None]:
%%bigquery
# Returns rows that changed between two date bounded queries that we don't expect to change (dates are in the past).
# Concretely: rows that are in the `upper_bound` table but not in the `lower_bound` table,
# restricted to periods that both start and end before 2019-01-01.
# Note this is one-directional: rows that exist only in `lower_bound` are not returned.
SELECT
    *
    # Alternative to `*`: swap in the two lines below to get summary counts instead of rows
    # COUNT(*) AS num_unexpected_changes,
    # COUNT(DISTINCT docno) AS num_people_unexpected_changes, 
FROM (
    # Uncomment the EXCEPT (...) on both SELECTs to leave a column out of the comparison
    SELECT * #EXCEPT (statuses)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_upper_bound`
    EXCEPT DISTINCT
    SELECT * #EXCEPT (statuses)
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_lower_bound`
)
WHERE start_date < '2019-01-01' AND end_date < '2019-01-01'
# NOTE(review): the line below looks like leftover column names for the EXCEPT (...) lists above - confirm
# , county_of_residence, condition_codes

In [None]:
%%bigquery --params {"docno": "66498"}
# This query can be used to debug what has changed between two runs of the query.
# It lists the rows for a single docno (set in the --params above) from both date-bounded
# tables; column `t` is '2NEW' for rows from `upper_bound` and '1OLD' for rows from
# `lower_bound`.
SELECT *
FROM (
    SELECT '2NEW' AS t, *
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_upper_bound`
    UNION ALL
    SELECT '1OLD' AS t, *
    FROM `recidiviz-staging.nbhargava_movement_facility_location_offstat_incarceration_periods_validation.movement_facility_location_offstat_incarceration_periods_lower_bound`
)
WHERE docno = @docno
# Only periods with period_id above 30 are shown; adjust or remove this filter when
# debugging a different docno
AND period_id > 30
ORDER BY incrno, period_id, start_date, end_date
