In [1]:
# Allows us to use the BigQuery "magic" (%%bigquery)
%load_ext google.cloud.bigquery

# Required so python actually gets re-executed every time
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pandas as pd
parts = os.path.normpath(os.getcwd()).split(os.path.sep)
relative_path_parts = parts[:parts.index('pulse-data') + 1]
abs_path = os.path.join('/', *relative_path_parts)

if sys.path[0] != abs_path:
    sys.path.insert(0, abs_path)

from datetime import datetime
from google.cloud import bigquery
from typing import Optional

from recidiviz.big_query.big_query_client import BigQueryClientImpl
from recidiviz.big_query.view_update_manager import TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
from recidiviz.ingest.direct.controllers.direct_ingest_view_collector import DirectIngestPreProcessedIngestViewCollector
from recidiviz.ingest.direct.views.direct_ingest_big_query_view_types import DirectIngestPreProcessedIngestView, RawTableViewType
from recidiviz.utils import regions
from recidiviz.utils.environment import GCP_PROJECT_STAGING
from recidiviz.utils.metadata import local_project_id_override, project_id
from google.cloud.bigquery import magics
magics.context.progress_bar_type = None

region_code = 'US_TN'
view_tag = 'OffenderMovementIncarcerationPeriod'
dataset_prefix = 'caroletouma_2'
validation_dataset = 'us_tn_validation_scratch'


ingest_view_individual_IDs_by_day_by_facility = """
    SELECT 
        DISTINCT OffenderID,
        LAST_DAY(PeriodDate) as day,
        Site as facility,
    FROM `recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.OffenderMovementIncarcerationPeriod_latest` raw_periods,
    # Date range is from Jan 2014 to June 2021.
    UNNEST(GENERATE_DATE_ARRAY(Date(2014, 1, 1), Date(2021, 6, 1), INTERVAL 1 MONTH)) PeriodDate
    WHERE 
        (
            -- The validation counts are at 10:30 PM. Ensure that the comparison datetime is between start and end
            -- date times, otherwise (if the period is unbounded) that the validation date and time is after the start
            -- date time.
            (TIMESTAMP(CONCAT(STRING(LAST_DAY(PeriodDate)), " 20:30:00.000000"))
                BETWEEN TIMESTAMP(raw_periods.StartDateTime) AND TIMESTAMP(raw_periods.EndDateTime))
            OR (
                TIMESTAMP(CONCAT(STRING(LAST_DAY(PeriodDate)), " 20:30:00.000000")) 
                >= TIMESTAMP(raw_periods.StartDateTime) 
                and TIMESTAMP(raw_periods.EndDateTime) is NULL
            )
        )
        -- IGNORE TPFW on August 31, 2020. This is because TWPF got renamed to DJRC. The validation counts double
        -- counted TPFW and DJRC on August 31st, 2020.
       AND (Site != "TPFW" and LAST_DAY(PeriodDate) != Date(2020,8,31))

    GROUP BY day, facility, OffenderID
    ORDER BY day
"""

ingest_view_population_by_day_by_facility = """
    SELECT 
        LAST_DAY(PeriodDate) as day,
        Site as facility,
        COUNT(DISTINCT OffenderID) AS ingest_view_population
    FROM `recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.OffenderMovementIncarcerationPeriod_latest` raw_periods,
    
    # Date range is from Jan 2014 to June 2021.
    UNNEST(GENERATE_DATE_ARRAY(Date(2014, 1, 1), Date(2021, 6, 1), INTERVAL 1 MONTH)) PeriodDate
    WHERE 
        (
            -- The validation counts are at 10:30 PM. Ensure that the comparison datetime is between start and end
            -- date times, otherwise (if the period is unbounded) that the validation date and time is after the start
            -- date time.
            (TIMESTAMP(CONCAT(STRING(LAST_DAY(PeriodDate)), " 20:30:00.000000"))
                BETWEEN TIMESTAMP(raw_periods.StartDateTime) AND TIMESTAMP(raw_periods.EndDateTime))
            OR (
                TIMESTAMP(CONCAT(STRING(LAST_DAY(PeriodDate)), " 20:30:00.000000")) 
                >= TIMESTAMP(raw_periods.StartDateTime) 
                and TIMESTAMP(raw_periods.EndDateTime) is NULL
            )
        )
        -- IGNORE TPFW on August 31, 2020. This is because TWPF got renamed to DJRC. The validation counts double
        -- counted TPFW and DJRC on August 31st, 2020.
       AND (Site != "TPFW" and LAST_DAY(PeriodDate) != Date(2020,8,31))
    GROUP BY day, facility
    ORDER BY day
"""

population_comparison_query = """
WITH
  validation_query AS (
  SELECT
    # The validation view totals for the first of the month are as of the last day of the previous month.
    DATE_SUB(date_of_stay, INTERVAL 1 DAY) AS day,
    REPLACE(REGEXP_REPLACE(facility, r'([\\'\\"])', ''), ' ', '') AS facility,
    population_count AS validation_population
  FROM
    `recidiviz-staging.us_tn_validation_scratch.validation_incarceration_population_by_day_by_facility_materialized`
    )
SELECT
  ingest_view_query.day AS day,
  validation_query.facility AS facility,
  ingest_view_query.ingest_view_population,
  validation_query.validation_population,
  ingest_view_query.ingest_view_population - validation_query.validation_population AS total_diff,
  ABS(ROUND(100 * SAFE_DIVIDE(ingest_view_query.ingest_view_population - validation_query.validation_population,
      validation_query.validation_population), 2)) AS percentage_off
FROM
  validation_query
INNER JOIN
  `recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.ingest_view_population_by_day_by_facility_materialized` ingest_view_query
ON
  ingest_view_query.day = validation_query.day
WHERE
  ingest_view_query.facility = validation_query.facility
ORDER BY
  percentage_off DESC

"""

pd.options.display.max_rows = 999

In [3]:
# Get the view
with local_project_id_override(GCP_PROJECT_STAGING):
    region = regions.get_region(region_code, is_direct_ingest=True)
    collector = DirectIngestPreProcessedIngestViewCollector(region, [])

    views_by_tag = {
        builder.file_tag: builder.build()
        for builder in collector.collect_view_builders()}

    view = views_by_tag[view_tag]

In [4]:
# Create a dataset for materialized tables
validation_sandbox_dataset_id = f'{dataset_prefix}_{view_tag}_validation'

with local_project_id_override(GCP_PROJECT_STAGING):
    bq_client = BigQueryClientImpl()

    validation_dataset = bq_client.dataset_ref_for_id(validation_sandbox_dataset_id)
    print(f'Validation dataset: {validation_dataset}')

    print(f'Creating dataset [{validation_dataset.project}.{validation_dataset.dataset_id}] ...')
    bq_client.create_dataset_if_necessary(
        validation_dataset,
        default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
    )
    print(f'Done creating dataset [{validation_dataset.project}.{validation_dataset.dataset_id}] ...')

Validation dataset: DatasetReference('recidiviz-staging', 'caroletouma_2_OffenderMovementIncarcerationPeriod_validation')
Creating dataset [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation] ...
Done creating dataset [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation] ...


In [5]:
# Defines a function that can be used to materialize a table for use in later queries
def materialize_query_with_name(
    dataset_ref: bigquery.DatasetReference,
    view_tag: str,
    query_name: str,
    query: str,
    query_dt: Optional[datetime] = None
):
    table_id = f'{view_tag}_{query_name}'
    print(f'Writing {query_name} query to [{dataset_ref.project}.{dataset_ref.dataset_id}.{table_id}]...')
    
    parameters = [bigquery.ScalarQueryParameter('StartDate', 'DATETIME', query_dt)] if query_dt else None
    create_job = bq_client.create_table_from_query_async(
        dataset_id=dataset_ref.dataset_id,
        table_id=table_id,
        query=query,
        query_parameters=parameters,
        overwrite=True
    )
    create_job.result()
    print(f'Finished writing {query_name} query.')

In [6]:
# Query the view two times and materialize the results to tables so we can analyze query determinism 
with local_project_id_override(GCP_PROJECT_STAGING):
    latest_query = view.expanded_view_query(
        config=DirectIngestPreProcessedIngestView.QueryStructureConfig(
            raw_table_view_type=RawTableViewType.LATEST,
        )
    )
    
    materialize_query_with_name(
        dataset_ref=validation_dataset,
        view_tag=view_tag,
        query_name='latest',
        query=latest_query,
    )
    
    materialize_query_with_name(
        dataset_ref=validation_dataset,
        view_tag=view_tag,
        query_name='latest_duplicate',
        query=latest_query,
    )

    
    # Given the materalized ingest view, collect info into Facility/Date: Individual ID
    materialize_query_with_name(
        dataset_ref=validation_dataset,
        view_tag='ingest_view_individual_IDs_by_day_by_facility',
        query_name='materialized',
        query=ingest_view_individual_IDs_by_day_by_facility,
    )
    
    # Given the materalized ingest view, collect info into Facility/Date: Population Count
    materialize_query_with_name(
        dataset_ref=validation_dataset,
        view_tag='ingest_view_population_by_day_by_facility',
        query_name='materialized',
        query=ingest_view_population_by_day_by_facility,
    )
    
    # Compare the population counts for the raw counts vs. ingest view
    materialize_query_with_name(
        dataset_ref=validation_dataset,
        view_tag='comparison_population_counts_tom_vs_generated',
        query_name='materialized',
        query=population_comparison_query,
    )

    print('Load complete')

Writing latest query to [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.OffenderMovementIncarcerationPeriod_latest]...
Finished writing latest query.
Writing latest_duplicate query to [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.OffenderMovementIncarcerationPeriod_latest_duplicate]...
Finished writing latest_duplicate query.
Writing materialized query to [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.ingest_view_individual_IDs_by_day_by_facility_materialized]...
Finished writing materialized query.
Writing materialized query to [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.ingest_view_population_by_day_by_facility_materialized]...
Finished writing materialized query.
Writing materialized query to [recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.comparison_population_counts_tom_vs_generated_materialized]...
Finished writing 

In [7]:
%%bigquery --params {"OffenderID": "00506905"}
# Insert the specific `OffenderID`

# QUERY DESCRIPTION:
# This query pulls in the death date associated with the OffenderID.
SELECT 
    OffenderID,
    DeathDate,
    DeathType
FROM `recidiviz-staging.us_tn_raw_data_up_to_date_views.OffenderAttributes_latest`
WHERE OffenderID LIKE @OffenderID

Unnamed: 0,OffenderID,DeathDate,DeathType
0,506905,TN,62


In [9]:
%%bigquery --params {"OffenderID": "00506905"}
# Insert the specific `OffenderID`

# QUERY DESCRIPTION:
# This query lists the movements associated with that OffenderID in the raw `OffenderMovement` table.
# Ignore `WRFA` and `FAWR` because those related to work releases (that are filtered out).
SELECT *
FROM `recidiviz-staging.us_tn_raw_data_up_to_date_views.OffenderMovement_latest`
WHERE OffenderID LIKE @OffenderID
ORDER BY MovementDateTime DESC

Unnamed: 0,OffenderID,MovementDateTime,MovementType,MovementReason,FromLocationID,ToLocationID,ArrivalDepartureFlag,LastUpdateUserID,LastUpdateDate
0,506905,2021-01-05 11:22:00.000000,FAFA,JAILT,"""005 ""","""BCCX """,D,"""BI18AHC """,2021-01-05 11:22:39.559132
1,506905,2020-09-14 08:00:00.000000,CCFA,REVOK,"""ETHR ""","""005 """,D,"""TNLJAOL """,2020-09-15 11:17:11.431096
2,506905,2018-02-23 09:05:00.000000,FACC,REINS,"""005 ""","""ETHR """,D,"""BICJ754 """,2018-03-21 11:02:37.957474
3,506905,2017-12-15 09:05:00.000000,CCFA,REVOK,"""SECC ""","""005 """,D,"""BICJ754 """,2017-12-15 14:08:31.711181
4,506905,2017-08-28 09:05:00.000000,CCFA,VIOLT,"""SECC ""","""005 """,D,"""TNWCCEF """,2017-08-30 09:01:06.466329
5,506905,2017-08-28 09:00:00.000000,ABCC,RTABS,"""AB ""","""SECC """,A,"""TNWCCEF """,2017-08-30 09:00:45.445953
6,506905,2016-11-02 09:00:00.000000,CCAB,FAILR,"""SECC ""","""AB """,D,"""TNWCCEF """,2017-02-16 08:35:20.920834
7,506905,2016-02-26 09:00:00.000000,CCCC,INTER,"""ETHR ""","""SECC """,D,"""TNWCCA5 """,2016-03-02 12:29:57.597410
8,506905,2016-01-27 12:05:00.000000,CCCC,INTER,"""SECC ""","""ETHR """,D,"""TNWCCEF """,2016-02-23 07:12:54.374917
9,506905,2016-01-27 12:00:00.000000,FACC,REINS,"""062 ""","""SECC """,D,"""BI01D93 """,2016-01-28 08:02:05.925960


In [8]:
%%bigquery --params {"OffenderID": "00506905"}
# Insert the specific `OffenderID`.

# QUERY DESCRIPTION:
# This query lists output from the **ingest view** we are testing of `OffenderMovement`, given an `OffenderID`. 
SELECT *
FROM `recidiviz-staging.caroletouma_2_OffenderMovementIncarcerationPeriod_validation.OffenderMovementIncarcerationPeriod_latest`
WHERE OffenderID LIKE @OffenderID
ORDER BY StartDateTime ASC

Unnamed: 0,OffenderID,StartDateTime,EndDateTime,Site,StartMovementType,StartMovementReason,EndMovementType,EndMovementReason,IncarcerationSequenceNumber
0,506905,2012-06-04 15:25:00.000000,2012-08-22 00:01:00.000000,062,PRFA,SPLIT,FAPR,RETSP,1
1,506905,2013-05-31 11:00:00.000000,2013-06-24 08:00:00.000000,062,PRFA,PRVOK,FAFA,JAILT,2
2,506905,2013-06-24 08:00:00.000000,2013-07-29 06:35:00.000000,MCCX,FAFA,JAILT,FACT,OUTNO,3
3,506905,2013-07-29 06:35:00.000000,2013-07-29 10:50:00.000000,062,FACT,OUTNO,CTFA,RETNO,4
4,506905,2013-07-29 10:50:00.000000,2013-07-31 08:35:00.000000,MCCX,CTFA,RETNO,FAFA,CLASN,5
5,506905,2013-07-31 08:35:00.000000,2013-08-07 13:30:00.000000,BCCX,FAFA,CLASN,FACC,JUDDE,6
6,506905,2015-02-20 09:00:00.000000,2016-01-27 12:00:00.000000,062,CCFA,REVOK,FACC,REINS,7
7,506905,2017-12-15 09:05:00.000000,2018-02-23 09:05:00.000000,005,CCFA,REVOK,FACC,REINS,8
8,506905,2020-09-14 08:00:00.000000,2021-01-05 11:22:00.000000,005,CCFA,REVOK,FAFA,JAILT,9
9,506905,2021-01-05 11:22:00.000000,,BCCX,FAFA,JAILT,,,10


In [10]:
%%bigquery --params {"OffenderID": 506905}
# Insert the specific `OffenderID` (note: remove the extra "00" at the front of the ID, if applicable)

# QUERY DESCRIPTION:
# This query lists specific dates and facilities that the OffenderID was counted for in the **RAW** validation
# incarceration population counts.
SELECT *
FROM `recidiviz-staging.us_tn_validation_scratch.raw_incarceration_individual_IDs_by_day_by_facility`
WHERE OffenderID = @OffenderID
ORDER BY ReportingDate ASC


Unnamed: 0,ReportingDate,SiteID,OffenderID
0,2021-02-01,BCCX,506905
1,2021-03-01,BCCX,506905
2,2021-04-01,BCCX,506905
3,2021-05-01,BCCX,506905
4,2021-06-01,BCCX,506905


In [7]:
%%bigquery
SELECT COUNT(*)
FROM `recidiviz-staging.supervision_and_incarceration_OffenderMovement_validation.OffenderMovement_latest`

Unnamed: 0,f0_
0,3498866


In [9]:
%%bigquery
SELECT COUNT(*)
FROM `recidiviz-staging.supervision_and_incarceration_OffenderMovement_validation.OffenderMovement_latest_duplicate`

Unnamed: 0,f0_
0,3498866


In [10]:
%%bigquery
# Checks that two executions of the query are deterministic (should return 0 rows)
SELECT * # EXCEPT (<add column names here to narrow down what is changing>)
FROM `recidiviz-staging.supervision_and_incarceration_OffenderMovement_validation.OffenderMovement_latest`
EXCEPT DISTINCT
SELECT * # EXCEPT (<add column names here to narrow down what is changing>)
FROM `recidiviz-staging.supervision_and_incarceration_OffenderMovement_validation.OffenderMovement_latest_duplicate`


Unnamed: 0,OffenderID,StartDate,EndDate,PeriodType,StartMovementType,StartMovementReason,EndMovementType,EndMovementReason,FromLocationID,ToLocationID
