In [4]:
"""
STATE: AL
POLICY: Parole Grant Rates
VERSION: V1
DATA SOURCE: AL public DOC inmate data and AL reports
DATA QUALITY: medium-low
HIGHEST PRIORITY MISSING DATA: N/A
REFERENCE_DATE: October 2022
TIME_STEP: Month
ADDITIONAL NOTES: Initial policy scoping doc https://docs.google.com/document/d/1mj6Fmm3aCmx08PqhNShV6Rb8MCRuHQ2D56BmxeJKxKg/edit?usp=sharing
"""

import logging
import pandas as pd
import numpy as np
import pdb
import matplotlib as plt
from IPython.display import display
import itertools


%run /Users/jazz/Recidiviz/recidiviz-research/utils/research_utils.py

# Lift every pandas display limit so wide/long frames and long cell values are
# rendered in full (None disables the corresponding limit).
for option_name in (
    "display.max_colwidth",
    "display.max_columns",
    "display.max_rows",
    "max_seq_items",
    "display.width",
):
    pd.set_option(option_name, None)

In [6]:
import sys

sys.path.insert(1, "/Users/jazz/Recidiviz/pulse-data")

from recidiviz.calculator.modeling.population_projection.super_simulation.time_converter import (
    TimeConverter,
)
from recidiviz.calculator.modeling.population_projection.utils.spark_bq_utils import (
    upload_spark_model_inputs,
)
from recidiviz.calculator.modeling.population_projection.utils.spark_preprocessing_utils import (
    convert_dates,
)
from recidiviz.utils.yaml_dict import YAMLDict

# Notebook-level logger, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

# Get the simulation tag from the model inputs config.
# The YAML path is relative, so it is resolved against the kernel's working directory.
yaml_file_path = "al_gun_minimums_v1.yaml"

# `pop_dict` / `pop` are called on the loaded config to read `data_inputs` and,
# from it, `big_query_simulation_tag` (requested as a str).
simulation_config = YAMLDict.from_path(yaml_file_path)
data_inputs = simulation_config.pop_dict("data_inputs")
simulation_tag = data_inputs.pop("big_query_simulation_tag", str)

# Convert the timestamps to time_steps (relative ints), with 0 presumably being the
# `reference_date` read from the config below.
# NOTE(review): this comment used to say the reference was "Sept. 2020", while the
# notebook header states REFERENCE_DATE: October 2022 -- confirm against the YAML.
reference_date = simulation_config.pop("reference_date", float)
time_step = simulation_config.pop("time_step", float)
time_converter = TimeConverter(reference_year=reference_date, time_step=time_step)

In [390]:
%%bigquery persons_raw
-- All columns of us_al_person_details for the COLLECTION_6 collection.
SELECT *
FROM `recidiviz-staging.spark_al_public_data_scraped.us_al_person_details`
WHERE collection_id = "COLLECTION_6"

Query complete after 0.00s: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 390.35query/s]
Downloading: 100%|██████████████████████████████████████████████████| 26092/26092 [00:01<00:00, 13458.84rows/s]


In [391]:
%%bigquery incarcerations_raw
-- All columns of us_al_incarceration_details for the COLLECTION_6 collection.
SELECT *
FROM `recidiviz-staging.spark_al_public_data_scraped.us_al_incarceration_details`
WHERE collection_id = "COLLECTION_6"

Query complete after 0.01s: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 157.23query/s]
Downloading: 100%|██████████████████████████████████████████████████| 46858/46858 [00:02<00:00, 23205.15rows/s]


In [392]:
%%bigquery sentences_raw
-- All columns of us_al_sentences for the COLLECTION_6 collection.
SELECT *
FROM `recidiviz-staging.spark_al_public_data_scraped.us_al_sentences`
WHERE collection_id = "COLLECTION_6"

Query complete after 0.00s: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 263.30query/s]
Downloading: 100%|████████████████████████████████████████████████| 117399/117399 [00:03<00:00, 35141.98rows/s]


In [404]:
persons_raw.head(2)

Unnamed: 0,Inmate,AIS,Institution,Race,Sex,Hair_Color,Eye_Color,Height,Weight,Birth_Year,Custody,run_id,collection_id
0,"ROBERTSON, GUY C",81485,OHIO,B,M,BLACK,BROWN,"5' 10""",156,1931,ESCAPE,2023-05-05 15:53:15.991829,COLLECTION_6
1,"TAYLOR, LEROY",106228,OHIO,B,M,BLACK,BROWN,"6' 0""",165,1943,RECAPTURED PAROLE VIOLATOR,2023-05-05 15:53:15.991829,COLLECTION_6


In [405]:
incarcerations_raw.head(2)

Unnamed: 0,SUF,Admit_Date,Total_Term,Time_Served,Jail_Credit,Good_Time_Received,Good_Time_Revoked,Min_Release_Date,Parole_Consideration_Date,Parole_Status,most_recent,AIS,incarceration_id,run_id,collection_id
0,0,05/03/1977,0Y 0M 0D,10Y 4M 5D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z350,Z350-0,2023-05-05 15:53:15.991829,COLLECTION_6
1,0,09/07/1977,0Y 0M 0D,32Y 8M 18D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z355,Z355-0,2023-05-05 15:53:15.991829,COLLECTION_6


In [406]:
sentences_raw.head(2)

Unnamed: 0,incarceration_id,Case_Number,Sentenced,Offense,Term,Type,Commit_County,AIS,run_id,collection_id
0,319206-R,CC2016-000034,09/09/2019,MURDER,0Y *M 0D,Concurrent,BIBB,319206,2023-05-05 15:53:15.991829,COLLECTION_6
1,332541-S,CC2022-000029,01/26/2023,RAPE I,0Y *M 0D,Consecutive,CHILTON,332541,2023-05-05 15:53:15.991829,COLLECTION_6


In [528]:
# Raw column names, grouped by table and by the parser they are passed to.

# Duration-style columns (values such as "10Y 4M 5D").
inc_time_span_col = ["Total_Term", "Time_Served", "Good_Time_Received", "Good_Time_Revoked"]
sen_time_span_col = ["Term"]

# Date-style columns. Both tables also carry a timestamp-valued `run_id` column;
# because its name is already lower-case, the parsed values replace the raw column
# when results are stored under `col.lower()`.
inc_date_col = ["Admit_Date", "Min_Release_Date", "Parole_Consideration_Date"] + ["run_id"]
sen_date_col = ["Sentenced"] + ["run_id"]

In [393]:
# Work on copies so the frames downloaded from BigQuery stay untouched and the
# cleaning cells below can be re-run without re-querying.
persons, incarcerations, sentences = (
    raw_frame.copy() for raw_frame in (persons_raw, incarcerations_raw, sentences_raw)
)

In [317]:
# Left-pad AIS with zeros to a width of 8 characters in both tables.
for frame in (incarcerations, sentences):
    frame["AIS"] = frame["AIS"].str.zfill(8)

# Replace the literal string "nan" in these raw date columns with None.
for date_col in ("Parole_Consideration_Date", "Min_Release_Date"):
    incarcerations[date_col] = incarcerations[date_col].mask(
        incarcerations[date_col] == "nan", None
    )

# Flag sentences whose offense text contains any of the firearm keywords
# (case-sensitive substring match; the keywords are joined into one regex).
gun_keywords = ["PISTOL", "GUN", "RIFLE", "SHTGN"]
# na=False: a missing Offense is flagged False rather than NaN, so `is_firearm`
# is a plain boolean column that can be used directly as a row mask.
sentences["is_firearm"] = sentences["Offense"].str.contains(
    "|".join(gun_keywords), na=False
)

In [383]:
def parse_time_span(s: pd.Series) -> pd.Series:
    """Convert duration strings into fractional years.

    Two layouts are recognised anywhere in the string:

    * ``"<years>Y <months>M <days>D"`` (e.g. ``"10Y 4M 5D"``)
    * ``"<n> Days"`` (e.g. ``"30 Days"``)

    Args:
        s: Series of duration strings. Nulls are allowed.

    Returns:
        A float Series named ``"sentence"`` on the same index as ``s``, holding
        ``years + months / 12 + days / 365.25``. Where that value is 0, the
        ``"<n> Days"`` figure divided by 365.25 is used instead. Entries that
        match neither layout (nulls, or strings such as ``"0Y *M 0D"``) and any
        component captured without digits contribute 0.
    """
    # Raw strings keep `\d` / `\s` as regex escapes rather than (invalid) Python
    # string escapes.
    parts = pd.concat(
        [
            s.str.extract(r"(?P<year>\d*)Y\s*(?P<month>\d*)M\s*(?P<day>\d*)D"),
            s.str.extract(r"(?P<days>\d*) Days"),
        ],
        axis=1,
    )
    # `\d*` can capture an empty string; coerce anything non-numeric to NaN and
    # then to 0 instead of letting string values leak into the arithmetic below.
    for part_name in parts.columns:
        parts[part_name] = pd.to_numeric(parts[part_name], errors="coerce")
    df = parts.fillna(0)

    df["sentence"] = df.year + df.month / 12 + df.day / 365.25
    # Fall back to the "<n> Days" layout only when the Y/M/D layout produced 0.
    return df.sentence.mask(df.sentence == 0, df.days / 365.25)

In [397]:
# Add one parsed (fractional-year) column per duration column, stored under the
# lower-cased name of the raw column, for both tables.
for frame, span_cols in (
    (incarcerations, inc_time_span_col),
    (sentences, sen_time_span_col),
):
    for col in span_cols:
        frame[col.lower()] = parse_time_span(frame[col])

In [529]:
def parse_date(s: pd.Series) -> pd.Series:
    """Parse a column of date strings into timestamps.

    Args:
        s: Series of date-like values. Missing entries may be real nulls
            (None / NaN / NaT) or the literal string ``"nan"``.

    Returns:
        A datetime Series named ``"date_adj"`` on the same index as ``s``:

        * parseable values become their timestamp;
        * missing values (real nulls or ``"nan"``) stay ``NaT``;
        * present-but-unparseable values become ``pd.Timestamp.max``, so they
          remain distinguishable from missing ones.
    """
    parsed = pd.to_datetime(s, errors="coerce")
    # Real nulls count as missing too: otherwise a column whose "nan" strings
    # were already replaced by None would come back as Timestamp.max.
    is_missing = s.isna() | (s == "nan")
    unparseable = parsed.isna() & ~is_missing
    # Working on the Series directly (no helper DataFrame keyed by `s.name`)
    # keeps this valid for unnamed Series and for ones named "date"/"date_adj".
    return parsed.mask(unparseable, pd.Timestamp.max).rename("date_adj")

In [530]:
# Add one parsed datetime column per date column, stored under the lower-cased
# name of the raw column, for both tables.
for frame, date_cols in (
    (incarcerations, inc_date_col),
    (sentences, sen_date_col),
):
    for col in date_cols:
        frame[col.lower()] = parse_date(frame[col])

In [531]:
incarcerations.head()

Unnamed: 0,SUF,Admit_Date,Total_Term,Time_Served,Jail_Credit,Good_Time_Received,Good_Time_Revoked,Min_Release_Date,Parole_Consideration_Date,Parole_Status,most_recent,AIS,incarceration_id,run_id,collection_id,total_term_parsed,time_served_parsed,good_time_received_parsed,good_time_revoked_parsed,admit_date,min_release_date,parole_consideration_date
0,0,05/03/1977,0Y 0M 0D,10Y 4M 5D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z350,Z350-0,2023-05-05 15:53:15.991829,COLLECTION_6,0.0,10.347023,0.0,0.0,1977-05-03,NaT,NaT
1,0,09/07/1977,0Y 0M 0D,32Y 8M 18D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z355,Z355-0,2023-05-05 15:53:15.991829,COLLECTION_6,0.0,32.715948,0.0,0.0,1977-09-07,NaT,NaT
2,0,09/09/1977,0Y 0M 0D,22Y 0M 28D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z357,Z357-0,2023-05-05 15:53:15.991829,COLLECTION_6,0.0,22.07666,0.0,0.0,1977-09-09,NaT,NaT
3,0,11/01/1977,0Y 0M 0D,17Y 2M 8D,0,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z360,Z360-0,2023-05-05 15:53:15.991829,COLLECTION_6,0.0,17.188569,0.0,0.0,1977-11-01,NaT,NaT
4,0,11/01/1977,0Y 0M 0D,27Y 6M 21D,1947,0Y 0M 0D,0Y 0M 0D,,,NO HEARING,True,Z361,Z361-0,2023-05-05 15:53:15.991829,COLLECTION_6,0.0,27.557495,0.0,0.0,1977-11-01,NaT,NaT


In [532]:
incarcerations.dtypes

SUF                                  object
Admit_Date                           object
Total_Term                           object
Time_Served                          object
Jail_Credit                          object
Good_Time_Received                   object
Good_Time_Revoked                    object
Min_Release_Date                     object
Parole_Consideration_Date            object
Parole_Status                        object
most_recent                          object
AIS                                  object
incarceration_id                     object
run_id                       datetime64[ns]
collection_id                        object
total_term_parsed                   float64
time_served_parsed                  float64
good_time_received_parsed           float64
good_time_revoked_parsed            float64
admit_date                   datetime64[ns]
min_release_date             datetime64[ns]
parole_consideration_date    datetime64[ns]
dtype: object