In [1]:
# Execute the shared research utilities script inside this notebook's namespace.
# `adjust_plot_scale` is not defined anywhere in this notebook, so it is
# presumably provided by this script.
# NOTE(review): `pd`, `np` and `datetime` are used in later cells without a
# visible import; presumably they also come from this %run -- confirm, and
# consider importing them explicitly so the notebook survives "Restart & Run All".
%run ../../../../../../recidiviz-research/utils/research_utils.py
adjust_plot_scale(0.4)
from IPython.display import display, HTML
from functools import partial

# Inject CSS so that elements with the `.container` class span the full width
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import sys

sys.path.insert(0, os.path.relpath("../../../../../"))
from recidiviz.calculator.modeling.population_projection.super_simulation.time_converter import (
    TimeConverter,
)
from recidiviz.calculator.modeling.population_projection.utils.spark_bq_utils import (
    upload_spark_model_inputs,
)
from recidiviz.calculator.modeling.population_projection.utils.spark_preprocessing_utils import (
    transitions_interpolation,
)
from recidiviz.utils.yaml_dict import YAMLDict

In [3]:
# Input locations, relative to this notebook
FED_DIRECTORY_PATH = "../state/FED/"
SENTENCING_DATA_PATH = f"{FED_DIRECTORY_PATH}sentencing_data/"

# Data dictionary:
# https://www.ussc.gov/sites/default/files/pdf/research-and-publications/datafiles/USSC_Public_Release_Codebook_FY99_FY20.pdf

# Statutes on the sentence (NWSTAT1 .. NWSTAT5)
statute_columns = [f"NWSTAT{n}" for n in range(1, 6)]
# Drug types involved in the case (only DRUGTYP1 .. DRUGTYP5 are loaded here)
drug_type_columns = [f"DRUGTYP{n}" for n in range(1, 6)]

# Columns read from every fiscal-year file, regardless of format
common_columns = [
    "TOTPRISN",  # total prison sentence months
    "PROBATN",  # total probation sentence months
    "SENTMON",  # month sentenced
    "SENTYR",  # year sentenced
    "COMBDRG2",  # primary drug type on the sentence
    # Type of sentence imposed (prison, prison + fines, probation, probation + fines, just fines)
    "SENTIMP",
    *statute_columns,
    *drug_type_columns,
    "DISTRICT",  # sentencing district
    "DRUGMIN",  # mandatory minimum
    "WGT1",  # gram amount for the 1st drug type on the sentence
    "NEWRACE",  # race and ethnicity
]

# The offense type lives in a different column depending on the file format
new_data_columns = [*common_columns, "OFFGUIDE"]
old_data_columns = [*common_columns, "OFFTYPSB"]

# Supply some of the column types to silence warnings & process the large CSVs faster
column_dtypes = {column: str for column in statute_columns}

# FY18-FY21 files (newest first) carry the OFFGUIDE offense column
new_data_files = [f"opafy{year}nid.csv" for year in (21, 20, 19, 18)]
# FY10-FY17 use the old offense type format
old_data_files = [f"opafy{year:02}nid.csv" for year in range(10, 18)]

In [9]:
def get_eligible_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """Filter `df` down to the sentences eligible for this policy.

    A row is kept only when all of the following hold:
    * "TOTPRISN" is neither 9992 (less than 1 day of prison) nor 9997
      (prison ordered, no term specified)
    * "PROBATN" is not 997 (probation ordered, no term specified)
    * "SENTIMP" is one of 1, 2, 3, 4 (sentenced to prison or probation)

    The original index and columns of the surviving rows are preserved.
    """
    prison_term_usable = ~df["TOTPRISN"].isin([9992, 9997])
    probation_term_usable = df["PROBATN"] != 997
    prison_or_probation = df["SENTIMP"].isin([1, 2, 3, 4])
    return df.loc[prison_term_usable & probation_term_usable & prison_or_probation]


# Read each new-format (OFFGUIDE) fiscal-year file, keep only the eligible
# sentences, and stack the yearly frames into one DataFrame.
new_data_list = [
    get_eligible_sentences(
        pd.read_csv(
            SENTENCING_DATA_PATH + file,
            usecols=new_data_columns,
            dtype=column_dtypes,
        )
    )
    for file in new_data_files
]
new_data = pd.concat(new_data_list)

# Read each old-format (OFFTYPSB) fiscal-year file, keep only the eligible
# sentences, and stack the yearly frames into one DataFrame.
old_data_list = []
for file in old_data_files:
    # Handle FY10 where the header row is lower case
    has_lowercase_header = "fy10" in file
    if has_lowercase_header:
        file_columns = [col.lower() for col in old_data_columns]
        file_dtypes = {key.lower(): value for key, value in column_dtypes.items()}
    else:
        file_columns, file_dtypes = old_data_columns, column_dtypes

    temp = pd.read_csv(
        SENTENCING_DATA_PATH + file, usecols=file_columns, dtype=file_dtypes
    )
    if has_lowercase_header:
        # Normalise the header so every year shares the same column names
        temp.columns = temp.columns.str.upper()
    old_data_list.append(get_eligible_sentences(temp))

old_data = pd.concat(old_data_list)

# Offense code to offense type mapping for newer data.
# The OFFGUIDE codes are listed per crime type and then inverted into the
# code -> type lookup used by `.map`.
offguide_codes_by_type = {
    "Administration of Justice": [1],
    "Arson": [3],
    "Assault": [4],
    "Extortion/Racketeering/Fraud": [5, 12, 15, 16, 29],
    "Robbery": [6, 26],
    "Sex Offense": [7, 27],
    "Commercialized Vice": [8],
    "Drug Possession": [9],
    "Drug Trafficking": [10],
    "Firearms": [13],
    "Immigration": [17],
    "Kidnapping": [19],
    "Manslaughter": [20],
    "Money Laundering": [21],
    "Murder": [22],
    "Sexual Abuse": [24],
    "Prison Offenses": [25],
    "Other": [30],
}
offguide_map = {
    code: crime_type
    for crime_type, codes in offguide_codes_by_type.items()
    for code in codes
}
# Any OFFGUIDE code without an entry above is labelled "Other"
new_crime_types = new_data["OFFGUIDE"].map(offguide_map)
new_data["primary_crime_type"] = new_crime_types.fillna("Other")

# Offense code to offense type mapping for older data.
# The OFFTYPSB codes are listed per crime type and then inverted into the
# code -> type lookup used by `.map`.
offtype_codes_by_type = {
    "Murder": [1],
    "Manslaughter": [2],
    "Kidnapping": [3],
    "Sex Offense": [4, 28],
    "Assault": [5],
    "Robbery": [6, 15, 16, 17],
    "Arson": [9],
    "Drug Trafficking": [10],
    "Drug Communication Facilities": [11],
    "Drug Possession": [12],
    "Firearms": [13],
    "Extortion/Racketeering/Fraud": [18, 19, 20, 21, 22, 24, 25],
    "Money Laundering": [23],
    "Civil Rights Offenses": [26],
    "Immigration": [27],
    "Prison Offenses": [29],
    "Administration of Justice": [30],
    "Traffic Violations and Other Offenses": [35],
}
offtype_map = {
    code: crime_type
    for crime_type, codes in offtype_codes_by_type.items()
    for code in codes
}
# Any OFFTYPSB code without an entry above is labelled "Other"
old_crime_types = old_data["OFFTYPSB"].map(offtype_map)
old_data["primary_crime_type"] = old_crime_types.fillna("Other")

In [10]:
# Audit: which OFFGUIDE codes ended up in the "Other" crime type bucket?
# NOTE(review): the saved output lists code 6 even though `offguide_map`
# maps 6 to "Robbery", so the output looks stale -- re-run to confirm.
is_other_new = new_data["primary_crime_type"] == "Other"
new_data.loc[is_other_new, "OFFGUIDE"].value_counts(dropna=False)

30    2684
28     877
23     750
11     626
18     253
6      206
14     164
2       89
Name: OFFGUIDE, dtype: int64

In [11]:
# Audit: which OFFTYPSB codes ended up in the "Other" crime type bucket?
# NOTE(review): the saved output lists code 25 even though `offtype_map`
# maps 25 to "Extortion/Racketeering/Fraud", so the output looks stale --
# re-run to confirm.
is_other_old = old_data["primary_crime_type"] == "Other"
old_data.loc[is_other_old, "OFFTYPSB"].value_counts(dropna=False)

31    1261
32     792
34     691
25     586
33     147
Name: OFFTYPSB, dtype: int64

In [15]:
# NOTE(review): this re-defines `get_eligible_sentences`, which is already
# defined with the same body in an earlier cell; the later definition silently
# shadows the earlier one.
def get_eligible_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """Filter `df` down to the sentences eligible for this policy.

    A row is kept only when all of the following hold:
    * "TOTPRISN" is neither 9992 (less than 1 day of prison) nor 9997
      (prison ordered, no term specified)
    * "PROBATN" is not 997 (probation ordered, no term specified)
    * "SENTIMP" is one of 1, 2, 3, 4 (sentenced to prison or probation)

    The original index and columns of the surviving rows are preserved.
    """
    prison_term_usable = ~df["TOTPRISN"].isin([9992, 9997])
    probation_term_usable = df["PROBATN"] != 997
    prison_or_probation = df["SENTIMP"].isin([1, 2, 3, 4])
    return df.loc[prison_term_usable & probation_term_usable & prison_or_probation]


# Read each new-format (OFFGUIDE) fiscal-year file, keep only the eligible
# sentences, and stack the yearly frames into one DataFrame.
# NOTE(review): this repeats the load already performed in an earlier cell.
new_data_list = [
    get_eligible_sentences(
        pd.read_csv(
            SENTENCING_DATA_PATH + file,
            usecols=new_data_columns,
            dtype=column_dtypes,
        )
    )
    for file in new_data_files
]
new_data = pd.concat(new_data_list)

# Read each old-format (OFFTYPSB) fiscal-year file, keep only the eligible
# sentences, and stack the yearly frames into one DataFrame.
# NOTE(review): this repeats the load already performed in an earlier cell.
old_data_list = []
for file in old_data_files:
    # Handle FY10 where the header row is lower case
    has_lowercase_header = "fy10" in file
    if has_lowercase_header:
        file_columns = [col.lower() for col in old_data_columns]
        file_dtypes = {key.lower(): value for key, value in column_dtypes.items()}
    else:
        file_columns, file_dtypes = old_data_columns, column_dtypes

    temp = pd.read_csv(
        SENTENCING_DATA_PATH + file, usecols=file_columns, dtype=file_dtypes
    )
    if has_lowercase_header:
        # Normalise the header so every year shares the same column names
        temp.columns = temp.columns.str.upper()
    old_data_list.append(get_eligible_sentences(temp))

old_data = pd.concat(old_data_list)

# Offense code to offense type mapping for newer data.
# The OFFGUIDE codes are listed per crime type and then inverted into the
# code -> type lookup used by `.map`.
offguide_codes_by_type = {
    "Administration of Justice": [1],
    "Arson": [3],
    "Assault": [4],
    "Extortion/Racketeering/Fraud": [5, 12, 15, 16, 21, 29],
    "Robbery": [6, 26],
    "Sex Offense": [7, 8, 24, 27],
    "Drug Possession": [9],
    "Drug Trafficking": [10],
    "Firearms": [13],
    "Immigration": [17],
    "Kidnapping": [19],
    "Manslaughter": [20],
    "Murder": [22],
    "Prison Offenses": [25],
    "Other": [30],
}
offguide_map = {
    code: crime_type
    for crime_type, codes in offguide_codes_by_type.items()
    for code in codes
}
# Any OFFGUIDE code without an entry above is labelled "Other"
new_crime_types = new_data["OFFGUIDE"].map(offguide_map)
new_data["primary_crime_type"] = new_crime_types.fillna("Other")

# Offense code to offense type mapping for older data.
# The OFFTYPSB codes are listed per crime type and then inverted into the
# code -> type lookup used by `.map`.
offtype_codes_by_type = {
    "Murder": [1],
    "Manslaughter": [2],
    "Kidnapping": [3],
    "Sex Offense": [4, 28],
    "Assault": [5],
    "Robbery": [6, 15, 16, 17],
    "Arson": [9],
    "Drug Trafficking": [10],
    "Drug Communication Facilities": [11],
    "Drug Possession": [12],
    "Firearms": [13],
    "Extortion/Racketeering/Fraud": [18, 19, 20, 21, 22, 23, 24, 25],
    "Civil Rights Offenses": [26],
    "Immigration": [27],
    "Prison Offenses": [29],
    "Administration of Justice": [30],
    "Traffic Violations and Other Offenses": [35],
}
offtype_map = {
    code: crime_type
    for crime_type, codes in offtype_codes_by_type.items()
    for code in codes
}
# Any OFFTYPSB code without an entry above is labelled "Other"
old_crime_types = old_data["OFFTYPSB"].map(offtype_map)
old_data["primary_crime_type"] = old_crime_types.fillna("Other")

# Columns shared by both datasets once `primary_crime_type` has been derived
concat_data_columns = [
    "COMBDRG2",
    "TOTPRISN",
    "PROBATN",
    "primary_crime_type",
    "SENTMON",
    "SENTYR",
    "SENTIMP",
    *[f"NWSTAT{n}" for n in range(1, 6)],
    *[f"DRUGTYP{n}" for n in range(1, 6)],
    "NEWRACE",
    "DISTRICT",
    "DRUGMIN",
    "WGT1",
]
# Friendlier names for the sentence length and sentencing date columns
readable_column_names = {
    "TOTPRISN": "total_prison_sentence_months",
    "PROBATN": "total_probation_sentence_months",
    "SENTMON": "month",
    "SENTYR": "year",
}
# Stack the new data on top of the old data with a fresh 0..n-1 index,
# then apply the readable column names.
sentence_data = pd.concat(
    [frame[concat_data_columns] for frame in (new_data, old_data)],
    ignore_index=True,
).rename(columns=readable_column_names)
# Collapse the primary drug codes into "Marijuana" vs "Other Drug".
# Codes folded into "Other Drug": 1 Cocaine, 2 Crack, 3 Heroin,
# 6 Methamphetamine, 7 Fentanyl, 77 Other.
COMBDRG2_map = {code: "Other Drug" for code in (1, 2, 3, 6, 7, 77)}
COMBDRG2_map[4] = "Marijuana"

# Sentences whose COMBDRG2 has no entry in the map are labelled "None"
primary_drug_labels = sentence_data["COMBDRG2"].map(COMBDRG2_map)
sentence_data["primary_drug_type"] = primary_drug_labels.fillna("None")

# Flag every sentence where marijuana (code 4) appears either as the primary
# drug or in any of the five drug type columns.
marijuana_check_columns = ["COMBDRG2"] + [f"DRUGTYP{n}" for n in range(1, 6)]
sentence_data["any_marijuana_involved"] = (
    sentence_data[marijuana_check_columns].eq(4).any(axis=1)
)

# DISTRICT codes grouped by the state/territory they are reported under;
# several states have more than one district code.
district_codes_by_state = {
    "Maine": [0],
    "Massachusetts": [1],
    "New Hampshire": [2],
    "Rhode Island": [3],
    "Puerto Rico": [4],
    "Connecticut": [5],
    "New York": [6, 7, 8, 9],
    "Vermont": [10],
    "Delaware": [11],
    "New Jersey": [12],
    "Pennsylvania": [13, 14, 15],
    "Maryland": [16],
    "North Carolina": [17, 18, 19],
    "South Carolina": [20],
    "Virginia": [22, 23],
    "West Virginia": [24, 25],
    "Alabama": [26, 27, 28],
    "Florida": [29, 30, 31],
    "Georgia": [32, 33, 34],
    "Louisiana": [35, 36, 96],
    "Mississippi": [37, 38],
    "Texas": [39, 40, 41, 42],
    "Kentucky": [43, 44],
    "Michigan": [45, 46],
    "Ohio": [47, 48],
    "Tennessee": [49, 50, 51],
    "Illinois": [52, 53, 54],
    "Indiana": [55, 56],
    "Wisconsin": [57, 58],
    "Arkansas": [60, 61],
    "Iowa": [62, 63],
    "Minnesota": [64],
    "Missouri": [65, 66],
    "Nebraska": [67],
    "North Dakota": [68],
    "South Dakota": [69],
    "Arizona": [70],
    "California": [71, 72, 73, 74],
    "Hawaii": [75],
    "Idaho": [76],
    "Montana": [77],
    "Nevada": [78],
    "Oregon": [79],
    "Washington": [80, 81],
    "Colorado": [82],
    "Kansas": [83],
    "New Mexico": [84],
    "Oklahoma": [85, 86, 87],
    "Utah": [88],
    "Wyoming": [89],
    "Dist of Columbia": [90],
    "Virgin Islands": [91],
    "Guam": [93],
    "N Mariana Islands": [94],
    "Alaska": [95],
}
# Invert into the code -> state lookup used by `.map`
district_map = {
    code: state
    for state, codes in district_codes_by_state.items()
    for code in codes
}
# Any DISTRICT code without an entry above is labelled "Other"
district_labels = sentence_data["DISTRICT"].map(district_map)
sentence_data["sentencing_district"] = district_labels.fillna("Other")

# NEWRACE code -> label; any code not listed here becomes "Unknown"
race_ethnicity_type_map = dict(
    zip((1, 2, 3, 6), ("White", "Black", "Hispanic", "Other"))
)
race_labels = sentence_data["NEWRACE"].map(race_ethnicity_type_map)
sentence_data["race_or_ethnicity"] = race_labels.fillna("Unknown")

# SENTIMP code -> coarse sentence type: 1 and 2 are prison, 3 and 4 are probation
sentence_type_map = {code: "Prison" for code in (1, 2)}
sentence_type_map.update({code: "Probation" for code in (3, 4)})
sentence_data["sentence_type"] = sentence_data["SENTIMP"].map(sentence_type_map)

# Create a column `sentence_start_month` holding the first day of the
# sentencing month, built from the sentencing year & month data.
sentence_start = pd.to_datetime(sentence_data[["year", "month"]].assign(day=1))
sentence_data["sentence_start_month"] = sentence_start.dt.date

# The raw columns below have all been converted into derived columns, so drop them
consumed_columns = ["month", "year", "DISTRICT", "NEWRACE", "COMBDRG2", "SENTIMP"]
sentence_data = sentence_data.drop(columns=consumed_columns)
# Estimate the full-term release date for each sentence by adding one
# month-end offset per (whole) sentence month to the sentence start.
# In the saved output, a 28 month sentence starting 2020-10-01 is estimated
# to end on 2023-01-31, and a 0 month term ends on 2020-10-31.

# Cap life sentences at 100 years to avoid date out of bounds errors
capped_prison_months = sentence_data["total_prison_sentence_months"].clip(
    upper=100 * 12
)
prison_offsets = np.floor(capped_prison_months).apply(pd.offsets.MonthEnd)
sentence_data["estimated_prison_release_date"] = (
    sentence_data["sentence_start_month"] + prison_offsets
).dt.date

# Missing probation terms are treated as 0 months
probation_months = sentence_data["total_probation_sentence_months"].fillna(0)
probation_offsets = np.floor(probation_months).apply(pd.offsets.MonthEnd)
sentence_data["estimated_probation_release_date"] = (
    sentence_data["sentence_start_month"] + probation_offsets
).dt.date

print(len(sentence_data))
sentence_data.head()

876687


Unnamed: 0,total_prison_sentence_months,total_probation_sentence_months,primary_crime_type,NWSTAT1,NWSTAT2,NWSTAT3,NWSTAT4,NWSTAT5,DRUGTYP1,DRUGTYP2,DRUGTYP3,DRUGTYP4,DRUGTYP5,DRUGMIN,WGT1,primary_drug_type,any_marijuana_involved,sentencing_district,race_or_ethnicity,sentence_type,sentence_start_month,estimated_prison_release_date,estimated_probation_release_date
0,28.0,0.0,Extortion/Racketeering/Fraud,181344,181028A,182,,,,,,,,0,,,False,Florida,Hispanic,Prison,2020-10-01,2023-01-31,2020-10-31
1,15.0,0.0,Administration of Justice,181001A2,181001A3,,,,,,,,,0,,,False,Texas,White,Prison,2020-10-01,2021-12-31,2020-10-31
2,18.0,0.0,Firearms,18922G1,18924A2,,,,,,,,,0,,,False,Tennessee,White,Prison,2020-10-01,2022-03-31,2020-10-31
3,0.0,0.0,Immigration,81324A1AVI,81324A1AII,81324A1BII,,,,,,,,0,,,False,New Mexico,Hispanic,Prison,2020-10-01,2020-10-31,2020-10-31
4,120.0,0.0,Sex Offense,181470,,,,,,,,,,0,,,False,Texas,Hispanic,Prison,2020-10-01,2030-09-30,2020-10-31


In [16]:
# Fraction applied to the imposed prison term before estimating release.
# NOTE(review): presumably the average share of a sentence actually served;
# the source of 0.883 is not shown in this notebook -- TODO cite it.
AVG_PCT_SERVED = 0.883

# Re-estimate the prison release date from the scaled sentence length
# (still capped at 100 years), overwriting the full-term estimate.
capped_prison_months = sentence_data["total_prison_sentence_months"].clip(
    upper=100 * 12
)
served_offsets = np.floor(capped_prison_months * AVG_PCT_SERVED).apply(
    pd.offsets.MonthEnd
)
sentence_data["estimated_prison_release_date"] = (
    sentence_data["sentence_start_month"] + served_offsets
).dt.date

In [17]:
# A sentence counts as active when its estimated prison OR probation release
# date falls after this cutoff.
active_cutoff = datetime.date(2022, 7, 1)
still_in_prison = sentence_data["estimated_prison_release_date"] > active_cutoff
still_on_probation = (
    sentence_data["estimated_probation_release_date"] > active_cutoff
)
active_sentences = sentence_data[still_in_prison | still_on_probation]

# Count active sentences per crime type (rows), broken out by sentence type,
# primary drug type and marijuana involvement (columns). Combinations that
# never occur are shown as 0.
breakdown_columns = ["sentence_type", "primary_drug_type", "any_marijuana_involved"]
(
    active_sentences.groupby(
        [
            "sentence_type",
            "primary_crime_type",
            "primary_drug_type",
            "any_marijuana_involved",
        ]
    )["sentence_start_month"]
    .count()
    .unstack(breakdown_columns)
    .fillna(0)
)

sentence_type,Prison,Prison,Prison,Prison,Probation,Probation,Probation,Probation
primary_drug_type,Marijuana,None,Other Drug,Other Drug,Marijuana,None,Other Drug,Other Drug
any_marijuana_involved,True,False,False,True,True,False,False,True
primary_crime_type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Administration of Justice,4.0,263.0,40.0,5.0,11.0,210.0,71.0,4.0
Arson,1.0,234.0,2.0,2.0,0.0,3.0,0.0,0.0
Assault,32.0,1423.0,73.0,28.0,0.0,120.0,0.0,0.0
Civil Rights Offenses,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0
Drug Communication Facilities,1.0,0.0,35.0,4.0,1.0,0.0,1.0,0.0
Drug Possession,3.0,0.0,3.0,0.0,16.0,0.0,76.0,4.0
Drug Trafficking,2720.0,79.0,68990.0,6001.0,354.0,0.0,1285.0,67.0
Extortion/Racketeering/Fraud,204.0,8635.0,1354.0,227.0,20.0,3781.0,66.0,2.0
Firearms,615.0,24316.0,3487.0,938.0,7.0,1104.0,11.0,3.0
Immigration,9.0,5737.0,10.0,2.0,0.0,2742.0,1.0,0.0


In [85]:
# Restrict the active sentences to those sentenced in a Virginia district
# NOTE(review): the saved output shows "Extortion/Racketeering" and "Fraud"
# as separate crime types, which the crime type maps in this notebook do not
# produce, so the output looks stale -- re-run to confirm.
sentenced_in_virginia = active_sentences["sentencing_district"] == "Virginia"
active_va_sentences = active_sentences[sentenced_in_virginia]

# Count per crime type (rows), broken out by sentence type, primary drug type
# and marijuana involvement (columns). Combinations that never occur are
# shown as 0.
breakdown_columns = ["sentence_type", "primary_drug_type", "any_marijuana_involved"]
(
    active_va_sentences.groupby(
        [
            "sentence_type",
            "primary_crime_type",
            "primary_drug_type",
            "any_marijuana_involved",
        ]
    )["sentence_start_month"]
    .count()
    .unstack(breakdown_columns)
    .fillna(0)
)

sentence_type,Prison,Prison,Prison,Prison,Probation,Probation,Probation,Probation
primary_drug_type,None,Other Drug,Marijuana,Other Drug,None,Other Drug,Marijuana,Other Drug
any_marijuana_involved,False,False,True,True,False,False,True,True
primary_crime_type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Administration of Justice,8.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0
Assault,19.0,1.0,3.0,2.0,1.0,0.0,0.0,0.0
Commercialized Vice,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Drug Possession,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0
Drug Trafficking,2.0,1938.0,50.0,317.0,0.0,12.0,4.0,1.0
Extortion/Racketeering,67.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0
Firearms,586.0,158.0,21.0,47.0,27.0,0.0,0.0,0.0
Fraud,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Immigration,15.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
Kidnapping,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
