# Examining downloaded Criminal Justice System statistics datasets

In [None]:
import pandas as pd

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import src.utilities as utils
from src.data.processing import filter_sentence_type

## `load_outcomes_data()`

Loading the concatenated outcomes DataFrame with no further processing

In [None]:
# Load the outcomes data through the project helper; no renaming, cleaning or
# filtering is applied in this cell.
df = filter_sentence_type.load_outcomes_data()

In [None]:
# Display the frame exactly as loaded, before any renaming or cleaning.
df

In [None]:
# Distinct values of the raw 'Year' column, shown in ascending order.
distinct_years = df['Year'].unique()
sorted(distinct_years)

## `rename_and_reorder_columns()`

In [None]:
# Show the result of renaming/reordering the columns of the loaded frame.
# NOTE(review): the return value is not assigned, yet the following cells read
# df['outcome'] and df['sentence_len'] while an earlier cell read df['Year'].
# That only works if this helper modifies `df` in place - confirm; otherwise
# the result needs to be bound to a name and used downstream.
filter_sentence_type.rename_and_reorder_columns(df)

In [None]:
# Distinct values in the 'outcome' column, as a plain list.
list(df['outcome'].unique())

In [None]:
# Distinct values in the 'sentence_len' column, as a plain list.
list(df['sentence_len'].unique())

## `apply_multiple_regex_replacements()`

Using a dictionary to apply all of the regex replacements to:
* Remove prefixes
* Change wording in outcome column
* Change wording in sentence_len column

In [None]:
# Mapping of column name -> list of (regex pattern, replacement) pairs.
# The same prefix rule (two digits, a colon and a space -> empty string) is
# the first entry for every column, so it is defined once and reused.
strip_code_prefix = (r"\d\d: ", "")

regex_replacements = {
    column: [strip_code_prefix]
    for column in ('sex', 'age_group', 'offence', 'outcome')
}

# 'sentence_len' gets the prefix rule plus three rewording rules.
# Presumably the pairs are applied in list order - verify against
# apply_multiple_regex_replacements().
regex_replacements['sentence_len'] = [
    strip_code_prefix,
    (r"Custody - ", ""),
    (r"Over", "More than"),
    (r"Life$", "Life sentence"),
]

In [None]:
# Run every configured regex replacement over the loaded frame and keep the
# result under its own name so later cells can refer to it.
df_regex = filter_sentence_type.apply_multiple_regex_replacements(
    df, regex_replacements
)
df_regex

In [None]:
# Distinct 'sentence_len' values after the regex replacements, as a plain list.
list(df_regex['sentence_len'].unique())

## `filter_dataframe()`

Testing the application of filters to the dataframe

### Are my filters being read in correctly?

In [None]:
# Read the project configuration and extract the filter specification stored
# under 'outcomes_by_offence_filter', falling back to an empty dict when the
# key is absent. `utils` is imported in the notebook's import cell at the top,
# so all dependencies are declared in one place.
config = utils.read_config()
outcomes_by_offence_filter = config.get('outcomes_by_offence_filter', {})
outcomes_by_offence_filter

In [None]:
# Take the 'include' section of the filter spec (empty dict if the key is
# missing) and display it for inspection.
include_filters = outcomes_by_offence_filter.get('include', {})
include_filters

In [None]:
# Take the 'exclude' section of the filter spec (empty dict if the key is
# missing) and display it for inspection.
exclude_filters = outcomes_by_offence_filter.get('exclude', {})
exclude_filters

In [None]:
# Apply the whole filter spec (the dict holding the 'include' and 'exclude'
# sections) to the regex-cleaned frame and display the result.
filter_sentence_type.filter_dataframe(df_regex, outcomes_by_offence_filter)

In [None]:
# Run the processing step on the loaded frame using the configuration read above.
# NOTE(review): `config` is the object returned by utils.read_config(), passed
# under a keyword named `config_file` - confirm process_data() expects a loaded
# config rather than a path. Also confirm it is safe to pass `df` here if the
# earlier rename cell already altered its columns.
filter_sentence_type.process_data(df, config_file=config)

## `load_and_process_data()`

Now testing the pipeline function that brings all these steps together

In [3]:
# End-to-end run with default arguments. The captured log shows it loading the
# two raw CSV files, processing the data and then applying the include/exclude
# filters; the load and processing stages account for most of the roughly
# one-minute run time.
filter_sentence_type.load_and_process_data()

2025-06-05 16:08:34,404 - INFO - Loading outcomes by offence data...
2025-06-05 16:08:58,912 - INFO - Loaded data from data/raw/sentence_outcomes_2017_2024.csv
2025-06-05 16:09:20,151 - INFO - Loaded data from data/raw/sentence_outcomes_2010_2016.csv
2025-06-05 16:09:23,218 - INFO - Processing data...
2025-06-05 16:09:34,378 - INFO - Applying filters...
2025-06-05 16:09:34,379 - INFO - Include filter on column 'sex' with values: ['Female']
2025-06-05 16:09:34,624 - INFO - Include filter on column 'outcome' with values: ['Immediate Custody', 'Community Sentence', 'Suspended Sentence']
2025-06-05 16:09:34,722 - INFO - Include filter on column 'age_group' with values: ['Adults', 'Young adults']
2025-06-05 16:09:34,775 - INFO - Exclude filter on column 'pfa' with values: ['Not known']
2025-06-05 16:09:34,863 - INFO - Data filtered.
2025-06-05 16:09:35,040 - INFO - Data loaded and processed successfully.


Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
4268271,2010,Avon and Somerset,Female,Adults,Drug offences,Immediate Custody,More than 12 months and up to and including 18...,1
4269203,2010,Avon and Somerset,Female,Adults,Fraud offences,Community Sentence,,2
4269825,2010,Avon and Somerset,Female,Young adults,Possession of weapons,Community Sentence,,1
4270091,2010,Avon and Somerset,Female,Adults,Fraud offences,Suspended Sentence,,1
4272402,2010,Avon and Somerset,Female,Adults,Summary non-motoring,Community Sentence,,1
...,...,...,...,...,...,...,...,...
4254435,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,1
4262593,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,2
4263581,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,1
4265810,2024,Wiltshire,Female,Adults,Theft offences,Suspended Sentence,,1
