# Examining downloaded Criminal Justice System statistics datasets

In [1]:
import pandas as pd

In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import src.utilities as utils
from src.data.processing import filter_sentence_type

## `load_outcomes_data()`

Loading the concatenated outcomes DataFrame with no further processing

In [None]:
# Load the concatenated outcomes data into `df`, before any renaming or filtering
df = filter_sentence_type.load_outcomes_data()

In [None]:
# Display the loaded DataFrame
df

In [None]:
# Distinct values of the 'Year' column, in ascending order
distinct_years = df['Year'].unique()
sorted(distinct_years)

## `rename_and_reorder_columns()`

In [None]:
# NOTE(review): the return value is only displayed, never assigned, yet the cells
# below read df['outcome'] and df['sentence_len']. That only works if this call
# renames `df` in place - confirm; otherwise the result needs to be assigned.
filter_sentence_type.rename_and_reorder_columns(df)

In [None]:
# Distinct values in the 'outcome' column, as a plain list
list(df['outcome'].unique())

In [None]:
# Distinct values in the 'sentence_len' column, as a plain list
list(df['sentence_len'].unique())

## `apply_multiple_regex_replacements()`

Using a dictionary to apply all of the regex replacements to:
* Remove prefixes
* Change wording in outcome column
* Change wording in sentence_len column

In [None]:
# Regex (pattern, replacement) pairs to apply, keyed by column name.
# Every column shares the rule that removes a two-digit code such as "01: ".
strip_numeric_prefix = (r"\d\d: ", "")

regex_replacements = {
    column: [strip_numeric_prefix]
    for column in ('sex', 'age_group', 'offence', 'outcome')
}

# sentence_len additionally has its wording rewritten
regex_replacements['sentence_len'] = [
    strip_numeric_prefix,
    (r"Custody - ", ""),
    (r"Over", "More than"),
    (r"Life$", "Life sentence"),
]

In [None]:
# Apply every column's replacement rules to `df`, keep the result, then display it
df_regex = filter_sentence_type.apply_multiple_regex_replacements(
    df, regex_replacements
)
df_regex

In [None]:
# Distinct 'sentence_len' values after the regex replacements, as a plain list
list(df_regex['sentence_len'].unique())

## `filter_dataframe()`

Testing the application of filters to the dataframe

### Are my filters being read in correctly?

In [None]:
# Load the project config and pull out the filter spec for the outcomes data;
# falls back to an empty dict when the key is absent
config = utils.read_config()
outcomes_by_offence_filter = config.get('outcomes_by_offence_filter', {})
outcomes_by_offence_filter

In [None]:
# The 'include' section of the filter spec (empty dict if not configured)
include_filters = outcomes_by_offence_filter.get('include', {})
include_filters

In [None]:
# The 'exclude' section of the filter spec (empty dict if not configured)
exclude_filters = outcomes_by_offence_filter.get('exclude', {})
exclude_filters

In [None]:
# Apply the full filter spec (include and exclude sections) to the regex-cleaned frame
filter_sentence_type.filter_dataframe(df_regex, outcomes_by_offence_filter)

In [None]:
# Run process_data on `df` with the config loaded above.
# NOTE(review): `config` is the object returned by utils.read_config() (it is
# queried with .get earlier), yet it is passed as `config_file` - confirm that
# process_data expects the loaded config rather than a file path.
filter_sentence_type.process_data(df, config_file=config)

## `load_and_process_data()`

Now testing the pipeline function that brings all these steps together

In [3]:
# End-to-end run: loading, processing and filtering in one call (each stage is
# logged); the returned DataFrame is displayed here, not stored
filter_sentence_type.load_and_process_data()

2025-06-05 16:08:34,404 - INFO - Loading outcomes by offence data...
2025-06-05 16:08:58,912 - INFO - Loaded data from data/raw/sentence_outcomes_2017_2024.csv
2025-06-05 16:09:20,151 - INFO - Loaded data from data/raw/sentence_outcomes_2010_2016.csv
2025-06-05 16:09:23,218 - INFO - Processing data...
2025-06-05 16:09:34,378 - INFO - Applying filters...
2025-06-05 16:09:34,379 - INFO - Include filter on column 'sex' with values: ['Female']
2025-06-05 16:09:34,624 - INFO - Include filter on column 'outcome' with values: ['Immediate Custody', 'Community Sentence', 'Suspended Sentence']
2025-06-05 16:09:34,722 - INFO - Include filter on column 'age_group' with values: ['Adults', 'Young adults']
2025-06-05 16:09:34,775 - INFO - Exclude filter on column 'pfa' with values: ['Not known']
2025-06-05 16:09:34,863 - INFO - Data filtered.
2025-06-05 16:09:35,040 - INFO - Data loaded and processed successfully.


Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
4268271,2010,Avon and Somerset,Female,Adults,Drug offences,Immediate Custody,More than 12 months and up to and including 18...,1
4269203,2010,Avon and Somerset,Female,Adults,Fraud offences,Community Sentence,,2
4269825,2010,Avon and Somerset,Female,Young adults,Possession of weapons,Community Sentence,,1
4270091,2010,Avon and Somerset,Female,Adults,Fraud offences,Suspended Sentence,,1
4272402,2010,Avon and Somerset,Female,Adults,Summary non-motoring,Community Sentence,,1
...,...,...,...,...,...,...,...,...
4254435,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,1
4262593,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,2
4263581,2024,Wiltshire,Female,Adults,Violence against the person,Community Sentence,,1
4265810,2024,Wiltshire,Female,Adults,Theft offences,Suspended Sentence,,1


# Stepping back

## Examining how to allow Assaults on an Emergency Worker offences to be isolated

In [15]:
# Reload the unprocessed outcomes data; this rebinds `df` from the earlier sections
df = filter_sentence_type.load_outcomes_data()

2025-07-30 14:27:20,098 - INFO - Loading outcomes by offence data...
2025-07-30 14:27:55,960 - INFO - Loaded data from data/raw/sentence_outcomes_2017_2024.csv
2025-07-30 14:28:40,304 - INFO - Loaded data from data/raw/sentence_outcomes_2010_2016.csv


In [16]:
# List the columns that load_outcomes_data() returns
df.columns

Index(['Year', 'Sex', 'Age Group', 'Police Force Area', 'Offence Group',
       'Offence', 'Sentence Outcome', 'Custodial Sentence Length',
       'Sentenced'],
      dtype='object')

In [8]:
# Compare the offence classification columns side by side.
# 'Offence Type' is selected only when present: the df.columns listing above shows
# that load_outcomes_data() does not currently return it, and selecting a missing
# label would raise KeyError when the notebook is re-run from a fresh kernel.
offence_cols = [
    col for col in ['Offence Type', 'Offence Group', 'Offence']
    if col in df.columns
]
df[offence_cols]

Unnamed: 0,Offence Type,Offence Group,Offence
0,04: Summary motoring,12: Summary motoring,817 Neglecting road regulations (other than sp...
1,02: Triable either way,04: Theft offences,46 Theft from Shops
2,04: Summary motoring,12: Summary motoring,816 Speed limit offences (MOT)
3,03: Summary non-motoring,11: Summary non-motoring,105 Common assault and battery
4,04: Summary motoring,12: Summary motoring,816 Speed limit offences (MOT)
...,...,...,...
8170320,03: Summary non-motoring,11: Summary non-motoring,"104 Assaulting, resisting or obstructing a con..."
8170321,04: Summary motoring,12: Summary motoring,803A Driving a motor vehicle under the influen...
8170322,02: Triable either way,07: Possession of weapons,10D Possession of article with blade or point
8170323,04: Summary motoring,12: Summary motoring,803A Driving a motor vehicle under the influen...


In [10]:
# Filter the DataFrame for rows where 'Offence' contains 'emergency worker'
# (case-insensitive; rows with a missing 'Offence' count as non-matches)
emergency_worker_df = df[df['Offence'].str.contains('emergency worker', case=False, na=False)]

# Display the filtered DataFrame. 'Offence Type' is selected only when present,
# because the loader may not return that column and selecting a missing label
# raises KeyError.
offence_cols = [
    col for col in ['Offence Type', 'Offence Group', 'Offence']
    if col in emergency_worker_df.columns
]
emergency_worker_df[offence_cols].head(10)

Unnamed: 0,Offence Type,Offence Group,Offence
990408,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
990769,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
991674,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
991751,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
992587,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
992622,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
992661,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
992793,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
993070,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker
993476,02: Triable either way,01: Violence against the person,8.22 Assault of an emergency worker


Right, that has worked, so I need to include the `Offence` column in the `usecols` list in the `load_outcomes_data()` function to begin with, so that I can filter the DataFrame for rows where 'Offence' contains 'emergency worker'.

In [18]:
# Re-run the full pipeline and keep the result; this rebinds `df` from the raw
# data loaded above to the processed, filtered data
df = filter_sentence_type.load_and_process_data()

2025-07-30 14:47:07,691 - INFO - Loading outcomes by offence data...
2025-07-30 14:47:40,685 - INFO - Loaded data from data/raw/sentence_outcomes_2017_2024.csv
2025-07-30 14:48:11,373 - INFO - Loaded data from data/raw/sentence_outcomes_2010_2016.csv
2025-07-30 14:48:16,406 - INFO - Processing data...
2025-07-30 14:48:47,431 - INFO - Applying filters...
2025-07-30 14:48:47,435 - INFO - Include filter on column 'sex' with values: ['Female']
2025-07-30 14:48:47,832 - INFO - Include filter on column 'outcome' with values: ['Immediate Custody', 'Community Sentence', 'Suspended Sentence']
2025-07-30 14:48:48,006 - INFO - Include filter on column 'age_group' with values: ['Adults', 'Young adults']
2025-07-30 14:48:48,091 - INFO - Exclude filter on column 'pfa' with values: ['Not known']
2025-07-30 14:48:48,254 - INFO - Data filtered.
2025-07-30 14:48:48,772 - INFO - Data loaded and processed successfully.


In [12]:
# Display the processed DataFrame to check for the specific_offence column
df

Unnamed: 0,year,pfa,sex,age_group,offence,specific_offence,outcome,sentence_len,freq
4268271,2010,Avon and Somerset,Female,Adults,Drug offences,Unlawful importation - Class A,Immediate Custody,More than 12 months and up to and including 18...,1
4269203,2010,Avon and Somerset,Female,Adults,Fraud offences,"Fraud by false representation: cheque, plastic...",Community Sentence,,2
4269825,2010,Avon and Somerset,Female,Young adults,Possession of weapons,Possession of article with blade or point,Community Sentence,,1
4270091,2010,Avon and Somerset,Female,Adults,Fraud offences,Fraud by false representation: other frauds,Suspended Sentence,,1
4272402,2010,Avon and Somerset,Female,Adults,Summary non-motoring,Common assault and battery,Community Sentence,,1
...,...,...,...,...,...,...,...,...,...
4254435,2024,Wiltshire,Female,Adults,Violence against the person,Assault of an emergency worker,Community Sentence,,1
4262593,2024,Wiltshire,Female,Adults,Violence against the person,Assault of an emergency worker,Community Sentence,,2
4263581,2024,Wiltshire,Female,Adults,Violence against the person,Assault of an emergency worker,Community Sentence,,1
4265810,2024,Wiltshire,Female,Adults,Theft offences,Blackmail,Suspended Sentence,,1


In [19]:
# Distinct specific offences present in the processed, filtered data
df['specific_offence'].unique()

array(['Unlawful importation - Class A',
       'Fraud by false representation: cheque, plastic card and online bank accounts',
       'Possession of article with blade or point',
       'Fraud by false representation: other frauds',
       'Common assault and battery', 'Theft from Shops',
       'Causing fear or provocation of violence - summary',
       'Possession of a controlled drug - Class A', 'False Accounting',
       'Driving a motor vehicle under the influence of drink or drugs (MOT)',
       'Abstracting Electricity',
       'Protection from Harassment Act 1997 S.2 - Summary offence of harassment',
       'Driving licence related offences (excluding fraud and forgery) - summary (MOT)',
       'Murder', 'Benefit fraud offences - triable either way',
       'Other Theft or Unauthorised Taking',
       'Assaulting, resisting or obstructing a constable or designated officer in execution of duty',
       'Perverting the Course of Justice - indictable only',
       'Failing to Sur

Great, that has worked and I've integrated that into the necessary functions. This is all I need to do for now, so I will commit this change and then move on to the next processing step in the pipeline.