In [None]:
import src.data.utilities as utils
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Only the columns needed for the sentencing analysis are read from the CSV.
cols = [
    'Police Force Area',
    'Year',
    'Sex',
    'Age group',
    'Offence group',
    'Sentence Outcome',
    'Custodial Sentence Length',
    'Sentenced',
]
# NOTE(review): absolute, machine-specific path — this cell will not run on another machine.
df = pd.read_csv(
    "/Users/alex/Downloads/obo_sent_pivot_2016_2022/obo_sent_pivot_2019_2022.csv",
    usecols=cols,
    encoding='unicode_escape',
    low_memory=True,
)
df.head()

In [None]:
# Map the source-specific column names onto the short names used in the rest
# of the notebook. Keys that are not present in this file (e.g. 'count') are
# carried along so the same mapping can serve every input dataset.
column_renames = {
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq',
}

# NOTE(review): neither helper's return value is assigned, so both presumably
# modify `df` in place — confirm in src/data/utilities.py.
utils.lcColumns(df)
utils.renameColumns(df, columns=column_renames)

In [None]:
# Preview the first rows to confirm the columns have been renamed.
df.head()

In [None]:
# Clean the cell values with the project's tidy_elements helper.
# NOTE(review): the return value is not assigned, so this presumably mutates `df` in place — confirm.
utils.tidy_elements(df)

In [None]:
# Preview the first rows again to inspect the tidied values.
df.head()

In [None]:
# Checking that sentence_length values have been tidied up:
# list every distinct value left in the column so stray formatting is easy to spot.
df['sentence_length'].unique()

In [None]:
# Canonical column order used throughout the project. reindex() (rather than
# plain selection) creates an all-NaN column for any name that is missing.
column_order = [
    'year', 'pfa', 'sex', 'age_group',
    'offence', 'outcome', 'sentence_length', 'freq',
]

# Low-cardinality text columns are stored as pandas categoricals.
convert_dict = dict.fromkeys(['outcome', 'sentence_length'], "category")

df = df.reindex(columns=column_order).astype(convert_dict)

In [None]:
# Summarise dtypes and non-null counts to confirm the category conversion.
df.info()

In [None]:
## FILTERING DATASET
# Women only
filt1 = df['sex'] == 'Female'
# The three sentence outcomes of interest.
# NOTE: isin() is an exact, case-sensitive match, so differently-cased labels are silently excluded.
filt2 = df['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
# Adult age groups only
filt3 = df['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ when combined below)
filt4 = df['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
# A row is kept only if it passes every condition and is not in an excluded PFA
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame rather than a view of df
women_dataset = df[filt].sort_values(['year', 'pfa']).copy()

In [None]:
# Renumber the rows 0..n-1 after filtering and sorting, discarding the old index.
women_dataset = women_dataset.reset_index(drop=True)
women_dataset.head()

In [None]:
# Number of distinct police force areas left after filtering
# (dropna=False so a missing PFA would be counted, as it is by len(unique())).
women_dataset['pfa'].nunique(dropna=False)

In [None]:
# Subset to the 2019 rows for comparison against the previous dataset.
women_dataset_2019 = women_dataset[women_dataset['year'] == 2019]

Importing previous dataset to ensure matching

In [None]:
# Load the previously produced interim dataset to use as the reference for comparison.
# The path is relative, so it resolves against the notebook's working directory.
df_orig = pd.read_csv('data/interim/PFA_2009-21_women_cust_comm_sus.csv')
df_orig.head()

In [None]:
# Summary (count / unique / top / freq) of the text columns in the reference data for 2019.
df_orig[df_orig['year'] == 2019].describe(include=['object', 'category'])

In [None]:
# 2019 slice of the reference dataset, kept for the comparisons further down.
df_orig_2019 = df_orig[df_orig['year'] == 2019]

In [None]:
# Print the distinct values of every column to spot anything unexpected.
for column, values in women_dataset_2019.items():
    print(column, values.unique())

The only `outcome` appears to be `Community sentence` so something has gone wrong with the filtering stage. Let's circle back

In [None]:
# Check row count and dtypes of the 2019 subset.
women_dataset_2019.info()

In [None]:
# Summarise the text/categorical columns; the 'unique' row shows how many distinct outcomes survived the filter.
women_dataset_2019.describe(include=['object', 'category'])

Right, that seems to confirm it. Time to examine the filtering again.

In [None]:
# List the outcome labels exactly as they appear in the unfiltered data.
df['outcome'].unique()

As ever, it's silly string formatting that seems to have tripped everything up. Let's rectify this by making the match case-insensitive.

In [None]:
# Lower-cased lookup set so outcome labels can be matched regardless of case.
mylist = ['Immediate custody', 'Community sentence', 'Suspended sentence']
myset = {label.lower() for label in mylist}
myset

In [None]:
# Case-insensitive match on the outcome label.
# The vectorised .str accessor is used instead of apply(lambda v: v.lower() ...)
# so that a missing (NaN) outcome counts as "no match" rather than raising
# AttributeError, and the result is always a plain boolean Series.
filt2 = df['outcome'].str.lower().isin(myset)
df[filt2].describe(include=['object', 'category'])

Re-filtering dataframe

In [None]:
## FILTERING DATASET
# Outcomes of interest, lower-cased so they can be matched regardless of case
sentences = ['Immediate custody', 'Community sentence','Suspended sentence']
sentences_set = set([s.lower() for s in sentences])

# Women only
filt1 = df['sex'] == 'Female'
# Case-insensitive outcome match. The vectorised .str accessor treats a missing
# (NaN) outcome as "no match" instead of raising AttributeError on .lower().
filt2 = df['outcome'].str.lower().isin(sentences_set)
# Adult age groups only
filt3 = df['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ below)
filt4 = df['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df[filt].sort_values(['year', 'pfa']).copy()

In [None]:
# Summarise the text/categorical columns of the re-filtered data; check the 'unique' count for `outcome`.
women_dataset.describe(include=['object', 'category'])

Great, we have three unique values in the `outcome` column. Let's filter for 2019 and check further

In [None]:
# Rebuild the 2019 subset from the re-filtered dataset.
women_dataset_2019 = women_dataset[women_dataset['year'] == 2019]

In [None]:
# Check row count and dtypes of the rebuilt 2019 subset.
women_dataset_2019.info()

In [None]:
# Total of `freq` for 2019 in the new dataset.
women_dataset_2019['freq'].sum()

In [None]:
# Total of `freq` for 2019 in the reference dataset, for comparison with the new total.
df_orig_2019['freq'].sum()

Right, the sum of `freq` is the same for both dataframes. Let's now check against all the years within `women_dataset` and `df_orig`

In [None]:
# Years covered by the reference dataset.
df_orig['year'].unique()

In [None]:
# Years covered by the new dataset.
women_dataset['year'].unique()

In [None]:
# Print the yearly total of `freq` in the new dataset.
for year in women_dataset['year'].unique():
    in_year = women_dataset['year'] == year
    print(year, women_dataset.loc[in_year, 'freq'].sum())

In [None]:
# Print the reference dataset's yearly total of `freq` for each year that is
# present in the new dataset; years with no reference rows (total 0) are skipped.
for year in women_dataset['year'].unique():
    sentenced_sum = df_orig.loc[df_orig['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

## Excellent, think we're there. Time to bring this all together.

In [None]:
import src.data.utilities as utils
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

In [None]:
# Only the columns needed for the sentencing analysis are read from the CSV.
cols = [
    'Police Force Area',
    'Year',
    'Sex',
    'Age group',
    'Offence group',
    'Sentence Outcome',
    'Custodial Sentence Length',
    'Sentenced',
]
# NOTE(review): absolute, machine-specific path — this cell will not run on another machine.
df = pd.read_csv(
    "/Users/alex/Downloads/obo_sent_pivot_2016_2022/obo_sent_pivot_2019_2022.csv",
    usecols=cols,
    encoding='unicode_escape',
    low_memory=True,
)

In [None]:
# Source-specific column names -> short names used in the rest of the notebook.
column_renames = {
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq',
}

# NOTE(review): none of the helpers' return values are assigned, so they
# presumably modify `df` in place — confirm in src/data/utilities.py.
utils.lcColumns(df)
utils.renameColumns(df, columns=column_renames)
utils.tidy_elements(df)

# Canonical column order; reindex() creates an all-NaN column for any missing name.
column_order = [
    'year', 'pfa', 'sex', 'age_group',
    'offence', 'outcome', 'sentence_length', 'freq',
]

# Low-cardinality text columns are stored as pandas categoricals.
convert_dict = dict.fromkeys(['outcome', 'sentence_length'], "category")

df = df.reindex(columns=column_order).astype(convert_dict)

In [None]:
## FILTERING DATASET
# Outcomes of interest, lower-cased so they can be matched regardless of case
sentences = ['Immediate custody', 'Community sentence','Suspended sentence']
sentences_set = set([s.lower() for s in sentences])

# Women only
filt1 = df['sex'] == 'Female'
# Case-insensitive outcome match. The vectorised .str accessor treats a missing
# (NaN) outcome as "no match" instead of raising AttributeError on .lower().
filt2 = df['outcome'].str.lower().isin(sentences_set)
# Adult age groups only
filt3 = df['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ below)
filt4 = df['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df[filt].sort_values(['year', 'pfa']).copy()

In [None]:
# Preview the filtered dataset.
women_dataset.head()

In [None]:
# Years present in the filtered dataset.
women_dataset['year'].unique()

In [None]:
# Total of `freq` for 2019, to compare against the figure obtained earlier.
women_dataset.loc[women_dataset['year'] == 2019, 'freq'].sum()

Right, that looks to have behaved. Let's now attempt to run the full `data_cleansing.py` code below, with those filtering adaptations made.

In [2]:
import src.data.utilities as utils
%load_ext autoreload
%autoreload 2

import pandas as pd
import glob

Building a simple list to read in multiple csv files

In [None]:
# Collect every CSV file in the 2016–2022 sentencing data folder.
path = "data/external/obo_sent_pivot_2016_2022/"
all_files = glob.glob(f"{path}*.csv")
all_files

In [None]:
# Columns to keep from each sentencing CSV.
cols = [
    'Police Force Area',
    'Year',
    'Sex',
    'Age group',
    'Offence group',
    'Sentence Outcome',
    'Custodial Sentence Length',
    'Sentenced',
]

# Load each file with the project loader, then stack the frames row-wise
# with a fresh 0..n-1 index.
all_csvs = []
for filename in all_files:
    all_csvs.append(utils.loadData(filename, cols=cols))
df = pd.concat(all_csvs, ignore_index=True)

Checking that the concat has pulled through all of the dates

In [None]:
# Years present in the concatenated frame (column still has its original name 'Year' at this stage).
df['Year'].unique()

Importing other csv and working through the `data_cleansing.py` stages

In [None]:
# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
# This older publication uses different column names from the 2010+ files.
cols_2009 = [
    'Police Force Area',
    'Year of Appearance',
    'Sex',
    'Age Group',
    'Offence Group',
    'Outcome',
    'Custodial Sentence Length',
    'Count',
]
# NOTE(review): the path steps out into a sibling project directory (../womens-pfa-analysis).
df_2009 = utils.loadData(
    '../womens-pfa-analysis/data/external/court-outcomes-by-PFA-2019.csv',
    cols=cols_2009,
)

In [None]:
# Years present in the older court-outcomes dataset.
df_2009['Year of Appearance'].unique()

In [None]:
# Keep only the years before 2016 from the older dataset, then confirm which years remain.
filt = df_2009['Year of Appearance'].lt(2016)
df_2009 = df_2009.loc[filt].copy()
df_2009['Year of Appearance'].unique()

In [None]:
# Build the list of DataFrames to clean from the current namespace.
# NOTE(review): locals() at notebook top level contains every variable left in
# the kernel, so the result presumably depends on which cells have been run
# (e.g. df_orig / women_dataset from earlier cells) — confirm how
# utils.dataframeList selects frames, or pass the frames explicitly.
df_list = utils.dataframeList(locals())

In [None]:
# Source-specific column names -> short names shared by both datasets.
column_renames = {
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq',
}

# Harmonise the column names of every frame so they line up when concatenated.
# NOTE(review): the helpers' return values are not assigned, so they presumably
# work in place — confirm in src/data/utilities.py.
for data in df_list:
    utils.lcColumns(data)
    utils.renameColumns(data, columns=column_renames)

# Stack the frames and tidy the cell values of the combined frame.
df_combined = pd.concat(df_list)
utils.tidy_elements(df_combined)

# Canonical column order; reindex() creates an all-NaN column for any missing name.
column_order = [
    'year', 'pfa', 'sex', 'age_group',
    'offence', 'outcome', 'sentence_length', 'freq',
]
df_combined = df_combined.reindex(columns=column_order)

In [None]:
# Display the combined frame (pandas truncates the rendering of long frames).
df_combined

In [None]:
# Store the low-cardinality text columns as pandas categoricals.
convert_dict = dict.fromkeys(['outcome', 'sentence_length'], "category")
df_combined = df_combined.astype(convert_dict)

Capitalising the outcomes for consistency before filtering

In [None]:
# Capitalise the outcome labels (first letter upper-case, the rest lower-case)
# so both datasets use the same spelling. .str methods on a categorical column
# return object dtype, so convert back to category to keep the dtype that was
# set for this column.
df_combined['outcome'] = df_combined['outcome'].str.capitalize().astype('category')
df_combined['outcome'].unique()

In [None]:
# Capitalise the outcome labels so the exact-match filter below sees one
# consistent spelling. .str methods on a categorical column return object
# dtype, so convert back to category to keep the column's dtype.
df_combined['outcome'] = df_combined['outcome'].str.capitalize().astype('category')

## FILTERING DATASET
# Women only
filt1 = df_combined['sex'] == 'Female'
# The three sentence outcomes of interest (exact match on the capitalised labels)
filt2 = df_combined['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
# Adult age groups only
filt3 = df_combined['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ below)
filt4 = df_combined['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df_combined[filt].sort_values(['year', 'pfa']).copy()

In [None]:
# Confirm which outcome labels are present after filtering.
women_dataset['outcome'].unique()

In [None]:
# Total of `freq` for 2022 in the combined, filtered dataset.
women_dataset.query('year==2022')['freq'].sum()

In [None]:
# Print the yearly total of `freq`, skipping any year whose total is not positive.
for year in women_dataset['year'].unique():
    sentenced_sum = women_dataset.loc[women_dataset['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

Running this again, but using the 2010–2015 data also provided in the same publication, rather than the older 2009–2019 data. I've also renamed the folder to reference that the data goes back to 2010 now—so earlier code will throw an error.

This means I can drop quite a few of the stages (commented out below).

In [None]:
## Collecting file paths with glob

# Every CSV in the renamed 2010–2022 folder is picked up
path="data/external/obo_sent_pivot_2010_2022/"
all_files = glob.glob(path + "*.csv")


## IMPORTING DATASETS ##
# 1. Sentencing data 2010–22 
# (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157979/obo_sent_pivot_2010_2015.zip and 
# https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157991/obo_sent_pivot_2016_2022.zip)

# Load only the needed columns from each file, then stack the frames with a fresh index
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
all_csvs = [utils.loadData(filename, cols=cols) for filename in all_files]
df = pd.concat(all_csvs, axis=0, ignore_index=True)

# The stages below are disabled: the 2009–2019 file is no longer combined in this version.
# # 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
# cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
# df_2009 = utils.loadData('../womens-pfa-analysis/data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)

# #Dropping duplicate data from 2009 dataset that also appears in df
# filt = df_2009['Year of Appearance'] < 2016
# df_2009 = df_2009[filt].copy()

# Defining datasets to iterate through in following section
# df_list = utils.dataframeList(locals()) #Using locals() function to retrieve local symbol table. Note this outputs a complex list, and is no longer a DataFrame.

## DATA CLEANING PROCESS

# Renaming columns
# NOTE(review): the helpers' return values are not assigned, so they presumably modify `df` in place — confirm.
utils.lcColumns(df)
utils.renameColumns(df, columns={
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq'}
    )

# Tidying elements into one DataFrame
utils.tidy_elements(df)

# Reordering columns
column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_length', 'freq']
df = df.reindex(columns=column_order)

# Setting categorical columns
convert_dict = {'outcome': "category",
                'sentence_length': "category"
                }
df = df.astype(convert_dict)

# Capitalising outcomes (first letter upper-case, the rest lower-case) so the exact-match filter below works
# NOTE(review): .str methods on a categorical column return object dtype, so this
# undoes the category conversion of 'outcome' made just above.
df['outcome'] = df['outcome'].str.capitalize()

## FILTERING DATASET
# Women, the three outcomes of interest, adult age groups, excluding the listed PFAs
filt1 = df['sex'] == 'Female'
filt2 = df['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
filt3 = df['age_group'].isin(["Adults", "Young adults"])
filt4 = df['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df[filt].sort_values(['year', 'pfa']).copy()

In [None]:
# Print the yearly total of `freq`, skipping any year whose total is not positive.
for year in women_dataset['year'].unique():
    sentenced_sum = women_dataset.loc[women_dataset['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

In [None]:
# Same yearly-total check as the previous cell (this cell is a duplicate of it).
for year in women_dataset['year'].unique():
    sentenced_sum = women_dataset.loc[women_dataset['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

In [None]:
# Write a test copy of the filtered dataset; index=False leaves the row index out of the file.
women_dataset.to_csv('data/interim/TESTING_PFA_2010-22_women_cust_comm_sus.csv', index=False)

In [None]:
# NOTE(review): identical to the previous cell — it rewrites the same file and can probably be removed.
women_dataset.to_csv('data/interim/TESTING_PFA_2010-22_women_cust_comm_sus.csv', index=False)

Think I will add 2009 data back in from the other file next, just for continuity.

In [3]:
## Collecting file paths with glob

# Every CSV in the 2010–2022 folder is picked up
path="data/external/obo_sent_pivot_2010_2022/"
all_files = glob.glob(path + "*.csv")


## IMPORTING DATASETS ##
# 1. Sentencing data 2010–22 
# (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157979/obo_sent_pivot_2010_2015.zip and 
# https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157991/obo_sent_pivot_2016_2022.zip)

# Load only the needed columns from each file, then stack the frames with a fresh index
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
all_csvs = [utils.loadData(filename, cols=cols) for filename in all_files]
df = pd.concat(all_csvs, axis=0, ignore_index=True)

# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
df_2009 = utils.loadData('../womens-pfa-analysis/data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)

#Dropping duplicate data from 2009 dataset that also appears in df
# (only the 2009 rows are kept; 2010 onwards comes from the newer files)
filt = df_2009['Year of Appearance'] == 2009
df_2009 = df_2009[filt].copy()

# Defining datasets to iterate through in following section
# NOTE(review): locals() at notebook top level holds every variable left in the
# kernel, so the list presumably depends on which cells have run — confirm how
# utils.dataframeList selects frames.
df_list = utils.dataframeList(locals()) #Using locals() function to retrieve local symbol table. Note this outputs a complex list, and is no longer a DataFrame.

## DATA CLEANING PROCESS

# Renaming columns
for data in df_list:
    utils.lcColumns(data)
    utils.renameColumns(data, columns={
        'year_of_appearance': 'year',
        'offence_group': 'offence',
        'police_force_area': 'pfa',
        'sentence_outcome': 'outcome',
        'custodial_sentence_length': 'sentence_length',
        'sentenced': 'freq',
        'count': 'freq'}
        )

# Joining and tidying elements into one DataFrame
df_combined = pd.concat(df_list)
utils.tidy_elements(df_combined)

# Reordering columns
column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_length', 'freq']
df_combined = df_combined.reindex(columns=column_order)

# Setting categorical columns
convert_dict = {'outcome': "category",
                'sentence_length': "category"
                }
df_combined = df_combined.astype(convert_dict)

# Capitalising outcomes (first letter upper-case, the rest lower-case) so the exact-match filter below works
# NOTE(review): .str methods on a categorical column return object dtype, so this
# undoes the category conversion of 'outcome' made just above.
df_combined['outcome'] = df_combined['outcome'].str.capitalize()

## FILTERING DATASET
# Women, the three outcomes of interest, adult age groups, excluding the listed PFAs
filt1 = df_combined['sex'] == 'Female'
filt2 = df_combined['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
filt3 = df_combined['age_group'].isin(["Adults", "Young adults"])
filt4 = df_combined['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df_combined[filt].sort_values(['year', 'pfa']).copy()

## OUTPUTTING INTERIM DATASET FOR FURTHER ANALYSIS
# (export left disabled in this cell)
# women_dataset.to_csv('data/interim/TESTING_PFA_2010-22_women_cust_comm_sus.csv', index=False)

Checking issue with 2009

In [16]:
# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
# Reload the older dataset on its own, using its publication-specific column names.
cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
df_2009 = utils.loadData('../womens-pfa-analysis/data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)
# Display the raw frame (pandas truncates the rendering of long frames)
df_2009

Unnamed: 0,Police Force Area,Year of Appearance,Sex,Age Group,Offence Group,Outcome,Custodial Sentence Length,Count
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1
...,...,...,...,...,...,...,...,...
4102850,Wiltshire,2019,"03: Companies, public bodies etc.","04: Companies, public bodies etc.",12: Summary motoring,04: Withdrawn,,1
4102851,Wiltshire,2019,"03: Companies, public bodies etc.","04: Companies, public bodies etc.",12: Summary motoring,12: Fine,,1
4102852,Wiltshire,2019,"03: Companies, public bodies etc.","04: Companies, public bodies etc.",12: Summary motoring,04: Withdrawn,,2
4102853,Wiltshire,2019,"03: Companies, public bodies etc.","04: Companies, public bodies etc.",11: Summary non-motoring,12: Fine,,10


In [17]:
# Source-specific column names -> short names used in the rest of the notebook.
column_renames = {
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq',
}

# NOTE(review): none of the helpers' return values are assigned, so they
# presumably modify `df_2009` in place — confirm in src/data/utilities.py.
utils.lcColumns(df_2009)
utils.renameColumns(df_2009, columns=column_renames)
utils.tidy_elements(df_2009)

In [18]:
# Display the frame after renaming and tidying (pandas truncates the rendering of long frames).
df_2009

Unnamed: 0,pfa,year,sex,age_group,offence,outcome,sentence_length,freq
0,Avon and Somerset,2009,Male,Children,Theft Offences,Community sentence,,1
1,Avon and Somerset,2009,Male,Children,Robbery,Community sentence,,1
2,Avon and Somerset,2009,Male,Children,Summary non-motoring,Community sentence,,1
3,Avon and Somerset,2009,Male,Children,Summary non-motoring,Community sentence,,1
4,Avon and Somerset,2009,Male,Children,Violence against the person,Committed for trial,,1
...,...,...,...,...,...,...,...,...
4102850,Wiltshire,2019,"Companies, public bodies etc.","Companies, public bodies etc.",Summary motoring,Withdrawn,,1
4102851,Wiltshire,2019,"Companies, public bodies etc.","Companies, public bodies etc.",Summary motoring,Fine,,1
4102852,Wiltshire,2019,"Companies, public bodies etc.","Companies, public bodies etc.",Summary motoring,Withdrawn,,2
4102853,Wiltshire,2019,"Companies, public bodies etc.","Companies, public bodies etc.",Summary non-motoring,Fine,,10


In [23]:
# Reordering columns
column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_length', 'freq']
df_2009 = df_2009.reindex(columns=column_order)

# Capitalising outcomes (first letter upper-case, the rest lower-case) so the
# exact-match filter below works. This is done BEFORE the category conversion:
# .str methods on a categorical column return object dtype, which would
# silently undo the conversion of 'outcome'.
df_2009['outcome'] = df_2009['outcome'].str.capitalize()

# Setting categorical columns
convert_dict = {'outcome': "category",
                'sentence_length': "category"
                }
df_2009 = df_2009.astype(convert_dict)

## FILTERING DATASET
# Women only
filt1 = df_2009['sex'] == 'Female'
# The three sentence outcomes of interest (exact match on the capitalised labels)
filt2 = df_2009['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
# Adult age groups only
filt3 = df_2009['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ below)
filt4 = df_2009['pfa'].isin(["Special/miscellaneous and unknown police forces", "City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame.
# Note this overwrites df_2009 with the filtered rows.
df_2009 = df_2009[filt].sort_values(['year', 'pfa']).copy()

In [27]:
# Total `freq` per sentence length for immediate-custody sentences in Cleveland.
# sentence_length is a categorical column, so categories with no matching rows still appear, with a total of 0.
df_2009.query('pfa == "Cleveland" & outcome == "Immediate custody"').groupby(['sentence_length'])['freq'].sum()

sentence_length
12 months                                   41
4 Years                                     15
6 months                                    49
Imprisonment for public protection           2
Life sentence                                0
More than 1 month and up to 2 months       189
More than 10 years and up to 15 years        0
More than 12 months and up to 18 months     61
More than 15 years and less than life        0
More than 18 months and up to 2 years       45
More than 2 months and up to 3 months       98
More than 2 years and up to 3 years         77
More than 3 months and under 6 months      137
More than 3 years and under 4 years         18
More than 4 years and up to 5 years         13
More than 5 years and up to 6 years          9
More than 6 months and up to 9 months       45
More than 6 years and up to 7 years          4
More than 7 years and up to 8 years          7
More than 8 years and up to 9 years          0
More than 9 months and under 12 months      

Right, following further inspection in `script_testing.ipynb` I suspect that the issue is a combination of two factors:
1. My `tidy_elements()` function removing "and including" from the sentence lengths
2. That I should not be combining with the earlier 2009 dataset—MoJ explicitly warns against doing this, and says that the new 2010+ files are the single source of truth

After updating `tidy_elements()` I'm going to try consolidating the code below and running again.

In [2]:
import src.data.utilities as utils
%load_ext autoreload
%autoreload 2

import pandas as pd
import glob

In [3]:
## Collecting file paths with glob

path="data/external/obo_sent_pivot_2010_2022/"
# glob returns matches in an arbitrary, filesystem-dependent order; sorting
# makes the concatenation order (and so the saved file) the same on every run.
all_files = sorted(glob.glob(path + "*.csv"))


## IMPORTING DATASETS ##

# 1. Sentencing data 2010–22 
# (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157979/obo_sent_pivot_2010_2015.zip and 
# https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1157991/obo_sent_pivot_2016_2022.zip)

# Load only the needed columns from each file, then stack the frames with a fresh index
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
all_csvs = [utils.loadData(filename, cols=cols) for filename in all_files]
df = pd.concat(all_csvs, axis=0, ignore_index=True)


## DATA CLEANING PROCESS

# Renaming columns
# NOTE(review): the helpers' return values are not assigned, so they presumably
# modify `df` in place — confirm in src/data/utilities.py.
utils.lcColumns(df)
utils.renameColumns(df, columns={
    'year_of_appearance': 'year',
    'offence_group': 'offence',
    'police_force_area': 'pfa',
    'sentence_outcome': 'outcome',
    'custodial_sentence_length': 'sentence_length',
    'sentenced': 'freq',
    'count': 'freq'}
    )

# Tidying elements using regex function
utils.tidy_elements(df)

# Reordering columns
column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_length', 'freq']
df = df.reindex(columns=column_order)

# Capitalising outcomes (first letter upper-case, the rest lower-case) so the
# exact-match filter below works. This is done BEFORE the category conversion:
# .str methods on a categorical column return object dtype, which would
# silently undo the conversion of 'outcome'.
df['outcome'] = df['outcome'].str.capitalize()

# Setting categorical columns
convert_dict = {'outcome': "category",
                'sentence_length': "category"
                }
df = df.astype(convert_dict)

## FILTERING DATASET
# Women only
filt1 = df['sex'] == 'Female'
# The three sentence outcomes of interest (exact match on the capitalised labels)
filt2 = df['outcome'].isin(['Immediate custody', 'Community sentence','Suspended sentence'])
# Adult age groups only
filt3 = df['age_group'].isin(["Adults", "Young adults"])
# Police force areas to exclude (negated with ~ below)
filt4 = df['pfa'].isin(["City of London", "Not known"])
filt = filt1 & filt2 & filt3 & ~filt4
# Sort by year then PFA; .copy() gives an independent frame
women_dataset = df[filt].sort_values(['year', 'pfa']).copy()

Checking `freq` values against those in `df_orig`—the previous dataset

In [6]:
# Load the previously produced interim dataset to use as the reference for the yearly totals.
df_orig = pd.read_csv('data/interim/PFA_2009-21_women_cust_comm_sus.csv')
df_orig.head()

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
0,2009,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,,2
1,2009,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,,1
2,2009,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,,1
3,2009,Avon and Somerset,Female,Young adults,Public order offences,Community sentence,,1
4,2009,Avon and Somerset,Female,Young adults,Miscellaneous crimes against society,Community sentence,,1


In [7]:
# Print the yearly total of `freq` in the reference dataset, skipping any year whose total is not positive.
for year in df_orig['year'].unique():
    sentenced_sum = df_orig.loc[df_orig['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

2009 36925
2010 37966
2011 37783
2012 33929
2013 31282
2014 31124
2015 32163
2016 29795
2017 28204
2018 25698
2019 24352
2020 17377
2021 20275


In [8]:
# Print the yearly total of `freq` in the new dataset, skipping any year whose total is not positive.
for year in women_dataset['year'].unique():
    sentenced_sum = women_dataset.loc[women_dataset['year'] == year, 'freq'].sum()
    if not sentenced_sum > 0:
        continue
    print(year, sentenced_sum)

2010 37966
2011 37783
2012 33929
2013 31282
2014 31124
2015 32163
2016 29795
2017 28206
2018 25698
2019 24352
2020 17377
2021 20275
2022 19182


Just one very small difference in the values for 2017, which I suspect is due to a data revision. Everything else is looking good, so let's save the dataset out.

In [5]:
# Save the validated dataset for further analysis; index=False leaves the row index out of the file.
women_dataset.to_csv('data/interim/PFA_2010-22_women_cust_comm_sus.csv', index=False)