# Women's imprisonment rates
## Criminal Justice Statistics Police Force Area: Cleaning data for analysis of sentences

#### Importing pandas library

In [1]:
import pandas as pd

#### Importing dataset and filtering columns

In [2]:
# Columns to keep from the sentencing file; everything else is skipped at read time.
usecols = [
    'Police Force Area',
    'Year',
    'Sex',
    'Age group',
    'Offence group',
    'Sentence Outcome',
    'Custodial Sentence Length',
    'Sentenced',
]

# The file is decoded as latin1 and read in a single pass (low_memory=False).
df = pd.read_csv(
    '../data/external/sentencing.csv',
    usecols=usecols,
    encoding='latin1',
    low_memory=False,
)

#### Importing dataset going back to 2009 and filtering columns

In [3]:
# Columns to keep from the court outcomes file; note its headers differ from
# the sentencing file (e.g. 'Year of Appearance', 'Outcome', 'Count').
usecols_2009 = [
    'Police Force Area',
    'Year of Appearance',
    'Sex',
    'Age Group',
    'Offence Group',
    'Outcome',
    'Custodial Sentence Length',
    'Count',
]

# NOTE(review): this path goes via ../../womens-pfa-analysis/ whereas the other
# dataset is read from ../data/external/ - confirm this is the intended location.
df_2009 = pd.read_csv(
    '../../womens-pfa-analysis/data/external/court-outcomes-by-PFA-2019.csv',
    usecols=usecols_2009,
    encoding='latin1',
    low_memory=False,
)

#### Viewing datasets

In [4]:
# Preview the first five rows of the sentencing dataset
df.head(n=5)

Unnamed: 0,Year,Offence group,Sex,Age group,Police Force Area,Sentence Outcome,Custodial Sentence Length,Sentenced
0,2017,01: Violence against the person,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,01: Violence against the person,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,01: Violence against the person,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [5]:
# Preview the first five rows of the court outcomes dataset
df_2009.head(n=5)

Unnamed: 0,Police Force Area,Year of Appearance,Sex,Age Group,Offence Group,Outcome,Custodial Sentence Length,Count
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1


#### Checking time periods for both dataframes

In [8]:
# Distinct years present in the sentencing dataset
df.loc[:, 'Year'].unique()

array([2017, 2018, 2019, 2020, 2021])

In [9]:
# Distinct years present in the court outcomes dataset
df_2009.loc[:, 'Year of Appearance'].unique()

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [18]:
# Spot check: immediate custody rows recorded for Dyfed-Powys in 2021.
# This uses the original column names, so it must run before the columns are renamed.
is_dyfed_powys = df['Police Force Area'].eq("Dyfed-Powys")
is_2021 = df['Year'].eq(2021)
is_immediate_custody = df['Sentence Outcome'].eq("07: Total Immediate custody")
df.loc[is_dyfed_powys & is_2021 & is_immediate_custody]

Unnamed: 0,Year,Offence group,Sex,Age group,Police Force Area,Sentence Outcome,Custodial Sentence Length,Sentenced
1409992,2021,01: Violence against the person,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,02: Custody - Over 1 month and up to and inclu...,1
1411577,2021,01: Violence against the person,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,01: Custody - Up to and including 1 month,1
1411803,2021,01: Violence against the person,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,03: Custody - Over 2 months and up to and incl...,1
1411804,2021,01: Violence against the person,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,04: Custody - More than 3 months and under 6 m...,4
1412017,2021,01: Violence against the person,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,01: Custody - Up to and including 1 month,1
...,...,...,...,...,...,...,...,...
1722077,2021,12: Summary motoring,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,02: Custody - Over 1 month and up to and inclu...,1
1722078,2021,12: Summary motoring,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,03: Custody - Over 2 months and up to and incl...,1
1722079,2021,12: Summary motoring,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,04: Custody - More than 3 months and under 6 m...,1
1722514,2021,12: Summary motoring,02: Male,03: Adults,Dyfed-Powys,07: Total Immediate custody,03: Custody - Over 2 months and up to and incl...,1


#### The two datasets overlap for 2017-2019. `df` already covers those years, so remove all 2017, 2018 and 2019 values from `df_2009`

In [11]:
# Keep only appearances before 2017, then display the remaining years to confirm
before_2017 = df_2009['Year of Appearance'].lt(2017)
df_2009 = df_2009.loc[before_2017]
df_2009['Year of Appearance'].unique()

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])

#### Renaming and re-ordering columns

In [12]:
# Rename by explicit old-name -> new-name mapping rather than assigning a list to
# .columns. read_csv returns columns in file order (not `usecols` order), so a
# positional list silently mislabels data if the file layout changes, and it also
# mislabels columns if this cell is re-run after the reorder below. With a mapping,
# re-running the cell is a no-op because already-renamed columns are simply not matched.
df = df.rename(columns={
    'Year': 'year',
    'Offence group': 'offence',
    'Sex': 'sex',
    'Age group': 'age_group',
    'Police Force Area': 'pfa',
    'Sentence Outcome': 'outcome',
    'Custodial Sentence Length': 'sentence_len',
    'Sentenced': 'freq',
})
df_2009 = df_2009.rename(columns={
    'Police Force Area': 'pfa',
    'Year of Appearance': 'year',
    'Sex': 'sex',
    'Age Group': 'age_group',
    'Offence Group': 'offence',
    'Outcome': 'outcome',
    'Custodial Sentence Length': 'sentence_len',
    'Count': 'freq',
})

# Put both dataframes into the same column order
df_column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_len', 'freq']
df = df[df_column_order]
df_2009 = df_2009[df_column_order]

#### Removing sentence length prefixes

In [13]:
# Strip the leading code/label prefix from each sentence length, e.g.
# "23: Custody - Life" in df becomes "Life". The patterns are raw strings so that
# \S and \d reach the regex engine instead of being parsed as (invalid) Python
# string escapes, which emit a warning on current Python versions.
df['sentence_len'] = df['sentence_len'].str.replace(r"^\S*: \S* - ", "", regex=True)
# df_2009 values are assumed to carry only a two-digit code prefix - TODO confirm
df_2009['sentence_len'] = df_2009['sentence_len'].str.replace(r"\d\d: ", "", regex=True)

#### Concatenating the two dataframes

In [15]:
# Stack df_2009 on top of df. Both inputs carry row labels that start at 0, so
# ignore_index=True is used to give the combined frame a fresh, unique 0..n-1 index
# instead of repeating labels from the two sources.
df_combined = pd.concat([df_2009, df], ignore_index=True)

In [16]:
# Remove the two-digit code prefix (e.g. "02: Male" -> "Male") from each categorical
# column. The pattern is a raw string so that \d is passed to the regex engine rather
# than being parsed as an invalid Python string escape.
code_prefix = r"\d\d: "
for column in ['outcome', 'sex', 'age_group', 'offence']:
    df_combined[column] = df_combined[column].str.replace(code_prefix, "", regex=True)

In [17]:
# Standardising outcomes of interest: drop the "Total " wording so the same
# outcome label is used wherever these two phrases appear
outcome_replacements = [
    ("(Total Immediate custody)", "Immediate custody"),
    ("(Total Community sentence)", "Community sentence"),
]
outcome = df_combined['outcome']
for pattern, replacement in outcome_replacements:
    outcome = outcome.str.replace(pattern, replacement, regex=True)
df_combined['outcome'] = outcome

In [18]:
# Standardising sentence lengths: apply each wording substitution in order,
# then capitalise the first letter (which also lower-cases the rest of the label)
sentence_len_replacements = [
    ("(Over)", "More than"),
    ("( and including)", ""),
    ("(to less than)", "and under"),
    ("Life$", "Life sentence"),
]
sentence_len = df_combined['sentence_len']
for pattern, replacement in sentence_len_replacements:
    sentence_len = sentence_len.str.replace(pattern, replacement, regex=True)
df_combined['sentence_len'] = sentence_len.str.capitalize()

In [19]:
# Keep rows for women in the "Adults" and "Young adults" age groups whose outcome is
# immediate custody, a community sentence or a suspended sentence, excluding the
# two police force areas listed below
is_female = df_combined['sex'].eq('Female')
has_outcome_of_interest = df_combined['outcome'].isin(
    ['Immediate custody', 'Community sentence','Suspended sentence']
)
is_adult = df_combined['age_group'].isin(["Adults", "Young adults"])
is_excluded_pfa = df_combined['pfa'].isin(
    ["Special/miscellaneous and unknown police forces", "City of London"]
)
women_custody = df_combined.loc[
    is_female & has_outcome_of_interest & is_adult & ~is_excluded_pfa
]

In [21]:
# Write the filtered rows to the interim data folder (row index omitted)
women_custody.to_csv(
    path_or_buf='../data/interim/PFA_2009-21_women_cust_comm_sus.csv',
    index=False,
)