# Women's imprisonment rates
## Criminal Justice Statistics Police Force Area: Data Cleansing

#### Importing pandas library and reading in data

In [1]:
import pandas as pd

In [147]:
usecols =['Police Force Area', 'Year', 'Sex', 'Age group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']

In [154]:
df = pd.read_csv('../data/external/sentencing.csv', usecols=usecols, encoding = 'latin1', low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1741818 entries, 0 to 1741817
Data columns (total 7 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   Year                       int64 
 1   Sex                        object
 2   Age group                  object
 3   Police Force Area          object
 4   Sentence Outcome           object
 5   Custodial Sentence Length  object
 6   Sentenced                  int64 
dtypes: int64(2), object(5)
memory usage: 93.0+ MB


In [155]:
df.head()

Unnamed: 0,Year,Sex,Age group,Police Force Area,Sentence Outcome,Custodial Sentence Length,Sentenced
0,2017,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [156]:
df = df.rename(columns = {'Year': 'year',
                     'Police Force Area': 'pfa',
                     'Age group': 'age_group',
                     'Sex': 'sex',
                     'Sentence Outcome': 'outcome',
                     'Custodial Sentence Length': 'sentence_len',
                     'Sentenced': 'freq'})
df.head()

Unnamed: 0,year,sex,age_group,pfa,outcome,sentence_len,freq
0,2017,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [158]:
df['outcome'] = df['outcome'].str.replace("\d\d: ","", regex=True)
df['sex'] = df['sex'].str.replace("\d\d: ","", regex=True)
df['age_group'] = df['age_group'].str.replace("\d\d: ","", regex=True)
df['sentence_len'] = df['sentence_len'].str.replace("^\S* \S* - ","", regex=True)
df.head()

Unnamed: 0,year,sex,age_group,pfa,outcome,sentence_len,freq
0,2017,Male,Young adults,Greater Manchester,Total Immediate custody,Life,1
1,2017,Male,Adults,West Yorkshire,Total Immediate custody,Life,1
2,2017,Male,Adults,Metropolitan Police,Total Immediate custody,Life,1
3,2017,Male,Adults,West Yorkshire,Total Immediate custody,Life,2
4,2017,Male,Children,Metropolitan Police,Total Immediate custody,Life,2


In [159]:
df['sentence_len'].unique()

array(['Life', 'More than 18 months and up to 2 years', nan,
       'More than 15 years and less than life',
       'More than 10 years and up to 15 years',
       'More than 6 years and up to 7 years',
       'More than 7 years and up to 8 years',
       'More than 3 years and under 4 years', '4 years',
       'More than 2 years and up to 3 years',
       'More than 12 months and up to 18 months',
       'More than 4 years and up to 5 years', '6 months',
       'More than 5 years and up to 6 years',
       'More than 9 years and up to 10 years',
       'More than 6 months and up to 9 months',
       'More than 9 months and under 12 months',
       'More than 8 years and up to 9 years', '12 months',
       'More than 3 months and under 6 months',
       'Over 2 months and up to and including 3 months',
       'Over 1 month and up to and including 2 months',
       'Up to and including 1 month'], dtype=object)

In [12]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1741818 entries, 0 to 1741817
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   year          1741818 non-null  int64 
 1   sex           1741818 non-null  object
 2   age_group     1741818 non-null  object
 3   pfa           1741818 non-null  object
 4   outcome       1741818 non-null  object
 5   sentence_len  299109 non-null   object
 6   freq          1741818 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 93.0+ MB


In [106]:
df2 = df.dropna(subset=['sentence_len'])

In [107]:
filt1 = df2['sex'] == 'Female'
filt2 = df2['outcome'] == 'Total Immediate custody'
filt3 = df2['age_group'].isin(["Adults", "Young adults"])
filt = filt1 & filt2 & filt3
women_custody = df2[filt]

In [108]:
women_custody

Unnamed: 0,year,sex,age_group,pfa,outcome,sentence_len,freq
39,2017,Female,Adults,Metropolitan Police,Total Immediate custody,Custody - Life,1
40,2017,Female,Adults,South Yorkshire,Total Immediate custody,Custody - Life,1
41,2017,Female,Adults,Hampshire,Total Immediate custody,Custody - Life,1
64,2017,Female,Adults,Gwent,Total Immediate custody,Custody - More than 7 years and up to 8 years,1
65,2017,Female,Adults,West Midlands,Total Immediate custody,Custody - More than 15 years and less than life,1
...,...,...,...,...,...,...,...
1722745,2021,Female,Adults,Kent,Total Immediate custody,Custody - Over 1 month and up to and including...,1
1722753,2021,Female,Adults,Nottinghamshire,Total Immediate custody,Custody - More than 3 months and under 6 months,1
1722755,2021,Female,Adults,South Yorkshire,Total Immediate custody,Custody - More than 3 months and under 6 months,1
1722762,2021,Female,Adults,West Midlands,Total Immediate custody,Custody - Over 2 months and up to and includin...,1


#### Checking that totals match pivot table outputs

In [19]:
years = women_custody.groupby('year').agg({'freq':'sum'})
years

Unnamed: 0_level_0,freq
year,Unnamed: 1_level_1
2017,7187
2018,6465
2019,5878
2020,4383
2021,4221


#### Using crosstab to aggregate the total number of custodial sentences by police force area for each year.

In [20]:
pfa_years = pd.crosstab(index=women_custody['pfa'], columns=women_custody['year'],
                     values=women_custody['freq'], aggfunc='sum')
pfa_years.head()

year,2017,2018,2019,2020,2021
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avon and Somerset,158.0,148.0,151.0,103.0,103.0
Bedfordshire,53.0,36.0,31.0,23.0,20.0
Cambridgeshire,115.0,116.0,89.0,78.0,47.0
Cheshire,172.0,176.0,149.0,123.0,117.0
City of London,5.0,1.0,,,


#### Converting floats to integers and assigning inplace

In [21]:
pfa_years = pfa_years.fillna(0.0).astype(int)
pfa_years.head()

year,2017,2018,2019,2020,2021
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avon and Somerset,158,148,151,103,103
Bedfordshire,53,36,31,23,20
Cambridgeshire,115,116,89,78,47
Cheshire,172,176,149,123,117
City of London,5,1,0,0,0


#### Removing "Special/miscellaneous and unknown police forces" and "City of London"

In [138]:
filt_pfa = ["Special/miscellaneous and unknown police forces", "City of London"]
df3 = pfa_years.query('pfa != @filt_pfa')

In [139]:
df3.head()

year,2017,2018,2019,2020,2021
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avon and Somerset,158,148,151,103,103
Bedfordshire,53,36,31,23,20
Cambridgeshire,115,116,89,78,47
Cheshire,172,176,149,123,117
Cleveland,152,140,98,55,103


#### Checking for number of observations

In [120]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42 entries, Avon and Somerset to Wiltshire
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   2017    42 non-null     int64
 1   2018    42 non-null     int64
 2   2019    42 non-null     int64
 3   2020    42 non-null     int64
 4   2021    42 non-null     int64
dtypes: int64(5)
memory usage: 2.0+ KB


### Bringing in dataset from previous analysis to extend the time period back to 2014

In [140]:
df_2014 = pd.read_csv("../../womens-pfa-analysis/data/interim/court-outcomes-by-PFA-2019-cleansed.csv")
df_2014.head()

Unnamed: 0,pfa,2014,2015,2016,2017,2018,2019,per_change_2014
0,Avon and Somerset,196,165,164,158,148,151,-0.229592
1,Bedfordshire,69,80,53,53,36,31,-0.550725
2,Cambridgeshire,91,89,112,115,116,89,-0.021978
3,Cheshire,169,181,167,172,176,149,-0.118343
4,Cleveland,91,78,108,152,140,98,0.076923


In [141]:
df_2014.drop(df_2014.iloc[:,4:], inplace=True, axis=1)

In [142]:
df_2014.head()

Unnamed: 0,pfa,2014,2015,2016
0,Avon and Somerset,196,165,164
1,Bedfordshire,69,80,53
2,Cambridgeshire,91,89,112
3,Cheshire,169,181,167
4,Cleveland,91,78,108


#### Checking for numer of observations

In [129]:
df_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pfa     42 non-null     object
 1   2014    42 non-null     int64 
 2   2015    42 non-null     int64 
 3   2016    42 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.4+ KB


#### Resetting index for df3 to release pfa column to merge on

In [143]:
df_2014.columns

Index(['pfa', '2014', '2015', '2016'], dtype='object')

In [144]:
df3.columns

Int64Index([2017, 2018, 2019, 2020, 2021], dtype='int64', name='year')

####  df3 column names are int rather than str, which has created an issue.

In [132]:
df3.reset_index(inplace=True)
df3.head()

year,pfa,2017,2018,2019,2020,2021
0,Avon and Somerset,158,148,151,103,103
1,Bedfordshire,53,36,31,23,20
2,Cambridgeshire,115,116,89,78,47
3,Cheshire,172,176,149,123,117
4,Cleveland,152,140,98,55,103


In [133]:
df_merged = pd.merge(df_2014, df3)
df_merged.head()

Unnamed: 0,pfa,2014,2015,2016,2017,2018,2019,2020,2021
0,Avon and Somerset,196,165,164,158,148,151,103,103
1,Bedfordshire,69,80,53,53,36,31,23,20
2,Cambridgeshire,91,89,112,115,116,89,78,47
3,Cheshire,169,181,167,172,176,149,123,117
4,Cleveland,91,78,108,152,140,98,55,103


#### Ensuring that merge has no null values

In [64]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 0 to 41
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pfa     42 non-null     object
 1   2014    42 non-null     int64 
 2   2015    42 non-null     int64 
 3   2016    42 non-null     int64 
 4   2017    42 non-null     int64 
 5   2018    42 non-null     int64 
 6   2019    42 non-null     int64 
 7   2020    42 non-null     int64 
 8   2021    42 non-null     int64 
dtypes: int64(8), object(1)
memory usage: 3.3+ KB


In [63]:
df_merged

Unnamed: 0,pfa,2014,2015,2016,2017,2018,2019,2020,2021
0,Avon and Somerset,196,165,164,158,148,151,103,103
1,Bedfordshire,69,80,53,53,36,31,23,20
2,Cambridgeshire,91,89,112,115,116,89,78,47
3,Cheshire,169,181,167,172,176,149,123,117
4,Cleveland,91,78,108,152,140,98,55,103
5,Cumbria,92,103,92,104,132,72,45,40
6,Derbyshire,171,179,176,174,178,123,130,126
7,Devon and Cornwall,116,126,120,147,120,106,106,86
8,Dorset,56,67,52,73,52,61,35,38
9,Durham,82,76,80,64,79,41,56,50


#### Calculating percentage change between 2014 and 2021 and dropping all NaN columns

In [65]:
df_merged.pct_change(axis='columns', periods=7)

TypeError: unsupported operand type(s) for /: 'int' and 'str'

#### Getting some weird errors here as the info suggests that all of the relevant values are int. Let's try and alternative calculation, just to see whether I can perform operations between columns.

In [66]:
df_merged['2021'] - df_merged['2020']

KeyError: '2021'

Strange, let's check the column names on the dataframe to make sure I'm not missing out a space.

In [94]:
df_merged.columns

Index(['pfa', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'], dtype='object')

#### Ahhhhh, looks as though there's some weirdness going on here. Let's try and convert them for consistency and see if that works.

In [96]:
df_merged.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41],
           dtype='int64')

In [97]:
df_merged.columns

Index(['pfa', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'], dtype='object')

In [99]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 0 to 41
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pfa     42 non-null     object
 1   2014    42 non-null     int64 
 2   2015    42 non-null     int64 
 3   2016    42 non-null     int64 
 4   2017    42 non-null     int64 
 5   2018    42 non-null     int64 
 6   2019    42 non-null     int64 
 7   2020    42 non-null     int64 
 8   2021    42 non-null     int64 
dtypes: int64(8), object(1)
memory usage: 3.3+ KB


#### Let's try and set PFA as the index to remove it from the other columns

In [100]:
df_merged = df_merged.set_index('pfa')

In [101]:
per_change = df_merged.pct_change(axis='columns', periods=7).dropna(axis='columns').copy()
per_change.head()

Unnamed: 0_level_0,2021
pfa,Unnamed: 1_level_1
Avon and Somerset,-0.47449
Bedfordshire,-0.710145
Cambridgeshire,-0.483516
Cheshire,-0.307692
Cleveland,0.131868


## Success!

Resetting the index earlier clearly didn't help. It assisted the merge, but then I had to reinstate it.

### Let's now add this new column back to df_merged

In [102]:
df4 = df_merged.join(per_change, rsuffix='_per_change')
df4.rename(columns={'2021_per_change':'per_change_2014'}, inplace = True)
df4.head()

Unnamed: 0_level_0,2014,2015,2016,2017,2018,2019,2020,2021,per_change_2014
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avon and Somerset,196,165,164,158,148,151,103,103,-0.47449
Bedfordshire,69,80,53,53,36,31,23,20,-0.710145
Cambridgeshire,91,89,112,115,116,89,78,47,-0.483516
Cheshire,169,181,167,172,176,149,123,117,-0.307692
Cleveland,91,78,108,152,140,98,55,103,0.131868


#### Saving merged and cleaned data to new csv

In [104]:
df4.to_csv('../data/processed/female_custodial_sentences_PFA.csv')