In [1]:
import src.data.utilities as utils
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [None]:
## DATASETS ##
# 1. Sentencing data 2017–21 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1076592/Data-behind-interactive-tools-3.zip)
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
df = utils.loadData("data/external/sentencing.csv", cols=cols)
df.head()

In [None]:
# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
df_2009 = utils.loadData('data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)
df_2009.head()

In [None]:
#Dropping duplicate data from 2009 dataset that also appears in df
filt = df_2009['Year of Appearance'] < 2017
df_2009 = df_2009[filt].copy()

In [None]:
df_list = utils.dataframeList(locals())

In [None]:
#Standardising variable names
for data in df_list:
    utils.lcColumns(data)
    utils.renameColumns(data, columns={
        'year_of_appearance': 'year',
        'offence_group': 'offence',
        'police_force_area': 'pfa',
        'sentence_outcome': 'outcome',
        'custodial_sentence_length': 'sentence_length',
        'sentenced': 'freq',
        'count': 'freq'}
        )
    # utils.orderColumns(data, column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_length', 'freq'])

In [None]:

def tidy_elements(data):
    regex = {r"^\S*: \S* - ": "",
             r"\d\d: ": "",
            "Total ": "",
            "(Over)": "More than",
            "( and including)": "",
            "(to less than)": "and under",
            "Life$": "Life sentence"
             }
    return data.map(regex)

In [None]:
df_combined = pd.concat(df_list)
tidy_elements(df_combined)

In [None]:
df_combined = pd.concat(df_list)
utils.tidy_elements(df_combined)

In [None]:
df_combined.dtypes

In [None]:
convert_dict = {'outcome': "category",
                'sentence_length': "category"
                }

In [None]:
df_combined = df_combined.astype(convert_dict)

In [None]:
df_combined.dtypes

In [34]:
df = pd.read_csv('data/interim/PFA_2009-21_women_cust_comm_sus.csv')

In [35]:
filt = df['outcome'] == 'Immediate custody'
pfa_custody_sentence_lengths = df[filt].copy()

In [36]:
pfa_custody_sentence_lengths

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
12,2009,Avon and Somerset,Female,Young adults,Theft Offences,Immediate custody,Up to 1 month,1
18,2009,Avon and Somerset,Female,Young adults,Violence against the person,Immediate custody,Imprisonment for public protection,1
27,2009,Avon and Somerset,Female,Adults,Miscellaneous crimes against society,Immediate custody,More than 12 months and up to 18 months,1
30,2009,Avon and Somerset,Female,Adults,Theft Offences,Immediate custody,Up to 1 month,3
32,2009,Avon and Somerset,Female,Adults,Theft Offences,Immediate custody,Up to 1 month,1
...,...,...,...,...,...,...,...,...
242475,2021,Wiltshire,Female,Adults,Theft Offences,Immediate custody,Up to 1 month,1
242476,2021,Wiltshire,Female,Adults,Drug offences,Immediate custody,More than 2 years and up to 3 years,1
242482,2021,Wiltshire,Female,Adults,Miscellaneous crimes against society,Immediate custody,More than 1 month and up to 2 months,1
242484,2021,Wiltshire,Female,Adults,Miscellaneous crimes against society,Immediate custody,More than 3 months and under 6 months,1


Defining sentence length categories

In [37]:
less_6months = ["Up to 1 month", 
                "More than 1 month and up to 2 months",
                "More than 2 months and up to 3 months",
                "More than 3 months and under 6 months"]

six_12_months = ["6 months",
                "More than 6 months and up to 9 months",
                "More than 9 months and under 12 months"]

In [53]:
def sentence_length_groups(sentence_len):
    if sentence_len in less_6months:
        return 'Less than 6 months'
    elif sentence_len in six_12_months:
        return '6 months and under 12 months'
    else:
        return 'Over 12 months'

In [56]:
def replace_sentence_lengths(x_df, fill_map):
    res=x_df.loc[:,'sentence_len'].map(fill_map)
    x_df.loc[:,'sentence_len']=res 
    
    return x_df

In [57]:
(
    df
    .query('outcome == @sentence_type')
    .pipe(replace_sentence_lengths, sentence_length_groups)
    
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_df.loc[:,'sentence_len']=res


Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
12,2009,Avon and Somerset,Female,Young adults,Theft Offences,Immediate custody,Less than 6 months,1
18,2009,Avon and Somerset,Female,Young adults,Violence against the person,Immediate custody,Over 12 months,1
27,2009,Avon and Somerset,Female,Adults,Miscellaneous crimes against society,Immediate custody,Over 12 months,1
30,2009,Avon and Somerset,Female,Adults,Theft Offences,Immediate custody,Less than 6 months,3
32,2009,Avon and Somerset,Female,Adults,Theft Offences,Immediate custody,Less than 6 months,1
...,...,...,...,...,...,...,...,...
242475,2021,Wiltshire,Female,Adults,Theft Offences,Immediate custody,Less than 6 months,1
242476,2021,Wiltshire,Female,Adults,Drug offences,Immediate custody,Over 12 months,1
242482,2021,Wiltshire,Female,Adults,Miscellaneous crimes against society,Immediate custody,Less than 6 months,1
242484,2021,Wiltshire,Female,Adults,Miscellaneous crimes against society,Immediate custody,Less than 6 months,1


In [46]:
#Filtering for custodial sentences and applying the map

pfa_custody_sentence_lengths = df.query('outcome == @sentence_type')
pfa_custody_sentence_lengths['sentence_len'] = pfa_custody_sentence_lengths['sentence_len'].map(sentence_length_groups)

#Grouping dataset
pfa_custody_sentence_lengths = pfa_custody_sentence_lengths.groupby(['pfa', 'year', 'sentence_len'], as_index=False)['freq'].sum()

#Outputting to CSV
pfa_custody_sentence_lengths.to_csv('data/interim/PFA_2009-21_women_cust_sentence_len.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfa_custody_sentence_lengths['sentence_len'] = pfa_custody_sentence_lengths['sentence_len'].map(sentence_length_groups)


In [None]:
df2 = pfa_custody_sentence_lengths.copy()

In [None]:
df2['sentence_len'] = df2['sentence_len'].map(sentence_length_groups)

In [None]:
df2['sentence_len'].value_counts()

In [19]:
#Importing cleansed dataset
df = pd.read_csv('data/interim/PFA_2009-21_women_cust_comm_sus.csv')

## 1.SENTENCING OUTCOME FOR EACH PFA BY YEAR

#Grouping dataset
pfa_sentencing_outcomes = df.groupby(['pfa', 'year', 'outcome'], as_index=False)['freq'].sum()

#Outputting to CSV
pfa_sentencing_outcomes.to_csv('data/processed/PFA_2009-21_women_sentencing_outcomes_FINAL.csv', index=False)


## 2.CUSTODIAL SENTENCE LENGTHS FOR EACH PFA BY YEAR
'''THIS PRODUCES THE DATA FOR FIGURE 1 IN THE PFA FACTSHEET'''

#Filtering cleansed dataset
filt = df['outcome'] == 'Immediate custody'
pfa_custody_sentence_lengths = df[filt].copy()

#Defining sentence_len categories
less_6months = ["Up to 1 month", 
                "More than 1 month and up to 2 months",
                "More than 2 months and up to 3 months",
                "More than 3 months and under 6 months"]

six_12_months = ["6 months",
                "More than 6 months and up to 9 months",
                "More than 9 months and under 12 months"]

#Mapping sentence_len categories
def sentence_length_groups(sentence_len):
    if sentence_len in less_6months:
        return 'Less than 6 months'
    elif sentence_len in six_12_months:
        return '6 months and under 12 months'
    else:
        return 'Over 12 months'
    
pfa_custody_sentence_lengths['sentence_len'] = pfa_custody_sentence_lengths['sentence_len'].map(sentence_length_groups)

#Grouping dataset
pfa_custody_sentence_lengths = pfa_custody_sentence_lengths.groupby(['pfa', 'year', 'sentence_len'], as_index=False)['freq'].sum()

#Outputting to CSV
# final_df.to_csv('data/interim/PFA_2009-21_women_cust_sentence_len_test.csv', index=False)

In [39]:
#By year
filt = pfa_custody_sentence_lengths['year'] >= 2014
pfa_df_2014 = pfa_custody_sentence_lengths[filt].copy()

#By sentences of less than six months
filt = pfa_df_2014['sentence_len'] == "Less than 6 months"
lt_6 = pfa_df_2014[filt].copy()

#By sentences of less than 12 months
filt = pfa_df_2014['sentence_len'] != "Over 12 months"
lt_12m = pfa_df_2014[filt].copy()

In [31]:
def aggregate_sentences(df):
    new_df = pd.crosstab(index=df['pfa'], columns=df['year'],
                        values=df['freq'], aggfunc='sum')
    
    new_df = new_df.fillna(0.0).astype(int)
    new_df['per_change_2014'] = new_df.pct_change(axis='columns', periods=7).dropna(axis='columns')
    return new_df

In [40]:
lt_12m.columns

Index(['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_len',
       'freq'],
      dtype='object')

In [41]:
aggregate_sentences(lt_12m)

year,2014,2015,2016,2017,2018,2019,2020,2021,per_change_2014
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avon and Somerset,196,165,164,158,148,151,103,103,-0.47449
Bedfordshire,69,80,53,53,36,31,23,20,-0.710145
Cambridgeshire,91,89,112,115,116,89,78,47,-0.483516
Cheshire,169,181,167,172,176,149,123,117,-0.307692
Cleveland,91,78,108,152,140,98,55,103,0.131868
Cumbria,92,103,92,104,132,72,45,40,-0.565217
Derbyshire,171,179,176,174,178,123,130,126,-0.263158
Devon and Cornwall,116,126,120,147,120,106,106,86,-0.258621
Dorset,56,67,52,73,52,61,35,38,-0.321429
Durham,82,76,80,64,79,41,56,50,-0.390244


In [4]:
df = pd.read_csv('data/interim/PFA_2009-21_women_cust_comm_sus.csv')

In [30]:
(
    df
    .groupby(['pfa', 'year', 'outcome'], as_index=False)['freq'].sum()
    .to_csv('data/processed/PFA_2009-21_women_sentencing_outcomes_TEST.csv', index=False)
)

In [5]:
sentence_type = 'Immediate custody'
year = 2021

In [28]:
def crosstab(index, columns, values=None, aggfunc=None):
    return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)

In [62]:
def csnap(df, fn=lambda x: x.shape, msg=None):
    """ Custom Help function to print things in method chaining.
        Returns back the df to further use in chaining.
    """
    if msg:
        print(msg)
    display(fn(df))
    return df

In [63]:
df.pipe(csnap)

(242500, 8)

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_len,freq
0,2009,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,,2
1,2009,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,,1
2,2009,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,,1
3,2009,Avon and Somerset,Female,Young adults,Public order offences,Community sentence,,1
4,2009,Avon and Somerset,Female,Young adults,Miscellaneous crimes against society,Community sentence,,1
...,...,...,...,...,...,...,...,...
242495,2021,Wiltshire,Female,Adults,Summary motoring,Community sentence,,1
242496,2021,Wiltshire,Female,Adults,Summary motoring,Community sentence,,1
242497,2021,Wiltshire,Female,Adults,Summary motoring,Community sentence,,1
242498,2021,Wiltshire,Female,Adults,Summary motoring,Suspended sentence,,1
