In [1]:
import pandas as pd

In [2]:
def loadData(path_to_data, cols=None):
    '''Load CSV dataset to Pandas from filepath

    Parameters
    ----------
    path_to_data: str
        filepath location of dataset
    
    cols: str or list of str, optional
        A single string or list of column names to be included

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes.
    '''
    return pd.read_csv(path_to_data, usecols=cols, low_memory=False)

In [5]:
## DATASETS ##
# 1. Sentencing data 2017–21 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1076592/Data-behind-interactive-tools-3.zip)
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
df = loadData("data/external/sentencing.csv", cols=cols)
df.head()

Unnamed: 0,Year,Offence group,Sex,Age group,Police Force Area,Sentence Outcome,Custodial Sentence Length,Sentenced
0,2017,01: Violence against the person,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,01: Violence against the person,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,01: Violence against the person,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [6]:
# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
df_2009 = loadData('data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)
df_2009.head()

Unnamed: 0,Police Force Area,Year of Appearance,Sex,Age Group,Offence Group,Outcome,Custodial Sentence Length,Count
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1


In [7]:
def lcColumns(data):
    data.columns = data.columns.str.lower().str.replace(' ', '_') 
    return data

In [8]:
def renameColumns(data, columns):
    '''
    Rename columns within a Pandas dataframe to standardised dictionary values
    
    Parameters
    ----------
    data: Pandas dataframe
    '''
    # data.columns = columns

    data.rename(
        columns = columns,
        inplace = True
    )

In [9]:
#Standardising variable names
for data in [df, df_2009]:
    lcColumns(data)
    renameColumns(data, columns={
        'year_of_appearance': 'year',
        'offence_group': 'offence',
        'police_force_area': 'pfa',
        'custodial_sentence_length': 'sentence_length',
        'sentenced': 'freq',
        'count': 'freq'}
        )

In [11]:
df.head()

Unnamed: 0,year,offence,sex,age_group,pfa,sentence_outcome,sentence_length,freq
0,2017,01: Violence against the person,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,01: Violence against the person,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,01: Violence against the person,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [17]:
df.dtypes

year                 int64
offence             object
sex                 object
age_group           object
pfa                 object
sentence_outcome    object
sentence_length     object
freq                 int64
dtype: object

In [29]:
def remove_num_prefix(data, item):
    if type(item) == object:
        return data[item].str.replace("^\S*: \S* - ","", regex=True)
    else:
        return data[item]

In [31]:
df2 = df.apply(data, remove_num_prefix)
df2.head()

ValueError: No axis named <function remove_num_prefix at 0x118c2e310> for object type DataFrame

In [55]:
df['sentence_length'] = df['sentence_length'].str.replace("^\S*: \S* - ","", regex=True)

In [56]:
df.head()

Unnamed: 0,year,offence,sex,age_group,pfa,sentence_outcome,sentence_length,freq
0,2017,01: Violence against the person,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,Life,1
1,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,Life,1
2,2017,01: Violence against the person,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,Life,1
3,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,Life,2
4,2017,01: Violence against the person,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,Life,2


In [50]:
df['sentence_length'] = df['sentence_length'].str.replace("^\S*: \S* - ","", regex=True)
df_2009['sentence_length'] = df_2009['sentence_length'].str.replace("\d\d: ","", regex=True)

In [57]:
#Dropping duplicate data from 2009 dataset that also appears in df
filt = df_2009['year'] < 2017
df_2009 = df_2009[filt].copy()

#Removing sentence length prefixes 
df['sentence_length'] = df['sentence_length'].str.replace("^\S*: \S* - ","", regex=True)
df_2009['sentence_length'] = df_2009['sentence_length'].str.replace("\d\d: ","", regex=True)

df_combined = pd.concat([df_2009, df])
df_combined

Unnamed: 0,pfa,year,sex,age_group,offence,outcome,sentence_length,freq,sentence_outcome
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1,
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1,
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1,
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1,
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1,
...,...,...,...,...,...,...,...,...,...
1741813,Cheshire,2021,02: Male,03: Adults,13: Not known,,,1,03: Fine
1741814,North Wales,2021,01: Female,03: Adults,13: Not known,,,1,03: Fine
1741815,Cheshire,2021,01: Female,03: Adults,13: Not known,,,1,03: Fine
1741816,North Wales,2021,01: Female,03: Adults,13: Not known,,Up to and including 1 month,1,07: Total Immediate custody


In [25]:
df_combined['sex'].unique()

array(['01: Male', '02: Female', '04: Not known',
       '03: Companies, public bodies etc.', '02: Male', '01: Female'],
      dtype=object)

In [27]:
df_combined['year'].value_counts()

2017    795159
2018    755520
2019    742150
2010    410133
2009    408004
2011    398612
2012    367171
2013    351272
2014    343860
2021    335337
2015    334371
2016    314714
2020    288370
Name: year, dtype: int64