In [1]:
import pandas as pd

In [2]:
def loadData(path_to_data, cols=None):
    '''Load CSV dataset to Pandas from filepath

    Parameters
    ----------
    path_to_data: str
        filepath location of dataset
    
    cols: str or list of str, optional
        A single string or list of column names to be included

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes.
    '''
    return pd.read_csv(path_to_data, usecols=cols, low_memory=False)

In [3]:
## DATASETS ##
# 1. Sentencing data 2017–21 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1076592/Data-behind-interactive-tools-3.zip)
cols = ['Police Force Area', 'Year', 'Sex', 'Age group', 'Offence group', 'Sentence Outcome', 'Custodial Sentence Length','Sentenced']
df = loadData("data/external/sentencing.csv", cols=cols)
df.head()

Unnamed: 0,Year,Offence group,Sex,Age group,Police Force Area,Sentence Outcome,Custodial Sentence Length,Sentenced
0,2017,01: Violence against the person,02: Male,02: Young adults,Greater Manchester,07: Total Immediate custody,23: Custody - Life,1
1,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,1
2,2017,01: Violence against the person,02: Male,03: Adults,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,1
3,2017,01: Violence against the person,02: Male,03: Adults,West Yorkshire,07: Total Immediate custody,23: Custody - Life,2
4,2017,01: Violence against the person,02: Male,01: Children,Metropolitan Police,07: Total Immediate custody,23: Custody - Life,2


In [4]:
# 2. Court outcomes by police force area 2009–2019 (from: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/888561/csvs-behind-data-tools-2-2019.zip)
cols_2009 =['Police Force Area', 'Year of Appearance', 'Sex', 'Age Group', 'Offence Group', 'Outcome', 'Custodial Sentence Length','Count'] 
df_2009 = loadData('data/external/court-outcomes-by-PFA-2019.csv', cols=cols_2009)
df_2009.head()

Unnamed: 0,Police Force Area,Year of Appearance,Sex,Age Group,Offence Group,Outcome,Custodial Sentence Length,Count
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1


In [7]:
def clean_header(data):
    """
    This functions removes weird characters and spaces from column names, while keeping everything lower case
    """
    data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [8]:
clean_header(df)

  data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')


In [9]:
df.columns

Index(['year', 'offence_group', 'sex', 'age_group', 'police_force_area',
       'sentence_outcome', 'custodial_sentence_length', 'sentenced'],
      dtype='object')

In [5]:
def lcColumns(data):
    return data.columns.str.lower()

In [6]:
def renameColumns(data, columns):
    '''
    Rename columns within a Pandas dataframe to standardised dictionary values
    
    Parameters
    ----------
    data: Pandas dataframe
    '''
    data.rename(
        columns = columns,
        inplace = True
    )

In [26]:
def dropYearsAfter(data, drop_year, year_column='year'):
    """Drop records in a DataFrame with a year value greater than or equal to `drop_year`

    Parameters
    ----------
    data : Pandas DataFrame
        DataFrame to perform the transformation on
    drop_year : int
        Starting year that records should be removed from `data` DataFrame
    """    
    filt = data[year_column] < drop_year
    return data[filt]


In [19]:
df_2009 = dropYearsAfter(df_2009, 2017, "Year of Appearance")

for data in [df, df_2009]:
    lcColumns(data)
    renameColumns(data, columns={
        'year of appearance': 'year',
        'offence group': 'offence',
        'age group': 'age_group',
        'police force area': 'pfa',
        'sentence outcome': 'outcome',
        'custodial sentence length': 'sentence_len',
        'sentenced': 'freq',
        'count': 'freq'}
        )
    



In [21]:
df_2009.head()

Unnamed: 0,pfa,year,sex,age_group,offence,outcome,sentence_len,freq
0,Avon and Somerset,2009,01: Male,01: Children,04: Theft Offences,13: Community sentence,,1
1,Avon and Somerset,2009,01: Male,01: Children,03: Robbery,13: Community sentence,,1
2,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
3,Avon and Somerset,2009,01: Male,01: Children,11: Summary non-motoring,13: Community sentence,,1
4,Avon and Somerset,2009,01: Male,01: Children,01: Violence against the person,08: Committed for trial,,1


Column ordering

In [None]:
df_column_order = ['year', 'pfa', 'sex', 'age_group', 'offence', 'outcome', 'sentence_len', 'freq']
df = df[df_column_order]
df_2009 = df_2009[df_column_order]