# Development of data processing functions for use in final production script

## Importing libraries

In [2]:
import pandas as pd
import utilities as utils
from time import sleep
import numpy as np
import operator

config = utils.read_config()

## Loading data

In [None]:
def loadData(path=config['data']['intFilePath'], filename="PFA_2010-22_women_cust_comm_sus.csv") -> pd.DataFrame:
    """Read a CSV file into a Pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Directory prefix of the file, by default the "intFilePath" entry of the config file.
        It is joined to `filename` by plain concatenation; no separator is inserted.
    filename : str, optional
        Name of the CSV file to be loaded, by default "PFA_2010-22_women_cust_comm_sus.csv"

    Returns
    -------
    DataFrame
        Contents of the CSV file.
    """
    csv_location = "{}{}".format(path, filename)
    return pd.read_csv(csv_location)

In [None]:
# Load the dataset with the default arguments and inspect dtypes and memory usage.
df = loadData()
df.info()

## Converting to categories

In [None]:
def categoryColumns(df):
    # Exploratory version: convert low-cardinality object columns to category (in place)
    # and report the resulting dtypes / memory usage through df.info().
    object_columns = df.select_dtypes(include='object').columns
    eligible = [
        name for name in object_columns
        if len(df[name].value_counts()) / len(df) < 0.05
    ]
    for name in eligible:
        df[name] = df[name].astype('category')
    return df.info()

In [None]:
# Convert eligible object columns of df to category and print the df.info() summary.
categoryColumns(df)

That looks worth doing at this stage, as memory usage is around two-thirds lower, so let's rewrite the function to return the DataFrame itself rather than the output of `df.info()`.

In [3]:
def categoryColumns(df, threshold=0.05) -> pd.DataFrame:
    """Convert object columns to category data type if they meet the uniqueness ratio

    A column is converted when the number of distinct (non-null) values divided by
    the number of rows is below `threshold`.

    Parameters
    ----------
    df : DataFrame
        The conversion is applied to this DataFrame in place.
    threshold : float, optional
        Maximum ratio of unique values to rows for a column to be converted, by default 0.05

    Returns
    -------
    DataFrame
        The same DataFrame with object columns which meet criteria replaced with categories
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows means the ratio is undefined (division by zero); nothing to convert.
        return df

    for col in df.select_dtypes(include='object').columns:
        # nunique() ignores missing values, as value_counts() does by default.
        ratio = df[col].nunique() / total_rows
        if ratio < threshold:
            df[col] = df[col].astype('category')
    return df

We'll also rewrite `loadData` so that it returns the output of `categoryColumns()`; the memory saving will then cascade through the rest of the script.

In [114]:
def loadData(status='interim', filename='PFA_2010-22_women_cust_comm_sus.csv') -> pd.DataFrame:
    """Read a CSV file from the folder matching `status` and pass it through `categoryColumns()`

    Parameters
    ----------
    status : {'raw', 'interim', 'processed'}, default is 'interim'
        Stage of the data processing, which selects the folder read from the config file.
        * 'raw' uses the "rawFilePath" entry
        * 'interim' uses the "intFilePath" entry
        * 'processed' uses the "clnFilePath" entry
    filename : str, default is 'PFA_2010-22_women_cust_comm_sus.csv'
        Name of the CSV file, including its extension.

    Returns
    -------
    DataFrame
        The CSV data after `categoryColumns()` has converted any eligible object columns
        to category columns.
    """
    config_keys = {
        "raw": 'rawFilePath',
        "interim": 'intFilePath',
        "processed": 'clnFilePath'
    }

    # Folder and filename are concatenated as-is; no separator is inserted.
    csv_location = "{}{}".format(config['data'][config_keys[status]], filename)
    loaded = pd.read_csv(csv_location)
    print('Data loaded')
    return categoryColumns(loaded)

In [115]:
# Reload through the revised loadData() and confirm the category dtypes and memory usage.
df = loadData()
df.info()

Data loaded
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243344 entries, 0 to 243343
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   year             243344 non-null  int64   
 1   pfa              243344 non-null  category
 2   sex              243344 non-null  category
 3   age_group        243344 non-null  category
 4   offence          243344 non-null  category
 5   outcome          243344 non-null  category
 6   sentence_length  243344 non-null  category
 7   freq             243344 non-null  int64   
dtypes: category(6), int64(2)
memory usage: 5.1 MB


## Saving data

In [6]:
def saveData(df, status, filename, index=True):
    """Write a DataFrame to CSV in the folder matching `status`

    Parameters
    ----------
    df : DataFrame
        Data to be written.

    status : {'interim', 'processed'}
        Stage of the data processing, which selects the output folder from the config file.
        * 'interim' writes to the "intFilePath" entry
        * 'processed' writes to the "clnFilePath" entry

    filename : str
        Name of the output file without extension; ".csv" is appended.

    index : bool
        Whether to write the DataFrame index to the file, by default True

    Returns
    -------
    DataFrame
        The DataFrame that was passed in, so the call can sit inside a `.pipe()` chain.
    """
    folder_keys = {
        "interim": 'intFilePath',
        "processed": 'clnFilePath',
    }

    target = "{}{}.csv".format(config['data'][folder_keys[status]], filename)
    df.to_csv(target, index=index)
    print(f"{filename} saved")
    return df

## 1.SENTENCING OUTCOME FOR EACH PFA BY YEAR

In [7]:
def groupAndSum(df, columns, sum_column=['freq']):
    """Group a DataFrame and total the selected column(s) within each group

    Parameters
    ----------
    df : DataFrame
        Data to be grouped.
    columns : label or list
        Column name(s) passed to `groupby()`; they are kept as regular columns in the result.
    sum_column : label or list, optional
        Column name(s) to total with `sum()`, by default ['freq']

    Returns
    -------
    DataFrame
        One row per group of `columns`, with the sum of `sum_column` for that group.
    """
    grouped = df.groupby(columns, as_index=False)
    selected = grouped[sum_column]
    return selected.sum()

In [37]:
def sentencesByPFA(df, filename="sentencesByPFA"):
    """Pipeline step: total sentencing outcomes by Police Force Area and year, and save them as CSV

    Parameters
    ----------
    df : DataFrame
        Data with 'pfa', 'year' and 'outcome' columns (plus the column summed by `groupAndSum`).

    filename : str, optional
        Name of the CSV export, by default "sentencesByPFA"

    Returns
    -------
    DataFrame
        The DataFrame that was passed in. The grouped totals are only written to the
        'processed' folder; they are not returned.
    """
    working_copy = df.copy()

    outcome_totals = groupAndSum(working_copy, columns=['pfa', 'year', 'outcome'])
    saveData(outcome_totals, status='processed', filename=filename, index=False)

    # Hand the input back so the next pipeline step starts from the full dataset.
    return df

Testing

In [28]:
# Smoke test: reload the data and run the sentencesByPFA step, which saves its CSV via saveData().
df=loadData()
print('Data loaded')
sentencesByPFA(df)

Data loaded
sentencesByPFA saved


## 2.CUSTODIAL SENTENCES FOR EACH PFA BY OFFENCE TYPE

In [44]:
def filterSentence(df, sentence_type='Immediate custody', column='outcome') -> pd.DataFrame:
    """DataFrame filter allowing selection of subset of data by sentence type

    Parameters
    ----------
    df : DataFrame

    sentence_type : single label or list-like, optional
        select from the available sentence types within the DataFrame—['Community sentence', 'Immediate custody', 'Suspended sentence'], by default 'Immediate custody'.
        Any list-like (list, tuple, set, Series, ...) selects rows matching any of its values.

    column : str, optional
        column name of DataFrame with sentence outcome values, by default 'outcome'

    Returns
    -------
    DataFrame
        A filtered copy of the DataFrame containing only the chosen sentence type(s)
    """
    # is_list_like() is False for strings, so a single label takes the equality branch.
    if pd.api.types.is_list_like(sentence_type):
        mask = df[column].isin(sentence_type)
    else:
        mask = df[column] == sentence_type

    # Copy so that later steps can modify the result without touching `df`.
    return df.loc[mask].copy()

In [47]:
def filterYear(df, year=None, op="eq", column='year') -> pd.DataFrame:
    """DataFrame filter allowing selection of subset of data by year using comparison operators from operator library
    Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`.

    Parameters
    ----------
    df : DataFrame

    year : int, optional
        target year, by default None, which means the latest year in the DataFrame
        being filtered (`df[column].max()`)
    op : {eq, ne, gt, ge, lt, le}, optional
        comparison operator, by default "eq"
        lt is equivalent to a < b,
        le is equivalent to a <= b,
        eq is equivalent to a == b,
        ne is equivalent to a != b,
        gt is equivalent to a > b and
        ge is equivalent to a >= b.
    column : str, optional
        column name of DataFrame with year values, by default 'year'

    Returns
    -------
    DataFrame
        A filtered DataFrame displaying the records for a chosen year or period
    """
    methods = {
        "eq": operator.eq,
        "ne": operator.ne,
        "lt": operator.lt,
        "gt": operator.gt,
        "le": operator.le,
        "ge": operator.ge,
    }

    # Resolve the default from the frame actually passed in. A default written in the
    # signature would be evaluated once, at definition time, against the global `df`.
    if year is None:
        year = df[column].max()

    mask = methods[op](df[column], year)
    return df[mask]

In [32]:
# Check filterYear with "gt": keep only rows whose year is greater than 2020.
my_df = df.copy()
filterYear(my_df, 2020, op="gt")

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_length,freq
211073,2021,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,Not known,4
211074,2021,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,Not known,2
211075,2021,Avon and Somerset,Female,Young adults,Drug offences,Community sentence,Not known,1
211076,2021,Avon and Somerset,Female,Adults,Drug offences,Immediate custody,More than 2 years and up to and including 3 years,1
211077,2021,Avon and Somerset,Female,Adults,Violence against the person,Community sentence,Not known,1
...,...,...,...,...,...,...,...,...
243339,2022,Wiltshire,Female,Adults,Fraud Offences,Suspended sentence,Not known,1
243340,2022,Wiltshire,Female,Adults,Summary non-motoring,Community sentence,Not known,1
243341,2022,Wiltshire,Female,Adults,Summary motoring,Community sentence,Not known,1
243342,2022,Wiltshire,Female,Adults,Miscellaneous crimes against society,Community sentence,Not known,1


In [13]:
# Check filterSentence with its default sentence type ('Immediate custody').
my_df = df.copy()
filterSentence(my_df)

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_length,freq
2,2010,Avon and Somerset,Female,Young adults,Violence against the person,Immediate custody,Life sentence,1
10,2010,Avon and Somerset,Female,Young adults,Drug offences,Immediate custody,More than 6 months and up to and including 9 m...,1
14,2010,Avon and Somerset,Female,Young adults,Miscellaneous crimes against society,Immediate custody,More than 12 months and up to and including 18...,1
20,2010,Avon and Somerset,Female,Young adults,Summary non-motoring,Immediate custody,More than 3 months and up to 6 months,1
27,2010,Avon and Somerset,Female,Adults,Theft offences,Immediate custody,Up to and including 1 month,1
...,...,...,...,...,...,...,...,...
243319,2022,Wiltshire,Female,Adults,Drug offences,Immediate custody,More than 2 years and up to and including 3 years,2
243320,2022,Wiltshire,Female,Adults,Possession of weapons,Immediate custody,6 months,1
243322,2022,Wiltshire,Female,Adults,Theft offences,Immediate custody,6 months,1
243324,2022,Wiltshire,Female,Adults,Theft offences,Immediate custody,Up to and including 1 month,1


In [45]:
# Check filterSentence with a list of sentence types (uses my_df from the cell above).
filterSentence(my_df, ["Suspended sentence", "Community sentence"])

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_length,freq
0,2010,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,Not known,1
1,2010,Avon and Somerset,Female,Young adults,Drug offences,Community sentence,Not known,1
3,2010,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,Not known,1
4,2010,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,Not known,1
5,2010,Avon and Somerset,Female,Young adults,Theft offences,Community sentence,Not known,1
...,...,...,...,...,...,...,...,...
243339,2022,Wiltshire,Female,Adults,Fraud Offences,Suspended sentence,Not known,1
243340,2022,Wiltshire,Female,Adults,Summary non-motoring,Community sentence,Not known,1
243341,2022,Wiltshire,Female,Adults,Summary motoring,Community sentence,Not known,1
243342,2022,Wiltshire,Female,Adults,Miscellaneous crimes against society,Community sentence,Not known,1


In [31]:
# Check that the two filters chain with .pipe(): year >= 2020, then the default sentence type.
my_df = df.copy()
test_df = (
    my_df
    .pipe(filterYear, 2020, op="ge")
    .pipe(filterSentence)
    )
test_df

Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_length,freq
196615,2020,Avon and Somerset,Female,Young adults,Drug offences,Immediate custody,More than 2 years and up to and including 3 years,1
196617,2020,Avon and Somerset,Female,Young adults,Miscellaneous crimes against society,Immediate custody,More than 2 months and up to and including 3 m...,1
196626,2020,Avon and Somerset,Female,Adults,Violence against the person,Immediate custody,More than 1 month and up to and including 2 mo...,1
196631,2020,Avon and Somerset,Female,Adults,Summary non-motoring,Immediate custody,More than 1 month and up to and including 2 mo...,1
196632,2020,Avon and Somerset,Female,Adults,Summary non-motoring,Immediate custody,More than 2 months and up to and including 3 m...,1
...,...,...,...,...,...,...,...,...
243319,2022,Wiltshire,Female,Adults,Drug offences,Immediate custody,More than 2 years and up to and including 3 years,2
243320,2022,Wiltshire,Female,Adults,Possession of weapons,Immediate custody,6 months,1
243322,2022,Wiltshire,Female,Adults,Theft offences,Immediate custody,6 months,1
243324,2022,Wiltshire,Female,Adults,Theft offences,Immediate custody,Up to and including 1 month,1


In [16]:
def offenceProportions(df) -> pd.DataFrame:
    """Calculate proportions of each offence type for each Police Force Area

    Parameters
    ----------
    df : DataFrame
        Data with 'pfa', 'offence' and 'freq' columns.

    Returns
    -------
    DataFrame
        A cross-tabulated DataFrame (rows: 'pfa', columns: 'offence') of summed 'freq'
        values, normalised so that each Police Force Area row sums to 1 and rounded to
        three decimal places
    """
    return pd.crosstab(
        index=df['pfa'],
        columns=df['offence'],
        values=df['freq'],
        # The string name is used because passing the builtin `sum` is deprecated in pandas.
        aggfunc="sum",
        normalize='index',
    ).round(3)

In [90]:
def custodialSentencesByOffence(df, filename=None):
    """Data processing pipeline to produce interim dataset of offence types which received a custodial sentence, by Police Force Area

    Parameters
    ----------
    df : DataFrame
        Data with 'year', 'pfa', 'offence', 'outcome' and 'freq' columns.

    filename : str, optional
        filename parameter for final csv export, by default None, which resolves to
        f"custodial_sentences_by_offence_{latest year in df}"

    Returns
    -------
    pd.DataFrame
        Returns the original DataFrame, but saves a CSV of offence-type proportions for
        custodial sentences, by Police Force Area, for the latest year available in `df`
    """
    # Take the latest year from the frame passed in, so the year filter and the default
    # filename always agree, rather than using a hard-coded year or a value frozen from
    # a global DataFrame when the function was defined.
    latest_year = df['year'].max()
    if filename is None:
        filename = f"custodial_sentences_by_offence_{latest_year}"

    # None of the steps below modifies its input, so no defensive copy of `df` is needed.
    (df
    .pipe(filterYear, latest_year)
    .pipe(filterSentence)
    .pipe(groupAndSum, columns=['pfa', 'offence'])
    .pipe(offenceProportions)
    .pipe(saveData, status='processed', filename=filename)
    )

    #Returns original DataFrame to allow for continued processing through the pipeline.
    return df

Testing

In [39]:
# Smoke test: reload the data and run the custodialSentencesByOffence step, which saves its CSV.
df=loadData()
print('Data loaded')
custodialSentencesByOffence(df)

Data loaded
custodial_sentences_by_offence_2022 saved


Unnamed: 0,year,pfa,sex,age_group,offence,outcome,sentence_length,freq
0,2010,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,Not known,1
1,2010,Avon and Somerset,Female,Young adults,Drug offences,Community sentence,Not known,1
2,2010,Avon and Somerset,Female,Young adults,Violence against the person,Immediate custody,Life sentence,1
3,2010,Avon and Somerset,Female,Young adults,Violence against the person,Community sentence,Not known,1
4,2010,Avon and Somerset,Female,Young adults,Violence against the person,Suspended sentence,Not known,1
...,...,...,...,...,...,...,...,...
243339,2022,Wiltshire,Female,Adults,Fraud Offences,Suspended sentence,Not known,1
243340,2022,Wiltshire,Female,Adults,Summary non-motoring,Community sentence,Not known,1
243341,2022,Wiltshire,Female,Adults,Summary motoring,Community sentence,Not known,1
243342,2022,Wiltshire,Female,Adults,Miscellaneous crimes against society,Community sentence,Not known,1


## 3.CUSTODIAL SENTENCE LENGTHS FOR EACH PFA BY YEAR
THIS PRODUCES THE DATA FOR FIGURE 1 IN THE PFA FACTSHEET

In [68]:
## NOTE: I HAVE AMENDED THE BIN CATEGORIES HERE FOR 6–LESS THAN 12 MONTHS, AND 12+ MONTHS. MAY CAUSE ISSUE IN LATER STAGES

def consolidateSentenceLengths(df) -> pd.DataFrame:
    """Bin sentence lengths into three new distinct categories:
        * Less than 6 months;
        * 6 months to less than 12 months
        * 12 months or more

        12 months or more is the default value if it is not found in dict_map.

    The input DataFrame is left untouched. Values that are already one of the three
    categories are kept as they are, so applying the function twice gives the same
    result as applying it once.

    Parameters
    ----------
    df : DataFrame
        Data with a 'sentence_length' column.

    Returns
    -------
    DataFrame
        A new DataFrame with three distinct custodial sentence length categories based on values in `dict_map`
    """
    under_6m = 'Less than 6 months'
    under_12m = '6 months to less than 12 months'
    over_12m = '12 months or more'

    dict_map = {"Up to and including 1 month": under_6m,
                "More than 1 month and up to and including 2 months": under_6m,
                "More than 2 months and up to and including 3 months": under_6m,
                "More than 3 months and up to 6 months": under_6m,
                "6 months": under_12m,
                "More than 6 months and up to and including 9 months": under_12m,
                "More than 9 months and up to 12 months": under_12m,
                # Identity entries: without them a second pass would send the two shorter
                # categories to the "12 months or more" fallback.
                under_6m: under_6m,
                under_12m: under_12m,
                over_12m: over_12m,
                }

    # assign() returns a new frame instead of overwriting the caller's column.
    return df.assign(
        sentence_length=df['sentence_length'].map(lambda x: dict_map.get(x, over_12m))
    )

In [42]:
def custodialSentenceLengths(df, filename=None) -> pd.DataFrame:
    """Data processing pipeline to produce interim dataset of grouped custodial sentence lengths, by Police Force Area

    Parameters
    ----------
    df : DataFrame
        Data with 'year', 'pfa', 'outcome', 'sentence_length' and 'freq' columns.
    filename : str, optional
        filename parameter for the csv export, by default None, which resolves to
        f"women_cust_sentence_length_PFA_{earliest year in df}-{latest year in df}"

    Returns
    -------
    pd.DataFrame
        Produces and saves CSV of a processed DataFrame containing grouped custodial sentence lengths, by Police Force Area
    """
    # Build the default name from the frame passed in. A default written in the signature
    # would be evaluated once, at definition time, against the global `df`.
    if filename is None:
        filename = f"women_cust_sentence_length_PFA_{df['year'].min()}-{df['year'].max()}"

    # filterSentence() hands back a copy, so the caller's `df` is not modified downstream.
    df_processed = (
        df
        .pipe(filterSentence)
        .pipe(consolidateSentenceLengths)
        .pipe(groupAndSum, columns=['pfa', 'year', 'sentence_length'])
        # TODO: query whether "interim" is the right status, given this file is used to produce figure 1
        .pipe(saveData, status="interim", filename=filename, index=False)
    )
    #Returning processed version of DataFrame in order to allow for further filtering by year and sentence length
    return df_processed

In [70]:
# Intermediate check: apply the custody filter and the sentence-length binning only.
my_df = df.copy()
    
df_processed =(
    my_df
    .pipe(filterSentence)
    .pipe(consolidateSentenceLengths)
)

In [71]:
# Inspect dtypes after binning.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68549 entries, 2 to 243331
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   year             68549 non-null  int64   
 1   pfa              68549 non-null  category
 2   sex              68549 non-null  category
 3   age_group        68549 non-null  category
 4   offence          68549 non-null  category
 5   outcome          68549 non-null  category
 6   sentence_length  68549 non-null  object  
 7   freq             68549 non-null  int64   
dtypes: category(5), int64(2), object(1)
memory usage: 2.4+ MB


In [72]:
# Smoke test: reload the data, run the full custodialSentenceLengths step and keep its result.
df=loadData()
print('Data loaded')
df_custodialSentences_PFA = custodialSentenceLengths(df)
df_custodialSentences_PFA

Data loaded
women_cust_sentence_length_PFA_2010-2022 saved


Unnamed: 0,pfa,year,sentence_length,freq
0,Avon and Somerset,2010,12 months or more,44
1,Avon and Somerset,2010,6 months to less than 12 months,16
2,Avon and Somerset,2010,Less than 6 months,113
3,Avon and Somerset,2011,12 months or more,43
4,Avon and Somerset,2011,6 months to less than 12 months,21
...,...,...,...,...
1633,Wiltshire,2021,6 months to less than 12 months,3
1634,Wiltshire,2021,Less than 6 months,15
1635,Wiltshire,2022,12 months or more,7
1636,Wiltshire,2022,6 months to less than 12 months,5


## 4.CUSTODIAL SENTENCES FOR EACH PFA BY YEAR

THIS PRODUCES THREE DATASETS: 
* TOTAL NUMBER OF WOMEN SENTENCED TO CUSTODY BY PFA; AND OF THOSE 
  * SENTENCED TO LESS THAN SIX MONTHS; AND
  * SENTENCED TO LESS THAN 12 MONTHS

In [None]:
# Display the grouped sentence lengths returned by custodialSentenceLengths(df) in section 3.
df_custodialSentences_PFA

The following step is the starting point for producing all three of the final datasets. It takes the DataFrame produced by `custodialSentenceLengths(df)` and filters it by year.

In [73]:
# Keep rows from 2014 onwards.
filterYear(df_custodialSentences_PFA, 2014, op="ge")

Unnamed: 0,pfa,year,sentence_length,freq
12,Avon and Somerset,2014,12 months or more,38
13,Avon and Somerset,2014,6 months to less than 12 months,14
14,Avon and Somerset,2014,Less than 6 months,144
15,Avon and Somerset,2015,12 months or more,31
16,Avon and Somerset,2015,6 months to less than 12 months,8
...,...,...,...,...
1633,Wiltshire,2021,6 months to less than 12 months,3
1634,Wiltshire,2021,Less than 6 months,15
1635,Wiltshire,2022,12 months or more,7
1636,Wiltshire,2022,6 months to less than 12 months,5


In [78]:
def filterSentenceLength(df, sentence_length, column='sentence_length') -> pd.DataFrame:
    """DataFrame filter allowing selection of subset of data by custodial sentence length

    Parameters
    ----------
    df : DataFrame

    sentence_length : str or list-like
        {"Less than 6 months", "6 months to less than 12 months", "12 months or more"}
        Any list-like (list, tuple, set, Series, ...) selects rows matching any of its values.

    column : str, optional
        column name of DataFrame with custodial sentence length values, by default 'sentence_length'

    Returns
    -------
    DataFrame
        A filtered DataFrame displaying the chosen custodial sentence length
    """
    # is_list_like() is False for strings, so a single label takes the equality branch.
    if pd.api.types.is_list_like(sentence_length):
        mask = df[column].isin(sentence_length)
    else:
        mask = df[column] == sentence_length

    return df.loc[mask]

In [76]:
def aggregateSentences(df) -> pd.DataFrame:
    """Calculate total number of custodial sentences of a given length in each year, by Police Force Area

    Parameters
    ----------
    df : DataFrame
        Data with 'pfa', 'year' and 'freq' columns.
        Ensure that the DataFrame being passed to this function contains the correct sentence length(s)

    Returns
    -------
    DataFrame
        A cross-tabulated DataFrame (rows: 'pfa', columns: 'year') of the total number of
        custodial sentences in each year, by Police Force Area
    """
    return pd.crosstab(
        index=df['pfa'],
        columns=df['year'],
        values=df['freq'],
        # The string name is used because passing the builtin `sum` is deprecated in pandas.
        aggfunc="sum",
    )

In [84]:
def percentageChange(df, periods=None) -> pd.DataFrame:
    """Function to calculate percentage change between the first and last year in the DataFrame.

    The change is `last / baseline - 1`, where `last` is the final column and `baseline`
    is the column `periods` positions before it. With the default, the baseline is the
    first column. A missing or zero baseline is not special-cased.

    Parameters
    ----------
    df : DataFrame
        Wide table with one numeric column per year, ordered from earliest to latest.

    periods : int, optional
        Number of columns between the baseline and the last column, by default None,
        which spans the whole table (`number of columns - 1`; 8 for a 2014–2022 table)

    Returns
    -------
    DataFrame
        A copy of the DataFrame with an additional column, named
        f'per_change_{baseline column label}', showing the change as a float fraction
        (e.g. -0.25 for a 25% fall)

    Raises
    ------
    ValueError
        If `periods` is not between 1 and `number of columns - 1`.
    """
    n_columns = df.shape[1]
    if periods is None:
        periods = n_columns - 1
    if not 0 < periods < n_columns:
        raise ValueError(
            f"periods must be between 1 and {n_columns - 1} for a table with {n_columns} columns, got {periods}"
        )

    baseline_label = df.columns[-1 - periods]
    baseline = df.iloc[:, -1 - periods]
    latest = df.iloc[:, -1]

    # Work on a copy so that the table passed in does not silently gain a column.
    result = df.copy()
    result[f'per_change_{baseline_label}'] = latest / baseline - 1
    return result

In [86]:
# Trial run of the table-building chain on all custodial sentence lengths from 2014 onwards.
(df_custodialSentences_PFA
.pipe(filterYear, 2014, op="ge") #All custodial sentence lengths
# .pipe(filterSentenceLength, ["Less than 6 months", "6 months to less than 12 months"]) #Less than 12 months
.pipe(aggregateSentences)
.pipe(percentageChange)
)

year,2014,2015,2016,2017,2018,2019,2020,2021,2022,per_change_2014
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Avon and Somerset,196,165,164,158,148,151,103,103,116,-0.408163
Bedfordshire,69,80,53,53,36,31,23,20,38,-0.449275
Cambridgeshire,91,89,112,115,116,89,78,47,68,-0.252747
Cheshire,169,181,167,172,176,149,123,117,74,-0.56213
Cleveland,91,78,108,152,140,98,55,103,100,0.098901
Cumbria,92,103,92,104,132,72,45,40,29,-0.684783
Derbyshire,171,179,176,174,178,123,130,126,122,-0.28655
Devon and Cornwall,116,126,120,148,120,106,106,86,63,-0.456897
Dorset,56,67,52,73,52,61,35,38,23,-0.589286
Durham,82,76,80,64,79,41,56,50,57,-0.304878


In [97]:
def custodialSentenceTableProcessing(df, filename):
    """Build and save the table of custodial sentences per year by Police Force Area, with percentage change

    The table is produced by `aggregateSentences()` followed by `percentageChange()` and
    written to the 'processed' folder. Nothing is returned.

    Parameters
    ----------
    df : DataFrame
        Data already filtered to the years and sentence lengths wanted in the table.

    filename : str
        Base name of the CSV export; "_TEST" is appended before saving.
    """
    yearly_totals = aggregateSentences(df)
    totals_with_change = percentageChange(yearly_totals)
    saveData(totals_with_change, status='processed', filename=f'{filename}_TEST')

In [109]:
def custodialSentenceTableOutput(df):
    """Write the three final custodial sentence tables, from 2014 onwards, in csv format:
    * Total number of women sentenced to custody and percentage change by Police Force Area; and of those
        * Sentenced to less than six months; and
        * Sentenced to less than 12 months.

    Parameters
    ----------
    df : DataFrame
        Grouped sentence lengths with 'pfa', 'year', 'sentence_length' and 'freq' columns.

    Returns
    -------
    str
        Returns message if processing has been successful
    """
    from_2014 = filterYear(df, 2014, op="ge")

    # Output name -> sentence length(s) to keep; an empty string means "no length filter".
    tables = {
        'cust_sentences_total': "",
        'cust_sentences_lt_12m': ["Less than 6 months", "6 months to less than 12 months"],
        'cust_sentences_lt_6m': "Less than 6 months",
    }

    for table_name, lengths in tables.items():
        if lengths == "":
            subset = from_2014
        else:
            subset = filterSentenceLength(from_2014, sentence_length=lengths)
        custodialSentenceTableProcessing(subset, filename=f'{table_name}')

    return "Processing complete"

In [110]:
# Smoke test: reload the data, build the grouped sentence lengths, then write the three final tables.
df=loadData()
print('Data loaded')
(df
.pipe(custodialSentenceLengths)
.pipe(custodialSentenceTableOutput)
)

Data loaded
women_cust_sentence_length_PFA_2010-2022 saved
cust_sentences_total_TEST saved
cust_sentences_lt_12m_TEST saved
cust_sentences_lt_6m_TEST saved


'Processing complete'

In [None]:
# NOTE(review): this cell repeats the earlier trial run of the same chain; consider removing one of them.
(df_custodialSentences_PFA
.pipe(filterYear, 2014, op="ge") #All custodial sentence lengths
# .pipe(filterSentenceLength, ["Less than 6 months", "6 months to less than 12 months"]) #Less than 12 months
.pipe(aggregateSentences)
.pipe(percentageChange)
)

year,2014,2015,2016,2017,2018,2019,2020,2021,2022,per_change_2014
pfa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Avon and Somerset,196,165,164,158,148,151,103,103,116,-0.408163
Bedfordshire,69,80,53,53,36,31,23,20,38,-0.449275
Cambridgeshire,91,89,112,115,116,89,78,47,68,-0.252747
Cheshire,169,181,167,172,176,149,123,117,74,-0.56213
Cleveland,91,78,108,152,140,98,55,103,100,0.098901
Cumbria,92,103,92,104,132,72,45,40,29,-0.684783
Derbyshire,171,179,176,174,178,123,130,126,122,-0.28655
Devon and Cornwall,116,126,120,148,120,106,106,86,63,-0.456897
Dorset,56,67,52,73,52,61,35,38,23,-0.589286
Durham,82,76,80,64,79,41,56,50,57,-0.304878


## Pipeline testing

In [116]:
# End-to-end run: the first two steps return the original DataFrame, so each later step
# starts from the full dataset; custodialSentenceLengths passes its grouped result on.
df=loadData()
(df
.pipe(sentencesByPFA)
.pipe(custodialSentencesByOffence)
.pipe(custodialSentenceLengths)
.pipe(custodialSentenceTableOutput)
)

Data loaded
sentencesByPFA saved
custodial_sentences_by_offence_2022 saved
women_cust_sentence_length_PFA_2010-2022 saved
cust_sentences_total_TEST saved
cust_sentences_lt_12m_TEST saved
cust_sentences_lt_6m_TEST saved


'Processing complete'