# Women's imprisonment rates
## ONS mid-year population estimates: Data exploration

In [12]:
import pandas as pd
import src.utilities as utils

In [2]:
df = utils.load_data(status='raw', filename='mid-2022-england-wales_v3.csv')
df.head()

2025-06-25 12:07:58,540 - INFO - Loaded data from data/raw/mid-2022-england-wales_v3.csv


Unnamed: 0,v4_0,calendar-years,Time,administrative-geography,Geography,sex,Sex,single-year-of-age,Age
0,705619,2011,2011,K04000001,ENGLAND AND WALES,all,All,3,3
1,670398,2011,2011,K04000001,ENGLAND AND WALES,all,All,5,5
2,771297,2011,2011,K04000001,ENGLAND AND WALES,all,All,23,23
3,831041,2011,2011,K04000001,ENGLAND AND WALES,all,All,47,47
4,797077,2011,2011,K04000001,ENGLAND AND WALES,all,All,49,49


## Checking year range

In [3]:
df['calendar-years'].value_counts().sort_index()

calendar-years
2011    98532
2012    98532
2013    98532
2014    98532
2015    98532
2016    98532
2017    98532
2018    98532
2019    98532
2020    98532
2021    98532
2022    98532
2023    98532
Name: count, dtype: int64

## Checking column values

In [5]:
df.columns

Index(['v4_0', 'calendar-years', 'Time', 'administrative-geography',
       'Geography', 'sex', 'Sex', 'single-year-of-age', 'Age'],
      dtype='object')

## Checking filtering by area

Note that I needed to wrap the column name in backticks because of the way that Pandas processes column names with hyphens (it treats them as minuses and so looks for `df['administrative']` rather than `df['administrative-geography']`)

In [8]:
df.query('`administrative-geography` == "E06000001"')

Unnamed: 0,v4_0,calendar-years,Time,administrative-geography,Geography,sex,Sex,single-year-of-age,Age
505908,1404,2011,2011,E06000001,Hartlepool,all,All,44,44
505909,1156,2011,2011,E06000001,Hartlepool,all,All,0,0
505910,1093,2011,2011,E06000001,Hartlepool,all,All,13,13
505911,1141,2011,2011,E06000001,Hartlepool,all,All,2,2
505912,1415,2011,2011,E06000001,Hartlepool,all,All,45,45
...,...,...,...,...,...,...,...,...,...
509491,512,2023,2023,E06000001,Hartlepool,male,Male,68,68
509492,353,2023,2023,E06000001,Hartlepool,male,Male,77,77
509493,259,2023,2023,E06000001,Hartlepool,male,Male,90+,90+
509494,483,2023,2023,E06000001,Hartlepool,male,Male,70,70


## Random sampling of the dataframe

Checking accuracy against published Excel spreadsheet on the ONS website at https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/estimatesofthepopulationforenglandandwales

In [14]:
df[df['sex'] != "all"].sample(n=20, random_state=41)

Unnamed: 0,v4_0,calendar-years,Time,administrative-geography,Geography,sex,Sex,single-year-of-age,Age
122127,456,2011,2011,E07000136,Boston,male,Male,50,50
963600,733,2017,2017,E07000179,South Oxfordshire,female,Female,24,24
484259,1468,2022,2022,E09000013,Hammersmith and Fulham,male,Male,21,21
946626,1781,2022,2022,E06000025,South Gloucestershire,female,Female,47,47
958340,343,2011,2011,E07000149,South Norfolk,female,Female,85,85
57843,227,2013,2013,E07000105,Ashford,female,Female,86,86
147105,560,2023,2023,E09000005,Brent,male,Male,78,78
324794,168,2017,2017,E07000035,Derbyshire Dales,male,Male,84,84
874096,494,2018,2018,E07000075,Rochford,male,Male,26,26
532740,510,2017,2017,E07000037,High Peak,male,Male,19,19


Relying on a manual check of these values for speed, and all values are identical.

## Starting to develop `ons_cleaning.py`

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.processing import ons_cleaning

In [19]:
df = ons_cleaning.load_population_data()
df.head()

2025-06-26 12:18:32,494 - INFO - Loading ONS population data...
2025-06-26 12:18:33,800 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv


Unnamed: 0,v4_0,calendar-years,administrative-geography,Geography,Sex,Age
0,705619,2011,K04000001,ENGLAND AND WALES,All,3
1,670398,2011,K04000001,ENGLAND AND WALES,All,5
2,771297,2011,K04000001,ENGLAND AND WALES,All,23
3,831041,2011,K04000001,ENGLAND AND WALES,All,47
4,797077,2011,K04000001,ENGLAND AND WALES,All,49


## Renaming and reordering columns

In [20]:
ons_cleaning.rename_and_reorder_columns(df)

Unnamed: 0,ladcode23,laname23,year,sex,age,freq
0,K04000001,ENGLAND AND WALES,2011,All,3,705619
1,K04000001,ENGLAND AND WALES,2011,All,5,670398
2,K04000001,ENGLAND AND WALES,2011,All,23,771297
3,K04000001,ENGLAND AND WALES,2011,All,47,831041
4,K04000001,ENGLAND AND WALES,2011,All,49,797077
...,...,...,...,...,...,...
1280911,E06000014,York,2023,Female,25,1608
1280912,E06000014,York,2023,Female,27,1210
1280913,E06000014,York,2023,Female,54,1340
1280914,E06000014,York,2023,Female,87,480


In [23]:
df = ons_cleaning.load_and_process_data()
df

2025-06-26 12:35:40,502 - INFO - Loading ONS population data...
2025-06-26 12:35:41,794 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-26 12:35:42,589 - INFO - Processing data...
2025-06-26 12:35:42,614 - INFO - Data successfully processed for ONS population estimates


Unnamed: 0,ladcode23,laname23,year,sex,age,freq
0,K04000001,ENGLAND AND WALES,2011,All,3,705619
1,K04000001,ENGLAND AND WALES,2011,All,5,670398
2,K04000001,ENGLAND AND WALES,2011,All,23,771297
3,K04000001,ENGLAND AND WALES,2011,All,47,831041
4,K04000001,ENGLAND AND WALES,2011,All,49,797077
...,...,...,...,...,...,...
1280911,E06000014,York,2023,Female,25,1608
1280912,E06000014,York,2023,Female,27,1210
1280913,E06000014,York,2023,Female,54,1340
1280914,E06000014,York,2023,Female,87,480


Checking how many unique local authority prefixes there are to get an idea of what needs removing.

In [None]:
df['ladcode23'].str[0].unique()

array(['K', 'E', 'W'], dtype=object)

In [26]:
df[df['ladcode23'].str.startswith('K')]

Unnamed: 0,ladcode23,laname23,year,sex,age,freq
0,K04000001,ENGLAND AND WALES,2011,All,3,705619
1,K04000001,ENGLAND AND WALES,2011,All,5,670398
2,K04000001,ENGLAND AND WALES,2011,All,23,771297
3,K04000001,ENGLAND AND WALES,2011,All,47,831041
4,K04000001,ENGLAND AND WALES,2011,All,49,797077
...,...,...,...,...,...,...
3583,K04000001,ENGLAND AND WALES,2023,Male,38,399998
3584,K04000001,ENGLAND AND WALES,2023,Male,58,401516
3585,K04000001,ENGLAND AND WALES,2023,Male,62,362163
3586,K04000001,ENGLAND AND WALES,2023,Male,83,125305


Great, just the aggregated England and Wales values. However, there may be regional aggregates in here as well

In [35]:
uppers = df[df['laname23'].str.isupper()]['laname23'].unique()

In [36]:
type(uppers)

pandas.core.arrays.categorical.Categorical

In [37]:
uppers

['ENGLAND AND WALES', 'ENGLAND', 'WALES', 'NORTH EAST', 'NORTH WEST', ..., 'WEST MIDLANDS', 'EAST', 'LONDON', 'SOUTH EAST', 'SOUTH WEST']
Length: 12
Categories (357, object): ['Adur', 'Amber Valley', 'Arun', 'Ashfield', ..., 'Wyre', 'Wyre Forest', 'YORKSHIRE AND THE HUMBER', 'York']

Let's see whether there's a pattern with these aggregated areas' `ladcode23`

In [42]:
list(df[df['laname23'].str.isupper()]['ladcode23'].unique())

['K04000001',
 'E92000001',
 'W92000004',
 'E12000001',
 'E12000002',
 'E12000003',
 'E12000004',
 'E12000005',
 'E12000006',
 'E12000007',
 'E12000008',
 'E12000009']

Cool, the aggregated English values start with "E12" and the countries "*92" and ENGLAND AND WALES is it's own case. Let's use this to drop those records which aren't needed.

In [9]:
df = ons_cleaning.load_and_process_data()
df

2025-06-26 14:43:50,030 - INFO - Loading ONS population data...


2025-06-26 14:43:51,242 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-26 14:43:52,030 - INFO - Processing data...
2025-06-26 14:43:52,049 - INFO - Finding aggregated national and regional data codes...
2025-06-26 14:43:52,050 - INFO - Current number of rows: 1280916
2025-06-26 14:43:52,070 - INFO - Aggregated codes found: ['K04000001', 'E92000001', 'W92000004', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006', 'E12000007', 'E12000008', 'E12000009']
2025-06-26 14:43:52,070 - INFO - Dropping unwanted rows...
2025-06-26 14:43:52,119 - INFO - Rows dropped. Remaining rows: 1237860
2025-06-26 14:43:52,129 - INFO - Data successfully processed for ONS population estimates


Unnamed: 0,ladcode,laname,year,sex,age,freq
43056,E07000223,Adur,2012,All,1,739
43057,E07000223,Adur,2012,All,6,667
43058,E07000223,Adur,2012,All,25,605
43059,E07000223,Adur,2012,All,9,642
43060,E07000223,Adur,2012,All,11,545
...,...,...,...,...,...,...
1280911,E06000014,York,2023,Female,25,1608
1280912,E06000014,York,2023,Female,27,1210
1280913,E06000014,York,2023,Female,54,1340
1280914,E06000014,York,2023,Female,87,480


Just checking these have been removed.

In [5]:
df[df['laname'].str.isupper()]['laname'].unique()

[], Categories (357, object): ['Adur', 'Amber Valley', 'Arun', 'Ashfield', ..., 'Wyre', 'Wyre Forest', 'YORKSHIRE AND THE HUMBER', 'York']

In [6]:
df.query('`laname` == "YORKSHIRE AND THE HUMBER"')

Unnamed: 0,ladcode,laname,year,sex,age,freq


## Filtering for only adult women

In [15]:
df['age'] = pd.to_numeric(df['age'], errors='raise')
df2 = df[(df['age'] >= 18) & (df['sex'] == "Female")]
df2

Unnamed: 0,ladcode,laname,year,sex,age,freq
43071,E07000223,Adur,2012,Female,25.0,300
43072,E07000223,Adur,2012,Female,32.0,390
43073,E07000223,Adur,2012,Female,33.0,383
43074,E07000223,Adur,2012,Female,65.0,580
43075,E07000223,Adur,2012,Female,86.0,210
...,...,...,...,...,...,...
1280893,E06000014,York,2022,Female,81.0,583
1280911,E06000014,York,2023,Female,25.0,1608
1280912,E06000014,York,2023,Female,27.0,1210
1280913,E06000014,York,2023,Female,54.0,1340


In [16]:
df2.sort_values(by=['age'], ascending=False)

Unnamed: 0,ladcode,laname,year,sex,age,freq
94906,E09000004,Bexley,2016,Female,89.0,380
43200,E07000223,Adur,2011,Female,89.0,142
1280734,E06000014,York,2023,Female,89.0,359
645992,E07000110,Maidstone,2011,Female,89.0,262
1183807,E06000037,West Berkshire,2022,Female,89.0,242
...,...,...,...,...,...,...
709888,W06000012,Neath Port Talbot,2021,Female,18.0,758
868558,E08000005,Rochdale,2013,Female,18.0,1349
759815,E07000147,North Norfolk,2021,Female,18.0,430
759842,E07000147,North Norfolk,2020,Female,18.0,448


Have now added to the `load_and_process_data` pipeline

In [58]:
df = ons_cleaning.load_and_process_data()
df

2025-06-26 15:44:07,291 - INFO - Loading ONS population data...


2025-06-26 15:44:08,570 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-26 15:44:09,360 - INFO - Processing data...
2025-06-26 15:44:09,380 - INFO - Finding aggregated national and regional data codes...
2025-06-26 15:44:09,381 - INFO - Current number of rows: 1280916
2025-06-26 15:44:09,398 - INFO - Aggregated codes found: ['K04000001', 'E92000001', 'W92000004', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006', 'E12000007', 'E12000008', 'E12000009']
2025-06-26 15:44:09,399 - INFO - Dropping unwanted rows...
2025-06-26 15:44:09,450 - INFO - Rows dropped. Remaining rows: 1237860
2025-06-26 15:44:09,457 - INFO - Filtering for adult women...
2025-06-26 15:44:10,373 - INFO - Data successfully processed for ONS population estimates


Unnamed: 0,ladcode,laname,year,sex,age,freq
43071,E07000223,Adur,2012,Female,25.0,300
43072,E07000223,Adur,2012,Female,32.0,390
43073,E07000223,Adur,2012,Female,33.0,383
43074,E07000223,Adur,2012,Female,65.0,580
43075,E07000223,Adur,2012,Female,86.0,210
...,...,...,...,...,...,...
1280893,E06000014,York,2022,Female,81.0,583
1280911,E06000014,York,2023,Female,25.0,1608
1280912,E06000014,York,2023,Female,27.0,1210
1280913,E06000014,York,2023,Female,54.0,1340


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327405 entries, 43071 to 1280914
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   ladcode  327405 non-null  category
 1   laname   327405 non-null  category
 2   year     327405 non-null  int64   
 3   sex      327405 non-null  category
 4   age      327405 non-null  float64 
 5   freq     327405 non-null  int64   
dtypes: category(3), float64(1), int64(2)
memory usage: 11.6 MB


In [57]:
df.query('age == 90')

Unnamed: 0,ladcode,laname,year,sex,age,freq
43076,E07000223,Adur,2012,Female,90.0,526
43361,E07000223,Adur,2011,Female,90.0,502
43896,E07000223,Adur,2013,Female,90.0,524
44035,E07000223,Adur,2014,Female,90.0,537
44351,E07000223,Adur,2015,Female,90.0,526
...,...,...,...,...,...,...
1279583,E06000014,York,2018,Female,90.0,1412
1280037,E06000014,York,2020,Female,90.0,1396
1280170,E06000014,York,2021,Female,90.0,1429
1280384,E06000014,York,2022,Female,90.0,1458


In [63]:
ons_cleaning.combine_ages(df)

2025-06-26 16:23:14,830 - INFO - Combining age groups for aggregation...


Unnamed: 0,ladcode,laname,year,freq
0,E06000001,Hartlepool,2011,37332
1,E06000001,Hartlepool,2012,37470
2,E06000001,Hartlepool,2013,37476
3,E06000001,Hartlepool,2014,37491
4,E06000001,Hartlepool,2015,37524
...,...,...,...,...
4480,W06000024,Merthyr Tydfil,2019,24168
4481,W06000024,Merthyr Tydfil,2020,24134
4482,W06000024,Merthyr Tydfil,2021,24061
4483,W06000024,Merthyr Tydfil,2022,24056


## Testing `ons_cleaning` pipeline

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.processing import ons_cleaning

In [3]:
ons_cleaning.main()

2025-06-27 10:05:48,850 - INFO - Loading ONS population data...
2025-06-27 10:05:50,210 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-27 10:05:51,030 - INFO - Processing data...
2025-06-27 10:05:51,052 - INFO - Finding aggregated national and regional data codes...
2025-06-27 10:05:51,053 - INFO - Current number of rows: 1280916
2025-06-27 10:05:51,078 - INFO - Aggregated codes found: ['K04000001', 'E92000001', 'W92000004', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006', 'E12000007', 'E12000008', 'E12000009']
2025-06-27 10:05:51,079 - INFO - Dropping unwanted rows...
2025-06-27 10:05:51,133 - INFO - Rows dropped. Remaining rows: 1237860
2025-06-27 10:05:51,141 - INFO - Filtering for adult women...
2025-06-27 10:05:52,045 - INFO - Combining age groups for aggregation...
2025-06-27 10:05:52,109 - INFO - Data successfully processed for ONS population estimates


KeyError: 'min_year'

Debugging KeyError

In [4]:
df, min_year, max_year = ons_cleaning.load_and_process_data()

2025-06-27 10:05:55,438 - INFO - Loading ONS population data...


2025-06-27 10:05:56,610 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-27 10:05:57,511 - INFO - Processing data...
2025-06-27 10:05:57,530 - INFO - Finding aggregated national and regional data codes...
2025-06-27 10:05:57,531 - INFO - Current number of rows: 1280916
2025-06-27 10:05:57,552 - INFO - Aggregated codes found: ['K04000001', 'E92000001', 'W92000004', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006', 'E12000007', 'E12000008', 'E12000009']
2025-06-27 10:05:57,553 - INFO - Dropping unwanted rows...
2025-06-27 10:05:57,607 - INFO - Rows dropped. Remaining rows: 1237860
2025-06-27 10:05:57,617 - INFO - Filtering for adult women...
2025-06-27 10:05:58,608 - INFO - Combining age groups for aggregation...
2025-06-27 10:05:58,668 - INFO - Data successfully processed for ONS population estimates


In [5]:
min_year, max_year

(2011, 2023)

In [6]:
import src.utilities as utils
config = utils.read_config()
OUTPUT_FILENAME_TEMPLATE = config['data']['datasetFilenames']['ons_cleaning']

In [9]:
filename = utils.get_output_filename(year=(min_year, max_year), template=OUTPUT_FILENAME_TEMPLATE)
filename

'LA_population_women_2011-2023.csv'

In [None]:
utils.get_year_range(df)

(2011, 2023)

Testing again

In [3]:
ons_cleaning.main()

2025-06-27 10:48:50,177 - INFO - Loading ONS population data...


2025-06-27 10:48:51,373 - INFO - Loaded data from data/raw/ONS_mid-2022-england-wales_v3.csv
2025-06-27 10:48:52,214 - INFO - Processing data...
2025-06-27 10:48:52,235 - INFO - Finding aggregated national and regional data codes...
2025-06-27 10:48:52,236 - INFO - Current number of rows: 1280916
2025-06-27 10:48:52,259 - INFO - Aggregated codes found: ['K04000001', 'E92000001', 'W92000004', 'E12000001', 'E12000002', 'E12000003', 'E12000004', 'E12000005', 'E12000006', 'E12000007', 'E12000008', 'E12000009']
2025-06-27 10:48:52,260 - INFO - Dropping unwanted rows...
2025-06-27 10:48:52,309 - INFO - Rows dropped. Remaining rows: 1237860
2025-06-27 10:48:52,318 - INFO - Filtering for adult women...
2025-06-27 10:48:53,237 - INFO - Combining age groups for aggregation...
2025-06-27 10:48:53,295 - INFO - Data successfully processed for ONS population estimates
2025-06-27 10:48:53,296 - INFO - Saving...
2025-06-27 10:48:53,316 - INFO - Data successfully saved to data/interim/LA_population_wom