# Women's imprisonment rates
## ONS mid-year population estimates: Data QA
Checking that my new dataset values are in line with previous years

## Loading datasets for comparison

In [1]:
import src.utilities as utils
import pandas as pd

In [2]:
# Load the new dataset under QA via the project helper (the log output shows it reads from
# data/interim/) and display it. The filename suggests local-authority population counts
# for women covering 2011-2023 -- confirm against the upstream pipeline.
df_new = utils.load_data('interim', 'LA_population_women_2011-2023.csv')
df_new

2025-06-30 14:46:32,904 - INFO - Loaded data from data/interim/LA_population_women_2011-2023.csv


Unnamed: 0,ladcode,laname,year,freq
0,E06000001,Hartlepool,2011,37332
1,E06000001,Hartlepool,2012,37470
2,E06000001,Hartlepool,2013,37476
3,E06000001,Hartlepool,2014,37491
4,E06000001,Hartlepool,2015,37524
...,...,...,...,...
4480,W06000024,Merthyr Tydfil,2019,24168
4481,W06000024,Merthyr Tydfil,2020,24134
4482,W06000024,Merthyr Tydfil,2021,24061
4483,W06000024,Merthyr Tydfil,2022,24056


In [3]:
# Load the earlier comparator dataset that the new values are checked against and display it.
# The filename suggests ONS figures covering 2001-2020 -- confirm against the upstream pipeline.
df_old = utils.load_data('interim', 'ons_comparator_2001-2020.csv')
df_old

2025-06-30 14:46:35,408 - INFO - Loaded data from data/interim/ons_comparator_2001-2020.csv


Unnamed: 0,ladcode,laname,year,freq
0,E06000001,Hartlepool,2001,35629
1,E06000001,Hartlepool,2002,35660
2,E06000001,Hartlepool,2003,35795
3,E06000001,Hartlepool,2004,35901
4,E06000001,Hartlepool,2005,36065
...,...,...,...,...
6615,W06000024,Merthyr Tydfil,2016,24249
6616,W06000024,Merthyr Tydfil,2017,24358
6617,W06000024,Merthyr Tydfil,2018,24426
6618,W06000024,Merthyr Tydfil,2019,24493


## Filtering datasets for years of overlap

In [4]:
# Restrict each dataset to the years present in both, so rows can be compared like for like
df_new_years = set(df_new['year'])
df_old_years = set(df_old['year'])
shared_years = df_new_years.intersection(df_old_years)

new_in_overlap = df_new['year'].isin(shared_years)
old_in_overlap = df_old['year'].isin(shared_years)
df_new_filtered = df_new.loc[new_in_overlap]
df_old_filtered = df_old.loc[old_in_overlap]

In [5]:
# Visual check: the new dataset restricted to the years shared with the comparator
df_new_filtered

Unnamed: 0,ladcode,laname,year,freq
0,E06000001,Hartlepool,2011,37332
1,E06000001,Hartlepool,2012,37470
2,E06000001,Hartlepool,2013,37476
3,E06000001,Hartlepool,2014,37491
4,E06000001,Hartlepool,2015,37524
...,...,...,...,...
4477,W06000024,Merthyr Tydfil,2016,24109
4478,W06000024,Merthyr Tydfil,2017,24152
4479,W06000024,Merthyr Tydfil,2018,24180
4480,W06000024,Merthyr Tydfil,2019,24168


In [6]:
# Visual check: the comparator dataset restricted to the years shared with the new dataset
df_old_filtered

Unnamed: 0,ladcode,laname,year,freq
10,E06000001,Hartlepool,2011,37332
11,E06000001,Hartlepool,2012,37450
12,E06000001,Hartlepool,2013,37564
13,E06000001,Hartlepool,2014,37631
14,E06000001,Hartlepool,2015,37677
...,...,...,...,...
6615,W06000024,Merthyr Tydfil,2016,24249
6616,W06000024,Merthyr Tydfil,2017,24358
6617,W06000024,Merthyr Tydfil,2018,24426
6618,W06000024,Merthyr Tydfil,2019,24493


## Merging dataframes for comparison

In [7]:
# Merge on shared keys (ladcode, laname, year) and compare freq values.
# The inner join keeps only LA/year combinations present in both datasets with an
# identical code AND name, so areas whose code or name differs between the two
# sources drop out of the comparison.
merge_keys = ['ladcode', 'laname', 'year']
df_merged = pd.merge(
    df_new_filtered, df_old_filtered,
    on=merge_keys,
    suffixes=('_new', '_old'),
    how='inner',
    # Fail loudly if either side has duplicate keys; otherwise rows would be
    # silently multiplied and every downstream count and percentage inflated.
    validate='one_to_one',
)

In [8]:
# Inspect the merged frame: matched rows with freq_new and freq_old side by side
df_merged

Unnamed: 0,ladcode,laname,year,freq_new,freq_old
0,E06000001,Hartlepool,2011,37332,37332
1,E06000001,Hartlepool,2012,37470,37450
2,E06000001,Hartlepool,2013,37476,37564
3,E06000001,Hartlepool,2014,37491,37631
4,E06000001,Hartlepool,2015,37524,37677
...,...,...,...,...,...
3135,W06000024,Merthyr Tydfil,2016,24109,24249
3136,W06000024,Merthyr Tydfil,2017,24152,24358
3137,W06000024,Merthyr Tydfil,2018,24180,24426
3138,W06000024,Merthyr Tydfil,2019,24168,24493


In [9]:
# Calculate difference and absolute difference
# Both columns are required by every later cell, so stop here with a clear message
# instead of silently skipping and failing later on a missing derived column.
missing_cols = [col for col in ('freq_new', 'freq_old') if col not in df_merged.columns]
if missing_cols:
    raise KeyError(f"df_merged is missing expected column(s): {missing_cols}")

# Signed difference: positive means the new dataset is higher than the comparator
df_merged['freq_diff'] = df_merged['freq_new'] - df_merged['freq_old']
df_merged['freq_diff_abs'] = df_merged['freq_diff'].abs()

In [10]:
# Calculate percentage difference and flag if within 5%
# Zero denominators are masked to NaN rather than replaced with pd.NA: putting pd.NA
# into an integer column turns it into object dtype, and the resulting percentage
# column would no longer be numeric. Masking keeps the result a float column.
freq_old_nonzero = df_merged['freq_old'].mask(df_merged['freq_old'] == 0)
df_merged['freq_pct_diff'] = df_merged['freq_diff_abs'] / freq_old_nonzero * 100

# A NaN percentage (no valid baseline) compares as False, so such rows are flagged for review
df_merged['within_5pct'] = df_merged['freq_pct_diff'] <= 5

# Show summary of percentage differences
pct_within_5 = df_merged['within_5pct'].mean() * 100
print(f"Percentage of rows with variance within 5%: {pct_within_5:.2f}%")
df_merged[['ladcode', 'laname', 'year', 'freq_new', 'freq_old', 'freq_diff', 'freq_pct_diff', 'within_5pct']]

Percentage of rows with variance within 5%: 94.90%


Unnamed: 0,ladcode,laname,year,freq_new,freq_old,freq_diff,freq_pct_diff,within_5pct
0,E06000001,Hartlepool,2011,37332,37332,0,0.000000,True
1,E06000001,Hartlepool,2012,37470,37450,20,0.053405,True
2,E06000001,Hartlepool,2013,37476,37564,-88,0.234267,True
3,E06000001,Hartlepool,2014,37491,37631,-140,0.372034,True
4,E06000001,Hartlepool,2015,37524,37677,-153,0.406083,True
...,...,...,...,...,...,...,...,...
3135,W06000024,Merthyr Tydfil,2016,24109,24249,-140,0.577343,True
3136,W06000024,Merthyr Tydfil,2017,24152,24358,-206,0.845718,True
3137,W06000024,Merthyr Tydfil,2018,24180,24426,-246,1.007124,True
3138,W06000024,Merthyr Tydfil,2019,24168,24493,-325,1.326910,True


In [None]:
# Rows whose percentage difference falls outside the 5% tolerance
df_merged.loc[~df_merged['within_5pct']]

Unnamed: 0,ladcode,laname,year,freq_new,freq_old,freq_diff,freq_diff_abs,freq_pct_diff,within_5pct
158,E06000016,Leicester,2019,143977,135346,8631,8631,6.376989,False
159,E06000016,Leicester,2020,143665,134804,8861,8861,6.573247,False
278,E06000030,Swindon,2019,91511,86581,4930,4930,5.694090,False
279,E06000030,Swindon,2020,91879,86826,5053,5053,5.819685,False
285,E06000031,Peterborough,2016,79135,74493,4642,4642,6.231458,False
...,...,...,...,...,...,...,...,...,...
2899,E09000031,Waltham Forest,2020,111364,104656,6708,6708,6.409570,False
2916,E09000033,Westminster,2017,89092,95132,-6040,6040,6.349073,False
2917,E09000033,Westminster,2018,89701,98761,-9060,9060,9.173662,False
2918,E09000033,Westminster,2019,89905,101748,-11843,11843,11.639541,False


In [28]:
# Export only the rows flagged as outside the 5% tolerance so they can be examined separately
outlier_rows = df_merged.loc[~df_merged['within_5pct']]
outlier_rows.to_csv('data/processed/merged_freq_comparison.csv', index=False)
print('Rows with variance >5% saved to data/processed/merged_freq_comparison.csv')

Rows with variance >5% saved to data/processed/merged_freq_comparison.csv


In [11]:
# Summary statistics for the signed differences, plus counts of exact and inexact matches
diff_series = df_merged['freq_diff']
summary = diff_series.describe()

is_exact = diff_series.eq(0)
exact_matches = is_exact.sum()
mismatches = (~is_exact).sum()
n_within_5pct = df_merged["within_5pct"].sum()

print('Summary statistics for freq differences:')
print(summary)
print(
    f'Number of records within 5% variance: {n_within_5pct}',
    f'Exact matches: {exact_matches}',
    f'Mismatches: {mismatches}',
    sep='\n',
)

Summary statistics for freq differences:
count     3140.000000
mean       529.968471
std       2025.171702
min     -16650.000000
25%       -163.000000
50%         27.500000
75%        729.500000
max      18048.000000
Name: freq_diff, dtype: float64
Number of records within 5% variance: 2980
Exact matches: 317
Mismatches: 2823


I'm currently looking at the ONS reconciliation explainer published following Census 2021, and there's quite a bit of overlap between the areas flagged here and those the ONS highlights as showing higher levels of variance from previously published mid-year estimates. See section 6 in https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/articles/reconciliationofmidyearpopulationestimateswithcensus2021atlocalauthoritylevel/2023-03-02

The ONS highlights "The pattern in most LAs is a complex mix of both underestimation and overestimation...The discrepancies between the MYEs rolled forward from 2011 and the 2021 Census-based estimates tend to be larger in areas of high population churn such as areas with significant student populations or large urban areas."

Areas with the largest differences between estimates can generally be attributed to one of three groups:
  - LAs in London
  - LAs with a large student population
  - LAs with a special population (presence of military personnel, for example)

Differences in these areas are likely to be partially attributed to high population churn. It is worth noting that similar areas do not show a unifying pattern, with LAs in London presenting large positive (Camden) and negative (Ealing) differences.

In [None]:
# Number of distinct local authority names in the merged comparison
df_merged['laname'].nunique(dropna=False)

314

In [None]:
# Number of distinct local authorities with at least one year outside the 5% tolerance
df_merged.loc[~df_merged['within_5pct'], 'laname'].nunique(dropna=False)

42

Loading ONS data on the local authorities with a percentage difference greater than ±5% between the 2021 rolled-forward and 2021 Census-based mid-year estimates

In [32]:
# Read the ONS reconciliation extract: the first four lines are skipped and the file is
# read without a header row, so columns arrive as positional labels 0 and 1.
ons_raw = pd.read_csv('data/raw/ons_reconciliation.csv', skiprows=4, header=None)

# Drop incomplete rows, then remove the row labelled 37 before naming the columns.
# NOTE(review): row 37 is presumably a non-data line such as a footnote -- confirm against the raw file.
ons_df = (
    ons_raw
    .dropna()
    .drop(index=37)
    .rename(columns={0: 'laname', 1: 'pct_diff'})
)
ons_df

Unnamed: 0,laname,pct_diff
0,Leicester,-5.22
1,Crawley,-5.49
2,Oxford,-5.56
3,Brent,-5.62
4,Milton Keynes,-5.64
5,Luton,-5.69
6,Merton,-5.82
7,Burnley,-6.09
8,Slough,-6.53
9,Peterborough,-6.53


In [33]:
# Number of distinct local authorities listed in the ONS extract
ons_df['laname'].nunique(dropna=False)

36

In [35]:
# Names of the local authorities with at least one year outside the 5% tolerance
df_merged.loc[~df_merged['within_5pct'], 'laname'].unique()

array(['Leicester', 'Swindon', 'Peterborough', 'Luton', 'Reading',
       'Slough', 'Milton Keynes', 'Isles of Scilly', 'Bedford',
       'West Northamptonshire', 'Cambridge', 'Harlow', 'Rushmoor',
       'Watford', 'Burnley', 'Lincoln', 'Cherwell', 'Oxford', 'Woking',
       'Crawley', 'Worcester', 'Sheffield', 'Coventry', 'Sandwell',
       'City of London', 'Barking and Dagenham', 'Brent', 'Camden',
       'Ealing', 'Greenwich', 'Hammersmith and Fulham', 'Haringey',
       'Harrow', 'Hounslow', 'Islington', 'Lambeth', 'Lewisham', 'Merton',
       'Newham', 'Redbridge', 'Waltham Forest', 'Westminster'],
      dtype=object)

In [36]:
# Get unique LA names from both sources:
# - las_merged: LAs with at least one year outside the 5% tolerance in this comparison
# - las_ons: LAs listed in the ONS reconciliation extract
las_merged = set(df_merged.loc[~df_merged['within_5pct'], 'laname'].unique())
las_ons = set(ons_df['laname'].unique())

# Find overlap
overlap = las_merged & las_ons

# Find LAs in merged but missing from ons_df
missing_in_ons = las_merged - las_ons

# Find LAs in ons_df but not in merged
missing_in_merged = las_ons - las_merged

# Print the names sorted: raw set ordering of strings changes between kernel sessions,
# which makes the output unstable across re-runs and noisy in version control.
print(f"Number of overlapping LAs: {len(overlap)}")
print(f"LAs in merged but missing from ons_df: {sorted(missing_in_ons, key=str)}")
print(f"LAs in ons_df but not in merged: {sorted(missing_in_merged, key=str)}")

Number of overlapping LAs: 24
LAs in merged but missing from ons_df: {'Haringey', 'Barking and Dagenham', 'West Northamptonshire', 'Newham', 'Waltham Forest', 'Harrow', 'Sandwell', 'Greenwich', 'Bedford', 'Lewisham', 'Redbridge', 'Lambeth', 'Woking', 'Hammersmith and Fulham', 'Cherwell', 'Lincoln', 'Swindon', 'Worcester'}
LAs in ons_df but not in merged: {'Canterbury', 'Gwynedd', 'Exeter', 'Richmondshire', 'Kingston upon Thames', 'York', 'Brighton and Hove', 'Hackney', 'Nottingham', 'Guildford', 'Tower Hamlets', 'Kensington and Chelsea'}
