# Women's imprisonment rates
## ONS mid-year population estimates (2001–2021): Data Cleansing

#### Importing pandas library and reading in data

In [1]:
import pandas as pd

In [2]:
# Load the MYEB1 detailed population estimates and summarise the columns,
# non-null counts and dtypes of the resulting frame.
myeb1_path = 'data/external/MYEB1_detailed_population_estimates_series_UK_(2021_geog21).csv'
df = pd.read_csv(myeb1_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68068 entries, 0 to 68067
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ladcode21        68068 non-null  object
 1   ladname21        68068 non-null  object
 2   country          68068 non-null  object
 3   sex              68068 non-null  int64 
 4   age              68068 non-null  int64 
 5   population_2021  68068 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 3.1+ MB


In [3]:
# Preview the MYEB1 frame: one row per local authority, sex and age,
# with the 2021 population as the only value column.
df

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2021
0,E06000001,Hartlepool,E,1,0,446
1,E06000001,Hartlepool,E,1,1,477
2,E06000001,Hartlepool,E,1,2,506
3,E06000001,Hartlepool,E,1,3,464
4,E06000001,Hartlepool,E,1,4,524
...,...,...,...,...,...,...
68063,W06000024,Merthyr Tydfil,W,2,86,85
68064,W06000024,Merthyr Tydfil,W,2,87,54
68065,W06000024,Merthyr Tydfil,W,2,88,50
68066,W06000024,Merthyr Tydfil,W,2,89,50


In [4]:
# Load the MYEB2 reconciliation file, keeping only its first 26 columns:
# the five identifier columns (ladcode21, ladname21, country, sex, age)
# followed by population_2001 ... population_2021.
# NOTE(review): the remaining columns are presumably the components-of-change
# figures suggested by the file name - confirm against the source file.
myeb2_path = 'data/external/MYEB2_detailed_components_of_change_for reconciliation_EW_(2021_geog21).csv'
df_recon = pd.read_csv(myeb2_path, usecols=range(26))
df_recon

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2001,population_2002,population_2003,population_2004,population_2005,...,population_2012,population_2013,population_2014,population_2015,population_2016,population_2017,population_2018,population_2019,population_2020,population_2021
0,E06000001,Hartlepool,E,1,0,519,499,513,517,551,...,557,509,514,517,511,509,464,491,455,435
1,E06000001,Hartlepool,E,1,1,550,520,511,508,518,...,559,557,507,515,522,526,496,477,489,472
2,E06000001,Hartlepool,E,1,2,548,558,517,506,513,...,575,574,554,516,523,526,525,516,484,489
3,E06000001,Hartlepool,E,1,3,523,549,554,511,501,...,565,586,581,551,531,543,530,542,516,493
4,E06000001,Hartlepool,E,1,4,589,527,553,574,510,...,552,561,591,584,564,532,550,525,539,515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,39,43,39,37,51,...,57,56,73,85,69,74,74,71,63,84
60238,W06000024,Merthyr Tydfil,W,2,87,29,30,36,29,29,...,55,43,47,63,74,65,64,67,55,54
60239,W06000024,Merthyr Tydfil,W,2,88,27,22,25,31,24,...,38,43,35,40,55,66,63,55,60,47
60240,W06000024,Merthyr Tydfil,W,2,89,22,24,17,22,23,...,44,32,38,28,32,45,54,55,44,58


In [5]:
# Summarise df_recon: row count, column names, non-null counts and dtypes.
df_recon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60242 entries, 0 to 60241
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ladcode21        60242 non-null  object
 1   ladname21        60242 non-null  object
 2   country          60242 non-null  object
 3   sex              60242 non-null  int64 
 4   age              60242 non-null  int64 
 5   population_2001  60242 non-null  int64 
 6   population_2002  60242 non-null  int64 
 7   population_2003  60242 non-null  int64 
 8   population_2004  60242 non-null  int64 
 9   population_2005  60242 non-null  int64 
 10  population_2006  60242 non-null  int64 
 11  population_2007  60242 non-null  int64 
 12  population_2008  60242 non-null  int64 
 13  population_2009  60242 non-null  int64 
 14  population_2010  60242 non-null  int64 
 15  population_2011  60242 non-null  int64 
 16  population_2012  60242 non-null  int64 
 17  population_2013  60242 non-null

Okay, so there are fewer records in `df_recon` than in `df`. Let's check whether this is an issue with figures for Scotland/Northern Ireland being included.

In [6]:
# List the distinct country codes present in the MYEB1 frame.
df['country'].unique()

array(['E', 'N', 'S', 'W'], dtype=object)

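Yes – `df` also contains rows for Scotland (`S`) and Northern Ireland (`N`). Let's drop those first so that only England and Wales remain.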

In [14]:
# Keep only the England ('E') and Wales ('W') rows. The country column holds
# single-letter codes, so test for exact membership instead of using a regex
# that would also accept any value merely starting with E or W.
is_eng_or_wales = df['country'].isin(['E', 'W'])
# Reset the index so the filtered frame is numbered from 0 again.
df_eng_wales = df[is_eng_or_wales].reset_index(drop=True)
df_eng_wales

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2021
0,E06000001,Hartlepool,E,1,0,446
1,E06000001,Hartlepool,E,1,1,477
2,E06000001,Hartlepool,E,1,2,506
3,E06000001,Hartlepool,E,1,3,464
4,E06000001,Hartlepool,E,1,4,524
...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,85
60238,W06000024,Merthyr Tydfil,W,2,87,54
60239,W06000024,Merthyr Tydfil,W,2,88,50
60240,W06000024,Merthyr Tydfil,W,2,89,50


Perfect, the number of rows in `df_eng_wales` now matches `df_recon`. Let's merge the dataframes together.

In [18]:
# Join the 2001-2021 reconciliation series to the MYEB1 2021 estimates on
# local authority code, sex and age. Overlapping columns coming from
# df_eng_wales receive the '_census' suffix; df_recon's columns keep their names.
df_merged = df_recon.merge(
    df_eng_wales,
    how='inner',
    on=['ladcode21', 'sex', 'age'],
    suffixes=(None, '_census'),
    # Raise MergeError if a (ladcode21, sex, age) key is duplicated on either
    # side, which would otherwise silently multiply rows.
    validate='one_to_one',
)
# An inner join silently discards keys missing from either side, so confirm
# that every row of both inputs found its partner.
assert len(df_merged) == len(df_recon) == len(df_eng_wales), \
    'merge dropped or duplicated rows'
df_merged

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2001,population_2002,population_2003,population_2004,population_2005,...,population_2015,population_2016,population_2017,population_2018,population_2019,population_2020,population_2021,ladname21_census,country_census,population_2021_census
0,E06000001,Hartlepool,E,1,0,519,499,513,517,551,...,517,511,509,464,491,455,435,Hartlepool,E,446
1,E06000001,Hartlepool,E,1,1,550,520,511,508,518,...,515,522,526,496,477,489,472,Hartlepool,E,477
2,E06000001,Hartlepool,E,1,2,548,558,517,506,513,...,516,523,526,525,516,484,489,Hartlepool,E,506
3,E06000001,Hartlepool,E,1,3,523,549,554,511,501,...,551,531,543,530,542,516,493,Hartlepool,E,464
4,E06000001,Hartlepool,E,1,4,589,527,553,574,510,...,584,564,532,550,525,539,515,Hartlepool,E,524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,39,43,39,37,51,...,85,69,74,74,71,63,84,Merthyr Tydfil,W,85
60238,W06000024,Merthyr Tydfil,W,2,87,29,30,36,29,29,...,63,74,65,64,67,55,54,Merthyr Tydfil,W,54
60239,W06000024,Merthyr Tydfil,W,2,88,27,22,25,31,24,...,40,55,66,63,55,60,47,Merthyr Tydfil,W,50
60240,W06000024,Merthyr Tydfil,W,2,89,22,24,17,22,23,...,28,32,45,54,55,44,58,Merthyr Tydfil,W,50


Dropping the old `population_2021` column (from `df_recon`) and the duplicated `ladname21_census` and `country_census` columns, which aren't needed

In [21]:
# Remove df_recon's own 2021 column and the duplicated name/country columns
# brought in by the merge, leaving population_2021_census as the only 2021
# figure. The name is rebound rather than mutated with inplace=True.
df_merged = df_merged.drop(
    columns=['population_2021', 'ladname21_census', 'country_census']
)

In [22]:
# Display df_merged after the drop: population_2021_census should now be the
# only 2021 column.
df_merged

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2001,population_2002,population_2003,population_2004,population_2005,...,population_2012,population_2013,population_2014,population_2015,population_2016,population_2017,population_2018,population_2019,population_2020,population_2021_census
0,E06000001,Hartlepool,E,1,0,519,499,513,517,551,...,557,509,514,517,511,509,464,491,455,446
1,E06000001,Hartlepool,E,1,1,550,520,511,508,518,...,559,557,507,515,522,526,496,477,489,477
2,E06000001,Hartlepool,E,1,2,548,558,517,506,513,...,575,574,554,516,523,526,525,516,484,506
3,E06000001,Hartlepool,E,1,3,523,549,554,511,501,...,565,586,581,551,531,543,530,542,516,464
4,E06000001,Hartlepool,E,1,4,589,527,553,574,510,...,552,561,591,584,564,532,550,525,539,524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,39,43,39,37,51,...,57,56,73,85,69,74,74,71,63,85
60238,W06000024,Merthyr Tydfil,W,2,87,29,30,36,29,29,...,55,43,47,63,74,65,64,67,55,54
60239,W06000024,Merthyr Tydfil,W,2,88,27,22,25,31,24,...,38,43,35,40,55,66,63,55,60,50
60240,W06000024,Merthyr Tydfil,W,2,89,22,24,17,22,23,...,44,32,38,28,32,45,54,55,44,50


In [23]:
# Give the retained 2021 column the same population_<year> name as the other
# year columns. The name is rebound rather than mutated with inplace=True.
df_merged = df_merged.rename(
    columns={'population_2021_census': 'population_2021'}
)

In [24]:
# Display df_merged again to confirm the last column is now named population_2021.
df_merged

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2001,population_2002,population_2003,population_2004,population_2005,...,population_2012,population_2013,population_2014,population_2015,population_2016,population_2017,population_2018,population_2019,population_2020,population_2021
0,E06000001,Hartlepool,E,1,0,519,499,513,517,551,...,557,509,514,517,511,509,464,491,455,446
1,E06000001,Hartlepool,E,1,1,550,520,511,508,518,...,559,557,507,515,522,526,496,477,489,477
2,E06000001,Hartlepool,E,1,2,548,558,517,506,513,...,575,574,554,516,523,526,525,516,484,506
3,E06000001,Hartlepool,E,1,3,523,549,554,511,501,...,565,586,581,551,531,543,530,542,516,464
4,E06000001,Hartlepool,E,1,4,589,527,553,574,510,...,552,561,591,584,564,532,550,525,539,524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,39,43,39,37,51,...,57,56,73,85,69,74,74,71,63,85
60238,W06000024,Merthyr Tydfil,W,2,87,29,30,36,29,29,...,55,43,47,63,74,65,64,67,55,54
60239,W06000024,Merthyr Tydfil,W,2,88,27,22,25,31,24,...,38,43,35,40,55,66,63,55,60,50
60240,W06000024,Merthyr Tydfil,W,2,89,22,24,17,22,23,...,44,32,38,28,32,45,54,55,44,50


#### Further filtering to only include adult women

In [25]:
# Restrict the merged frame to adult women: age 18 and over, sex code 2.
is_adult = df_merged['age'] >= 18
is_female = df_merged['sex'] == 2
df3 = df_merged.loc[is_adult & is_female]
df3

Unnamed: 0,ladcode21,ladname21,country,sex,age,population_2001,population_2002,population_2003,population_2004,population_2005,...,population_2012,population_2013,population_2014,population_2015,population_2016,population_2017,population_2018,population_2019,population_2020,population_2021
109,E06000001,Hartlepool,E,2,18,558,595,645,640,665,...,655,667,628,622,546,548,554,618,521,521
110,E06000001,Hartlepool,E,2,19,490,465,537,580,556,...,517,583,612,575,577,497,461,489,553,452
111,E06000001,Hartlepool,E,2,20,506,465,442,505,549,...,611,499,570,564,556,544,475,454,494,509
112,E06000001,Hartlepool,E,2,21,498,497,450,439,503,...,598,605,478,554,575,552,545,481,465,489
113,E06000001,Hartlepool,E,2,22,467,494,489,468,459,...,558,635,618,504,582,595,578,559,509,502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60237,W06000024,Merthyr Tydfil,W,2,86,39,43,39,37,51,...,57,56,73,85,69,74,74,71,63,85
60238,W06000024,Merthyr Tydfil,W,2,87,29,30,36,29,29,...,55,43,47,63,74,65,64,67,55,54
60239,W06000024,Merthyr Tydfil,W,2,88,27,22,25,31,24,...,38,43,35,40,55,66,63,55,60,50
60240,W06000024,Merthyr Tydfil,W,2,89,22,24,17,22,23,...,44,32,38,28,32,45,54,55,44,50


#### Melting the columns to make the data long, rather than wide

In [26]:
# Reshape from wide to long: the identifier columns are kept as they are and
# every population_<year> column becomes a (year, population) row.
id_columns = ["ladcode21", "ladname21", "country", "sex", "age"]
df4 = pd.melt(
    df3,
    id_vars=id_columns,
    var_name="year",
    value_name="population",
)
df4

Unnamed: 0,ladcode21,ladname21,country,sex,age,year,population
0,E06000001,Hartlepool,E,2,18,population_2001,558
1,E06000001,Hartlepool,E,2,19,population_2001,490
2,E06000001,Hartlepool,E,2,20,population_2001,506
3,E06000001,Hartlepool,E,2,21,population_2001,498
4,E06000001,Hartlepool,E,2,22,population_2001,467
...,...,...,...,...,...,...,...
507418,W06000024,Merthyr Tydfil,W,2,86,population_2021,85
507419,W06000024,Merthyr Tydfil,W,2,87,population_2021,54
507420,W06000024,Merthyr Tydfil,W,2,88,population_2021,50
507421,W06000024,Merthyr Tydfil,W,2,89,population_2021,50


#### Removing 'population_' from the `year` values

In [27]:
# Strip the 'population_' prefix so that only the four-digit year is left.
# The pattern is a plain literal, so use a substring replacement (regex=False)
# instead of treating it as a regular expression.
df4['year'] = df4['year'].str.replace('population_', '', regex=False)

In [28]:
# Display df4 to confirm the year values no longer carry the 'population_' prefix.
df4

Unnamed: 0,ladcode21,ladname21,country,sex,age,year,population
0,E06000001,Hartlepool,E,2,18,2001,558
1,E06000001,Hartlepool,E,2,19,2001,490
2,E06000001,Hartlepool,E,2,20,2001,506
3,E06000001,Hartlepool,E,2,21,2001,498
4,E06000001,Hartlepool,E,2,22,2001,467
...,...,...,...,...,...,...,...
507418,W06000024,Merthyr Tydfil,W,2,86,2021,85
507419,W06000024,Merthyr Tydfil,W,2,87,2021,54
507420,W06000024,Merthyr Tydfil,W,2,88,2021,50
507421,W06000024,Merthyr Tydfil,W,2,89,2021,50


#### Aggregating all of the ages to just have one population figure for adult women in each local authority for each year

In [29]:
# Sum the population over all ages so that each local authority has a single
# figure per year. sort=False keeps the groups in their order of first
# appearance; as_index=False keeps the grouping keys as ordinary columns.
group_keys = ['ladcode21', 'ladname21', 'year']
df5 = df4.groupby(group_keys, as_index=False, sort=False)['population'].sum()
df5

Unnamed: 0,ladcode21,ladname21,year,population
0,E06000001,Hartlepool,2001,32246
1,E06000002,Middlesbrough,2001,50887
2,E06000003,Redcar and Cleveland,2001,50671
3,E06000004,Stockton-on-Tees,2001,66611
4,E06000005,Darlington,2001,35701
...,...,...,...,...
6946,W06000020,Torfaen,2021,34999
6947,W06000021,Monmouthshire,2021,36922
6948,W06000022,Newport,2021,60097
6949,W06000023,Powys,2021,53829


#### Converting `year` values to integers to allow filtering

In [30]:
# The year values are still strings after the prefix was stripped; cast the
# column to integers and show the resulting dtypes.
df5 = df5.astype({'year': 'int'})
df5.dtypes

ladcode21     object
ladname21     object
year           int64
population     int64
dtype: object

### In the knowledge that this might not be perfect, I'm going to use this data and work through the steps

In [32]:
# Write the aggregated figures to the interim data folder, without the
# DataFrame index.
output_path = 'data/interim/LA_population_female_2001_2021_NOT_REBASED.csv'
df5.to_csv(output_path, index=False)