# Hourly Pay Per Head: by Place of Residence in London

In [204]:
import pandas as pd

df = pd.read_excel('earnings_by_borough.xlsx', sheet_name='Total, Hourly')

### Data Cleaning:

In [205]:
# Visualise the data frame
df.head()

Unnamed: 0,Code,Area,2002,Unnamed: 3,2003,Unnamed: 5,2004,Unnamed: 7,2005,Unnamed: 9,...,2017,Unnamed: 33,2018,Unnamed: 35,2019,Unnamed: 37,2020,Unnamed: 39,2021,Unnamed: 41
0,,,Pay (£),conf %,Pay (£),conf %,Pay (£),conf %,Pay (£),conf %,...,Pay (£),conf %,Pay (£),conf %,Pay (£),conf %,Pay (£),conf %,Pay (£),conf %
1,,,,,,,,,,,...,,,,,,,,,,
2,00AA,City of London,!,!,!,!,#,#,#,#,...,#,#,27.34,20,#,#,#,#,#,#
3,00AB,Barking and Dagenham,10.16,4.8,9.55,5,9.61,4.3,9.99,4.4,...,11.97,4.8,12.52,5,12.88,5.5,12.94,6.3,14.34,6
4,00AC,Barnet,11.9,5.4,12.79,5.2,12.76,5.5,12.77,5.5,...,15.75,5,15.61,4.3,16,4.5,15.84,5.6,16.82,5.1


In [206]:
# Brief overview of dataset
df.info()

# Some instant problems to be cleaned:
# Unnamed column headings are not relevant
# City of London row has very limited data
# All data types are objects, need converting to floats except for code and area columns
# Only the London borough data is relevant to the project
# London borough area codes are ONS codes not GSS codes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 42 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Code         47 non-null     object
 1   Area         47 non-null     object
 2   2002         48 non-null     object
 3   Unnamed: 3   48 non-null     object
 4   2003         48 non-null     object
 5   Unnamed: 5   48 non-null     object
 6   2004         48 non-null     object
 7   Unnamed: 7   48 non-null     object
 8   2005         48 non-null     object
 9   Unnamed: 9   48 non-null     object
 10  2006         48 non-null     object
 11  Unnamed: 11  48 non-null     object
 12  2007         48 non-null     object
 13  Unnamed: 13  48 non-null     object
 14  2008         48 non-null     object
 15  Unnamed: 15  48 non-null     object
 16  2009         48 non-null     object
 17  Unnamed: 17  48 non-null     object
 18  2010         48 non-null     object
 19  Unnamed: 19  48 non-null     ob

In [207]:
# Remove unnamed columns 
df2 = df.copy()

df2 = df2.loc[:, ~df2.columns.str.contains('^Unnamed', na=False)]

In [208]:
df2.head()

Unnamed: 0,Code,Area,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,,,Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),...,Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£),Pay (£)
1,,,,,,,,,,,...,,,,,,,,,,
2,00AA,City of London,!,!,#,#,#,#,#,20.45,...,#,#,#,#,25.54,#,27.34,#,#,#
3,00AB,Barking and Dagenham,10.16,9.55,9.61,9.99,10.65,11.28,11.44,12.26,...,11.98,11.84,11.88,11.89,11.95,11.97,12.52,12.88,12.94,14.34
4,00AC,Barnet,11.9,12.79,12.76,12.77,12.98,12.78,13.76,14.83,...,14.72,14.95,14.45,14.37,14.61,15.75,15.61,16,15.84,16.82


In [209]:
# Remove rows 0, 1 and 2
df2.drop(0, inplace=True, axis=0)
df2.drop(1, inplace=True, axis=0)
df2.drop(2, inplace=True, axis=0)

In [210]:
# Reset the index
df2 = df2.reset_index(drop = True)

In [211]:
# Filter data frame so it only contains data on London
df2 = df2[:32]

In [212]:
# Check if there are any null values
df2.isna().any()

Code    False
Area    False
2002    False
2003    False
2004    False
2005    False
2006    False
2007    False
2008    False
2009    False
2010    False
2011    False
2012    False
2013    False
2014    False
2015    False
2016    False
2017    False
2018    False
2019    False
2020    False
2021    False
dtype: bool

In [213]:
# Replace object data types in all year columns with floats
df2 = df2.replace(df2.iloc[:, 2:].astype(float))

df2.dtypes

Code     object
Area     object
2002    float64
2003    float64
2004    float64
2005    float64
2006    float64
2007    float64
2008    float64
2009    float64
2010    float64
2011    float64
2012    float64
2013    float64
2014    float64
2015    float64
2016    float64
2017    float64
2018    float64
2019    float64
2020    float64
2021    float64
dtype: object

In [214]:
# New column with codes in GSS format 
gss_codes = ['E09000002', 'E09000003', 'E09000004', 'E09000005', 'E09000006', 'E09000007', 'E09000008', 'E09000009', 
             'E09000010', 'E09000011', 'E09000012', 'E09000013', 'E09000014', 'E09000015', 'E09000016', 'E09000017',
             'E09000018', 'E09000019', 'E09000020', 'E09000021', 'E09000022', 'E09000023', 'E09000024', 'E09000025', 
             'E09000026', 'E09000027', 'E09000028', 'E09000029', 'E09000030', 'E09000031', 'E09000032', 'E09000033']

df2.insert(loc = 1, column = 'GSS Code', value = gss_codes)

# Remove old Code column
df2.drop('Code', inplace=True, axis=1)

In [215]:
# Visulaise the clean data frame
df2.head()

Unnamed: 0,GSS Code,Area,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,E09000002,Barking and Dagenham,10.16,9.55,9.61,9.99,10.65,11.28,11.44,12.26,...,11.98,11.84,11.88,11.89,11.95,11.97,12.52,12.88,12.94,14.34
1,E09000003,Barnet,11.9,12.79,12.76,12.77,12.98,12.78,13.76,14.83,...,14.72,14.95,14.45,14.37,14.61,15.75,15.61,14.67,15.84,16.82
2,E09000004,Bexley,10.57,11.39,11.37,11.96,11.57,12.37,13.23,13.34,...,13.99,13.86,13.69,13.72,14.55,14.64,14.66,15.84,16.38,16.76
3,E09000005,Brent,9.77,9.19,10.4,10.74,10.76,11.08,11.61,11.73,...,11.94,11.85,12.16,12.05,12.48,12.41,13.11,14.21,14.91,15.03
4,E09000006,Bromley,12.47,12.91,12.76,12.98,13.84,14.82,15.45,15.94,...,16.28,16.48,16.41,16.44,17.25,17.23,17.77,18.41,19.92,18.48


### Descriptive Analysis:

In [216]:
df2.describe()

Unnamed: 0,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,11.554375,12.025313,12.383125,12.817813,13.010625,13.525625,14.192187,14.656875,14.65125,14.655,14.787813,14.8325,14.755937,14.818125,15.20125,15.760938,16.094063,16.663437,17.301875,17.516563
std,1.663273,1.767438,2.013475,1.989422,1.93254,1.94686,2.442809,2.264407,2.26549,2.423861,2.128614,2.145202,2.062357,1.961649,2.033539,2.220515,2.235823,2.220387,2.339136,2.044637
min,8.55,8.87,9.61,9.99,10.28,10.37,10.96,11.15,10.99,10.13,10.51,10.96,10.55,10.89,11.5,11.97,12.52,12.88,12.94,14.34
25%,10.5625,11.0975,11.2875,11.64,11.6875,12.3325,12.7925,13.2625,13.5,13.1975,13.515,13.4675,13.5375,13.5925,13.89,14.4525,14.6275,15.165,15.72,16.1975
50%,11.095,11.835,11.97,12.415,12.51,12.77,13.455,14.395,14.18,14.1,14.235,14.605,14.42,14.605,14.92,15.425,15.61,16.16,16.605,16.86
75%,12.0425,12.82,12.84,13.42,13.8475,14.8275,15.015,15.73,15.7375,15.575,16.3175,16.5,16.425,16.45,16.7575,16.9525,17.84,18.5925,18.89,18.835
max,16.67,17.52,19.93,20.13,18.66,18.67,22.65,22.85,21.51,21.57,19.36,20.07,18.95,18.4,19.76,21.29,20.59,21.13,22.6,22.63
