In [24]:
import numpy as np
import pandas as pd

import altair as alt

In [14]:
dat = pd.read_csv('data-school-workforce-in-england.csv')

In [37]:
dat
# I did some data checks:
# some columns have no variation: location, school_type, age_category

Unnamed: 0,location,location_code,geographic_level,time_period,gender,school_type,grade,age_category,average_mean
0,England,E92000001,country,201011,Female,Total state-funded schools,Classroom teachers,Total,34174.7
1,England,E92000001,country,201011,Female,Total state-funded schools,Head teachers,Total,60289.8
2,England,E92000001,country,201011,Female,Total state-funded schools,Other Leadership teachers,Total,49442.5
3,England,E92000001,country,201011,Female,Total state-funded schools,Total,Total,36366.9
4,England,E92000001,country,201011,Male,Total state-funded schools,Classroom teachers,Total,35494
...,...,...,...,...,...,...,...,...,...
155,England,E92000001,country,201920,Total,Total state-funded schools,Total,Total,40537
156,England,E92000001,country,201920,Unclassified,Total state-funded schools,Classroom teachers,Total,34244
157,England,E92000001,country,201920,Unclassified,Total state-funded schools,Head teachers,Total,63910.4
158,England,E92000001,country,201920,Unclassified,Total state-funded schools,Other Leadership teachers,Total,53403.1


In [39]:
dat.loc[dat['average_mean'] != 'c']

Unnamed: 0,location,location_code,geographic_level,time_period,gender,school_type,grade,age_category,average_mean
125,England,E92000001,country,201718,Unclassified,Total state-funded schools,Head teachers,Total,c


In [49]:
datc['average_mean'] = pd.to_numeric(datc['average_mean'])
datc.dtypes

location             object
location_code        object
geographic_level     object
time_period           int64
gender               object
school_type          object
grade                object
age_category         object
average_mean        float64
dtype: object

In [252]:
datc['year'] = [str(x)[:4] for x in datc['time_period']]
datc['gender_sign'] = ["\u2640" if i == "Female" else "\u2642" for i in datc['gender']]
datc['grade'] = ["Leadership teachers" if i == "Other Leadership teachers" else i for i in datc['grade']]

In [253]:
datc['gender'].unique()

array(['Female', 'Male', 'Total', 'Unclassified'], dtype=object)

In [254]:
data = datc[["year", "gender", "gender_sign", "grade", "average_mean"]]
data.shape

(159, 5)

In [255]:
# check if the total entries are correct:
tcheck_gender = pd.pivot_table(data, 
                               values='average_mean', 
                               index=['grade'],
                               columns=['gender'], aggfunc=np.mean)

tcheck_gender 

gender,Female,Male,Total,Unclassified
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Classroom teachers,34758.97,35700.61,34976.89,30284.12
Head teachers,64056.41,72011.81,66740.86,62779.5
Leadership teachers,50997.19,54859.2,52214.43,52541.96
Total,37305.23,40243.74,38025.34,32686.78


In [256]:
datac = data.loc[(data['gender'].isin(['Female', 'Male'])) & (data['grade'] != 'Total')].copy()
datac.shape

(60, 5)

In [262]:
points = alt.Chart(datac, title = "Gender pay gaps among teachers in England").mark_text(size = 15).encode(
    alt.X('year', title='Year', 
          scale=alt.Scale(zero = False), 
          axis = alt.Axis(labelAngle = 0, values= [2010, 2013, 2016, 2019])),
    alt.Y('average_mean', title='Average Pay', axis = alt.Axis(titleAngle = 0, titleX=-90)),
    color = "grade", text = "gender_sign"
).properties( width=300, height=400 ) 

labels = alt.Chart(datac).mark_text(align='left', dx = 10, dy = 10).encode(
    alt.X('year', aggregate='max'),
    alt.Y('average_mean', aggregate='max'),
    alt.Text('grade'),
    alt.Color('grade', legend=None))

points +labels