# Gender Wage Gap Data Processing

In [1]:
import pandas as pd
import numpy as np

## 1 Map Country Codes

In [2]:
# numeric is country id, which will be used in the map to identify countries
df_country = pd.read_excel("country_codes.xlsx")
df_country.head()

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric
0,Åland Islands,AX,ALA,248
1,Zimbabwe,ZW,ZWE,716
2,Zambia,ZM,ZMB,894
3,Yemen,YE,YEM,887
4,Western Sahara,EH,ESH,732


In [3]:
df_gap = pd.read_csv("gap.csv", delimiter=";")
df_gap.head()

Unnamed: 0,location,indicator,subject,measure,frequency,time,value
0,AUS,WAGEGAP,EMPLOYEE,PC,A,1975,21.582.733.813
1,AUS,WAGEGAP,EMPLOYEE,PC,A,1976,20.754.716.981
2,AUS,WAGEGAP,EMPLOYEE,PC,A,1977,18.390.804.598
3,AUS,WAGEGAP,EMPLOYEE,PC,A,1978,19.791.666.667
4,AUS,WAGEGAP,EMPLOYEE,PC,A,1979,20


In [4]:
df = df_gap.merge(df_country[["Alpha-3 code", "Country", "Numeric"]], left_on='location', right_on='Alpha-3 code', how='left')
df.head()

Unnamed: 0,location,indicator,subject,measure,frequency,time,value,Alpha-3 code,Country,Numeric
0,AUS,WAGEGAP,EMPLOYEE,PC,A,1975,21.582.733.813,AUS,Australia,36.0
1,AUS,WAGEGAP,EMPLOYEE,PC,A,1976,20.754.716.981,AUS,Australia,36.0
2,AUS,WAGEGAP,EMPLOYEE,PC,A,1977,18.390.804.598,AUS,Australia,36.0
3,AUS,WAGEGAP,EMPLOYEE,PC,A,1978,19.791.666.667,AUS,Australia,36.0
4,AUS,WAGEGAP,EMPLOYEE,PC,A,1979,20,AUS,Australia,36.0


In [5]:
# 'OECD', 'EU27' can't match any country code
df[df["Alpha-3 code"].isna()]["location"].unique()

array(['OECD', 'EU27'], dtype=object)

In [6]:
EU27_str = "Austria, Belgium, Bulgaria, Croatia, Cyprus, the Czech Republic, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, the Netherlands, Poland, Portugal, Romania, Slovak Republic, Slovenia, Spain, Sweden"
EU27_countries = EU27_str.split(", ")
assert(len(EU27_countries) == 27)
all_countries = df["Country"].unique()
print("Among EU27 countries, the following countries don't have its own data:")
for country in EU27_countries:
    if (country not in all_countries):
        print(country)

Among EU27 countries, the following countries don't have its own data:
the Czech Republic
the Netherlands
Slovak Republic


We won't draw EU27 and OECD countries in the map, but we'll draw it in the bar chart!

## 2 Data Processing

### 2.1 Year - Country

In [7]:
# edit wierd value columns
df["split_value"] = df["value"].str.split(".")
df["joined_value"] = df["split_value"].apply(lambda x: ".".join(x[:2]))
df["value"] = df["joined_value"].astype(float)
df = df.drop(["split_value", "joined_value"], axis=1)
df

Unnamed: 0,location,indicator,subject,measure,frequency,time,value,Alpha-3 code,Country,Numeric
0,AUS,WAGEGAP,EMPLOYEE,PC,A,1975,21.582,AUS,Australia,36.0
1,AUS,WAGEGAP,EMPLOYEE,PC,A,1976,20.754,AUS,Australia,36.0
2,AUS,WAGEGAP,EMPLOYEE,PC,A,1977,18.390,AUS,Australia,36.0
3,AUS,WAGEGAP,EMPLOYEE,PC,A,1978,19.791,AUS,Australia,36.0
4,AUS,WAGEGAP,EMPLOYEE,PC,A,1979,20.000,AUS,Australia,36.0
...,...,...,...,...,...,...,...,...,...,...
1205,EU27,WAGEGAP,EMPLOYEE,PC,A,2016,115.776,,,
1206,EU27,WAGEGAP,EMPLOYEE,PC,A,2017,113.314,,,
1207,EU27,WAGEGAP,EMPLOYEE,PC,A,2018,111.385,,,
1208,EU27,WAGEGAP,EMPLOYEE,PC,A,2019,111.223,,,


In [8]:
df["subject"].value_counts()

subject
EMPLOYEE        766
SELFEMPLOYED    444
Name: count, dtype: int64

In [9]:
df_employee = df[df["subject"] == "EMPLOYEE"]
df_employee
# drop OECD and EU27
# df_employee_id = df_employee.dropna(subset=['Numeric'])
# df_employee_id["Numeric"] = df_employee_id["Numeric"].astype('int')
# df_employee_id
# df_employee_id_out = pd.pivot_table(df_employee_id, values='value', index=['time'], columns=['Numeric'])

Unnamed: 0,location,indicator,subject,measure,frequency,time,value,Alpha-3 code,Country,Numeric
0,AUS,WAGEGAP,EMPLOYEE,PC,A,1975,21.582,AUS,Australia,36.0
1,AUS,WAGEGAP,EMPLOYEE,PC,A,1976,20.754,AUS,Australia,36.0
2,AUS,WAGEGAP,EMPLOYEE,PC,A,1977,18.390,AUS,Australia,36.0
3,AUS,WAGEGAP,EMPLOYEE,PC,A,1978,19.791,AUS,Australia,36.0
4,AUS,WAGEGAP,EMPLOYEE,PC,A,1979,20.000,AUS,Australia,36.0
...,...,...,...,...,...,...,...,...,...,...
1205,EU27,WAGEGAP,EMPLOYEE,PC,A,2016,115.776,,,
1206,EU27,WAGEGAP,EMPLOYEE,PC,A,2017,113.314,,,
1207,EU27,WAGEGAP,EMPLOYEE,PC,A,2018,111.385,,,
1208,EU27,WAGEGAP,EMPLOYEE,PC,A,2019,111.223,,,


In [10]:
output = {} # {year:{country: {id:, name:, value:,}}}
for _, row in df_employee.iterrows():
    year = row['time']
    country = row['location']
    id = row['Numeric']
    value = row['value']
    name = country if country in ['OECD', 'EU27'] else row['Country']
    if year not in output:
        output[year] = {}
    output[year][country] = {}
    output[year][country]['id'] = int(id) if not pd.isna(id) else None
    output[year][country]['name'] = name
    output[year][country]['value'] = value

In [11]:
import json
with open("employee_wage_gap.json", "w") as outfile:
    json.dump(output, outfile)
json.dumps(output)

'{"1975": {"AUS": {"id": 36, "name": "Australia", "value": 21.582}, "JPN": {"id": 392, "name": "Japan", "value": 42.389}, "GBR": {"id": 826, "name": "United Kingdom of Great Britain and Northern Ireland (the)", "value": 39.855}, "USA": {"id": 840, "name": "United States of America (the)", "value": 3.763}}, "1976": {"AUS": {"id": 36, "name": "Australia", "value": 20.754}, "JPN": {"id": 392, "name": "Japan", "value": 39.741}, "GBR": {"id": 826, "name": "United Kingdom of Great Britain and Northern Ireland (the)", "value": 36.55}, "USA": {"id": 840, "name": "United States of America (the)", "value": 37.755}}, "1977": {"AUS": {"id": 36, "name": "Australia", "value": 18.39}, "FIN": {"id": 246, "name": "Finland", "value": 2.769}, "JPN": {"id": 392, "name": "Japan", "value": 40.326}, "GBR": {"id": 826, "name": "United Kingdom of Great Britain and Northern Ireland (the)", "value": 3.578}, "USA": {"id": 840, "name": "United States of America (the)", "value": 38.224}}, "1978": {"AUS": {"id": 36,

In [12]:
df_employee.describe()

Unnamed: 0,time,value,Numeric
count,766.0,766.0,721.0
mean,2005.124021,38.265646,422.061026
std,11.176961,41.167896,260.182327
min,1970.0,0.384387,36.0
25%,1999.0,15.8025,208.0
50%,2007.0,21.8525,392.0
75%,2014.0,42.00475,620.0
max,2020.0,419.34,840.0


### 2.2 Country - Year

In [16]:
df_employee_country = df_employee.sort_values('location')
df_employee_country

Unnamed: 0,location,indicator,subject,measure,frequency,time,value,Alpha-3 code,Country,Numeric
0,AUS,WAGEGAP,EMPLOYEE,PC,A,1975,21.582,AUS,Australia,36.0
25,AUS,WAGEGAP,EMPLOYEE,PC,A,2001,14.342,AUS,Australia,36.0
26,AUS,WAGEGAP,EMPLOYEE,PC,A,2002,15.000,AUS,Australia,36.0
27,AUS,WAGEGAP,EMPLOYEE,PC,A,2003,13.043,AUS,Australia,36.0
28,AUS,WAGEGAP,EMPLOYEE,PC,A,2004,14.352,AUS,Australia,36.0
...,...,...,...,...,...,...,...,...,...,...
986,USA,WAGEGAP,EMPLOYEE,PC,A,2000,23.088,USA,United States of America (the),840.0
985,USA,WAGEGAP,EMPLOYEE,PC,A,1999,23.462,USA,United States of America (the),840.0
984,USA,WAGEGAP,EMPLOYEE,PC,A,1998,23.745,USA,United States of America (the),840.0
995,USA,WAGEGAP,EMPLOYEE,PC,A,2009,1.978,USA,United States of America (the),840.0


In [18]:
output_country = {} # {country: {id:, name:, data: {year: value}}}
for _, row in df_employee_country.iterrows():
    year = row['time']
    country = row['location']
    id = row['Numeric']
    value = row['value']
    name = country if country in ['OECD', 'EU27'] else row['Country']
    if country not in output_country:
        output_country[country] = {}
    output_country[country]['id'] = int(id) if not pd.isna(id) else None
    output_country[country]['name'] = name
    if 'data' not in output_country[country]:
        output_country[country]['data'] = {}
    output_country[country]['data'][year] = value
    

In [23]:
with open("employee_wage_gap-country.json", "w") as outfile:
    json.dump(output_country, outfile)
json.dumps(output_country)

'{"AUS": {"id": 36, "name": "Australia", "data": {"1975": 21.582, "2001": 14.342, "2002": 15.0, "2003": 13.043, "2004": 14.352, "2005": 15.777, "2006": 16.666, "2007": 15.4, "2008": 11.937, "2009": 16.363, "2010": 1.404, "2011": 15.966, "2012": 13.75, "2014": 17.05, "2015": 15.384, "2016": 13.728, "2017": 14.507, "2018": 16.142, "2019": 14.965, "2020": 12.266, "2000": 17.2, "1999": 14.285, "2013": 18.0, "1997": 15.254, "1998": 13.242, "1976": 20.754, "1977": 18.39, "1978": 19.791, "1979": 20.0, "1980": 18.75, "1981": 18.253, "1983": 19.155, "1984": 18.674, "1985": 19.607, "1982": 20.819, "1987": 18.518, "1995": 14.478, "1986": 18.848, "1993": 13.224, "1992": 14.258, "1991": 16.015, "1994": 14.409, "1989": 1.858, "1988": 1.879, "1990": 18.181}}, "AUT": {"id": 40, "name": "Austria", "data": {"2012": 18.185, "2020": 13.325, "2019": 14.011, "2018": 14.876, "2017": 1.538, "2016": 15.67, "2015": 1.704, "2014": 17.728, "2013": 18.053, "2011": 18.55, "2004": 22.43, "2009": 19.355, "2008": 20.9