In [1]:
import os
import pandas as pd
import numpy as np
import re
from scipy.stats import gmean
from scipy import stats
import matplotlib.pyplot as plt
from copy import deepcopy

# Import data from the O*NET database, at ISCO-08 occupation level.
# The original data uses a version of SOC classification, but the data we load here
# are already cross-walked to ISCO-08 using: https://ibs.org.pl/en/resources/occupation-classifications-crosswalks-from-onet-soc-to-isco/

# Data files are addressed relative to the kernel's working directory, so the
# notebook has to be started from the folder that contains `Data/`.
# (The former `os.chdir(os.getcwd())` call was a no-op and has been dropped.)
currentPath = os.getcwd()
task_data_file = "./Data/onet_tasks.csv"
if not os.path.isfile(task_data_file):
    # Fail early and report the resolved location, which is far easier to act
    # on than the bare relative path pandas would otherwise report.
    raise FileNotFoundError(
        "O*NET task file not found at " + os.path.abspath(task_data_file)
        + " (working directory: " + currentPath + ")"
    )
task_data = pd.read_csv(task_data_file)


FileNotFoundError: [Errno 2] No such file or directory: './Data/onet_tasks.csv'

In [None]:
# The O*NET database contains information for occupations in the USA, including
# the tasks and activities typically associated with a specific occupation.
# isco08 variable is for occupation codes
# the t_* variables are specific tasks conducted on the job

# read employment data from Eurostat
# These datasets include quarterly information on the number of workers in specific
# 1-digit ISCO occupation categories. (Check here for details: https://www.ilo.org/public/english/bureau/stat/isco/isco08/)
dir_isco = './Data/Eurostat_employment_isco.xlsx'
# sheet_name=None loads every sheet into a dict keyed by sheet name.
isco_xlsx = pd.read_excel(dir_isco, sheet_name=None)

# One frame per 1-digit ISCO group, keyed by its code (1-9). A plain dict is
# used instead of writing into locals(), whose modification is not something
# Python guarantees to take effect.
isco_frames = {}
for code in range(1, 10):
    frame = isco_xlsx["ISCO" + str(code)]
    frame['ISCO'] = code  # tag every row with its occupation group (in place, as before)
    isco_frames[code] = frame

# Later cells refer to the individual frames by name, so bind them explicitly.
(isco1, isco2, isco3, isco4, isco5,
 isco6, isco7, isco8, isco9) = [isco_frames[code] for code in range(1, 10)]

# Total employment per country = sum over the nine occupation groups.
# To cover more countries, add their column names to this tuple.
countries = ("Belgium", "Spain", "Poland")
country_totals = {
    country: sum(frame[country] for frame in isco_frames.values())
    for country in countries
}
total_Belgium = country_totals["Belgium"]
total_Spain = country_totals["Spain"]
total_Poland = country_totals["Poland"]


In [None]:
# Stack the nine occupation-level frames into one long employment table.
occupation_frames = [isco1, isco2, isco3, isco4, isco5, isco6, isco7, isco8, isco9]
all_data = pd.concat(occupation_frames, ignore_index=True)

# There are 9 occupation frames covering the same time range, so repeating a
# country's total series 9 times lines it up row-for-row with the stacked table.
totals_by_country = (
    ("Belgium", total_Belgium),
    ("Spain", total_Spain),
    ("Poland", total_Poland),
)
for country, totals in totals_by_country:
    all_data["total_" + country] = pd.concat([totals] * 9, ignore_index=True)

# Share of each occupation among all workers in a given period and country.
for country in ("Belgium", "Spain", "Poland"):
    all_data['share_' + country] = all_data[country] / all_data['total_' + country]

In [None]:
# The 1-digit ISCO group is the leading character of the occupation code.
leading_digit = task_data["isco08"].astype(str).str[:1]
task_data["isco08_1dig"] = leading_digit.astype(int)

# Mean task values per 1-digit group (more on what these tasks are below).
# The averaged occupation code itself carries no meaning, so it is dropped.
aggdata = task_data.groupby("isco08_1dig").mean().drop(columns=["isco08"])

# We track the intensity of non-routine cognitive analytical tasks, using a
# framework reminiscent of the work by David Autor.
#
# The task items of interest are:
# Non-routine cognitive analytical
# 4.A.2.a.4 Analyzing Data or Information
# 4.A.2.b.2 Thinking Creatively
# 4.A.4.a.1 Interpreting the Meaning of Information for Others

# Attach the task means to every employment row of the matching ISCO group.
combined = all_data.merge(aggdata, how='left', left_on='ISCO', right_on='isco08_1dig')
# Traditionally, the first step is to standardise the task values using weights
# defined by the share of occupations in the labour force, separately for each
# country. Standardisation means rescaling to mean 0 and standard deviation 1.
# This is done below for each of the task variables of interest.

In [None]:
# Share-weighted standardisation of one task column for one country.
def std_nation(combined, nation_name, spec_col):
    """Add a share-weighted standardised copy of ``spec_col`` to ``combined``.

    The weights are read from the column ``'share_' + nation_name``. The
    weighted mean and weighted standard deviation of ``spec_col`` are computed
    with those weights, and the standardised values are stored in a new column
    named ``'std_' + nation_name + '_' + spec_col``.

    Parameters:
        combined: DataFrame holding ``spec_col`` and the share column.
        nation_name: country name used to build the share and output column names.
        spec_col: name of the task column to standardise.

    Returns:
        The same ``combined`` object, modified in place with the new column.
    """
    weights = combined['share_' + nation_name]
    values = combined[spec_col]

    weighted_mean = np.average(values, weights=weights)
    deviations = values - weighted_mean
    weighted_sd = np.sqrt(np.average(deviations ** 2, weights=weights))

    combined['std_' + nation_name + '_' + spec_col] = deviations / weighted_sd
    return combined

# Standardise every task item for every country. The loops run task by task and,
# within a task, Belgium -> Poland -> Spain, so the new columns are appended in
# the same order as with one explicit call per combination.
for task_item in ('t_4A2a4', 't_4A2b2', 't_4A4a1'):
    for nation in ('Belgium', 'Poland', 'Spain'):
        combined = std_nation(combined, nation, task_item)

In [None]:
# The next step is to calculate the `classic` task content intensity, i.e.
# how important a particular general task content category is in the workforce.
# Here, we're looking at non-routine cognitive analytical tasks, as defined
# by David Autor and Daron Acemoglu:

def fig_show(nation, combined):
    """Compute the NRCA intensity for ``nation`` and plot it over time.

    The three standardised task columns ``std_<nation>_t_4A2a4``,
    ``std_<nation>_t_4A2b2`` and ``std_<nation>_t_4A4a1`` are summed, the sum is
    standardised again using ``share_<nation>`` as weights, and the result is
    multiplied by the share. That product is summed per ``TIME`` value and
    drawn as a line plot.

    Parameters:
        nation: country name used to build all column names.
        combined: DataFrame holding ``TIME``, ``share_<nation>`` and the three
            standardised task columns for ``nation``.

    Side effects:
        Adds the columns ``<nation>_NRCA``, ``std_<nation>_NRCA`` and
        ``multip_<nation>_NRCA`` to ``combined`` in place and shows a figure.
        Returns nothing.
    """
    share_col = "share_" + nation
    raw_col = nation + "_NRCA"
    std_col = "std_" + nation + "_NRCA"
    decorated_nation = "multip_" + nation + "_NRCA"

    # Raw NRCA score: sum of the three standardised task items.
    combined[raw_col] = (
        combined["std_" + nation + "_t_4A2a4"]
        + combined["std_" + nation + "_t_4A2b2"]
        + combined["std_" + nation + "_t_4A4a1"]
    )

    # Share-weighted standardisation of the raw score.
    temp_mean = np.average(combined[raw_col], weights=combined[share_col])
    temp_sd = np.sqrt(np.average((combined[raw_col] - temp_mean) ** 2, weights=combined[share_col]))
    combined[std_col] = (combined[raw_col] - temp_mean) / temp_sd

    # Weight each row by its share so that summing per period gives the
    # period-level intensity.
    combined[decorated_nation] = combined[std_col] * combined[share_col]

    # We can plot it now!
    agg = combined.groupby(["TIME"])[decorated_nation].sum().reset_index()
    # Use this call's own aggregate for the x-axis; the previous code referred
    # to an undefined `agg_Poland` and raised NameError.
    plt.plot(agg["TIME"], agg[decorated_nation])
    # Label every third period to keep the axis readable.
    plt.xticks(range(0, len(agg), 3), agg["TIME"][::3])
    plt.title(nation + ": non-routine cognitive analytical task intensity")
    plt.ylabel(decorated_nation)
    plt.show()
    
# One figure per country, drawn in the same order as before.
for nation in ('Poland', 'Spain', 'Belgium'):
    fig_show(nation, combined)

# If this code gets automated and cleaned properly,
#  you should be able to easily add other countries as well as other tasks.
# E.g.:

# Routine manual
# 4.A.3.a.3	Controlling Machines and Processes
# 4.C.2.d.1.i	Spend Time Making Repetitive Motions
# 4.C.3.d.3	Pace Determined by Speed of Equipment
