In [None]:
import os
import pandas as pd
import numpy as np
import re
from scipy.stats import gmean
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
##### Define functions #######

# Function to standardize task values
def standardize_task_values(group):
    """Return the share-weighted z-scores of ``group['task_value']``.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide a ``task_value`` column (the values to standardize) and a
        ``share`` column (the weights).

    Returns
    -------
    pandas.Series
        ``(task_value - weighted mean) / weighted standard deviation``, carrying
        the same index as ``group``.
    """
    values = group['task_value']
    weights = group['share']

    weighted_mean = np.average(values, weights=weights)
    deviations = values - weighted_mean
    # Weighted population variance: the weighted mean of the squared deviations
    weighted_variance = np.average(deviations**2, weights=weights)

    return deviations / np.sqrt(weighted_variance)

In [None]:
# Move into the course folder that contains the Data directory.
# The path is relative, so a second run of this cell in the same kernel would
# try to descend again and raise FileNotFoundError; skip the chdir when the
# working directory already ends with the course folder.
course_dir = os.path.join("RRcourse2023", "6. Coding and documentation")
if not os.path.normcase(os.getcwd()).endswith(os.path.normcase(course_dir)):
    os.chdir(course_dir)

# Import task data from the O*NET database.
# Paths are built with os.path.join: a literal "Data\\..." only resolves on Windows.
task_data = pd.read_csv(os.path.join("Data", "onet_tasks.csv"))

# Read Eurostat employment data from Excel file (one sheet per entry in sheet_names)
excel_file = os.path.join("Data", "Eurostat_employment_isco.xlsx")
sheet_names = ["ISCO1", "ISCO2", "ISCO3", "ISCO4", "ISCO5", "ISCO6", "ISCO7", "ISCO8", "ISCO9"]
employment_data = []

for sheet_name in sheet_names:
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    # The trailing digit of the sheet name is stored as the ISCO code of its rows
    df['ISCO'] = int(sheet_name[-1])
    employment_data.append(df)

# Combine employment data from different ISCO levels into a single DataFrame
all_data = pd.concat(employment_data, ignore_index=True)

In [None]:
# Calculate the total number of workers for each country
countries = ["Belgium", "Spain", "Poland"]
total_workers = {}

for country in countries:
    # Adds the country's column from every ISCO sheet element-wise (rows are
    # matched on index labels), so each total is a Series, not a scalar.
    total_workers[country] = sum(isco[country] for isco in employment_data)

# Add 1-digit ISCO code to the task data (first character of the isco08 code)
task_data["isco08_1dig"] = task_data["isco08"].astype(str).str[:1].astype(int)

# Standardize task values for each task and country separately
task_columns = ['t_4A2a4', 't_4A2b2', 't_4A4a1']

# NOTE(review): the loop below relies on task_data carrying 'task', 'country',
# 'task_value' and 'share' columns; none of them is created in the code visible
# here — confirm they exist before this cell runs.
for task_column in task_columns:
    for country in countries:
        std_column = f'std_{country}_{task_column}'
        # Filter task data for the specific task and country
        task_subset = task_data[(task_data['task'] == task_column) & (task_data['country'] == country)]
        # Rows outside this task/country stay NaN in the new column
        task_data[std_column] = np.nan
        # Standardize within each 1-digit ISCO group and write the result back
        # through the rows' index labels. Assigning `groupby(...).apply(...).values`
        # instead drops the labels: the values come back in group order and with
        # the subset's length, so they either fail to fit the full frame or land
        # on the wrong rows. Assumes task_data's index labels are unique.
        for _, isco_group in task_subset.groupby('isco08_1dig'):
            task_data.loc[isco_group.index, std_column] = standardize_task_values(isco_group)

In [None]:
# Calculate the intensity of non-routine cognitive analytical tasks (NRCA)
# as the row-wise sum of the standardized task columns of each country
for country in countries:
    task_columns_std = [f'std_{country}_{task_column}' for task_column in task_columns]
    task_data[f'{country}_NRCA'] = task_data[task_columns_std].sum(axis=1)

# Standardize NRCA values for each country (share-weighted mean and std. dev.)
for country in countries:
    # Read the raw sum created above and write the standardized result to a
    # separate column; reading 'std_{country}_NRCA' here would raise KeyError
    # because that column does not exist until this loop creates it.
    raw_column = f'{country}_NRCA'
    std_column = f'std_{country}_NRCA'
    mean = np.average(task_data[raw_column], weights=task_data['share'])
    std_dev = np.sqrt(np.average((task_data[raw_column] - mean)**2, weights=task_data['share']))
    task_data[std_column] = (task_data[raw_column] - mean) / std_dev

In [None]:
# Calculate the country-level mean of standardized NRCA values
multip_columns = [f'std_{country}_NRCA' for country in countries]
# Cross-country total of the share-weighted values, kept as before; it is not
# used by the aggregation below.
task_data['multip_NRCA'] = task_data[multip_columns].mul(task_data['share'], axis=0).sum(axis=1)

# Share-weighted standardized NRCA, one column per country. The aggregation
# below selects these 'multip_{country}_NRCA' columns, so they must exist;
# creating only 'multip_NRCA' made the groupby selection raise KeyError.
for country in countries:
    task_data[f'multip_{country}_NRCA'] = task_data[f'std_{country}_NRCA'] * task_data['share']

# Group by time and calculate the sum of multip_NRCA for each country over time
agg_data = task_data.groupby('TIME')[[f'multip_{country}_NRCA' for country in countries]].sum().reset_index()

In [None]:
# Plot the changes in NRCA intensity over time for each country using subplots
fig, axs = plt.subplots(len(countries), 1, figsize=(8, 6), sharex=True)
# plt.subplots returns a bare Axes (not an array) when there is a single row;
# atleast_1d keeps axs[i] valid for any number of countries.
axs = np.atleast_1d(axs)

# Plot against row positions and label every third one with its TIME value.
# Using positions for the data as well as the ticks keeps labels and points
# in step whatever the dtype of TIME is.
time_labels = list(agg_data['TIME'])
positions = list(range(len(time_labels)))

for i, country in enumerate(countries):
    ax = axs[i]
    ax.plot(positions, agg_data[f'multip_{country}_NRCA'])
    ax.set_xticks(positions[::3])
    ax.set_xticklabels(time_labels[::3])
    ax.set_title(country)
    ax.set_ylabel('NRCA')

plt.tight_layout()
plt.show()