In [None]:
import json
import os

import pandas as pd
from numpy import nan

In [None]:
# Location of the ESCO data files, expressed relative to the parent of the
# current working directory.
base_path = "../"
data_path = os.path.join(base_path, "data", "esco")

In [None]:
# NOTE: This script works for both the French ("fr") and English ("en")
# versions of ESCO. LANG selects the language sub-directory of the data folder
# and, lower-cased, the suffix of the CSV file names that are read.
# To switch language, edit the assignment below and re-run the whole script.

# LANG = 'en'
LANG = "fr"

In [None]:
# Read the three raw ESCO CSV files for the selected language.
# All of them live in <data_path>/<LANG>/ and end with "_<lang>.csv".
lang_dir = os.path.join(data_path, LANG)
file_suffix = f"_{LANG.lower()}.csv"

# Occupations: the ISCO group code is read as a string instead of letting
# pandas infer a numeric dtype.
df = pd.read_csv(
    os.path.join(lang_dir, "occupations" + file_suffix),
    dtype={"iscoGroup": str},
    encoding="utf-8",
)

# ISCO groups: the group code is read as a string as well, so that it can be
# compared with the occupations' "iscoGroup" values.
isco_groups = pd.read_csv(
    os.path.join(lang_dir, "ISCOGroups" + file_suffix),
    dtype={"code": str},
    encoding="utf-8",
)

# Research occupations collection: read with pandas' inferred dtypes.
df_research = pd.read_csv(
    os.path.join(lang_dir, "researchOccupationsCollection" + file_suffix),
    encoding="utf-8",
)

In [None]:
# Attach the ISCO group label to every occupation as the "category" column.
#
# A code -> label Series is built once and applied with a vectorised map,
# instead of re-filtering `isco_groups` for every single occupation row.
isco_label_by_code = (
    # Rows without a code can never be matched by an equality test; drop them
    # so they cannot be matched by the index-based lookup either.
    isco_groups.dropna(subset=["code"])
    # When a code is listed more than once, the first row wins.
    .drop_duplicates(subset="code", keep="first")
    .set_index("code")["preferredLabel"]
)

# Fail with an explicit message when an occupation references a code that is
# absent from the ISCO groups file (this includes a missing iscoGroup value).
# IndexError is kept as the exception type for such a failed lookup.
unknown_isco_codes = df.loc[
    ~df["iscoGroup"].isin(isco_label_by_code.index), "iscoGroup"
].unique()
if len(unknown_isco_codes) > 0:
    raise IndexError(
        f"No ISCO group label found for iscoGroup value(s): {list(unknown_isco_codes)}"
    )

df["category"] = df["iscoGroup"].map(isco_label_by_code)

In [None]:
# Stack the research occupations underneath the regular occupations and
# renumber the rows of the combined frame from 0.
df_global = pd.concat(
    objs=[df, df_research],
    axis=0,
    ignore_index=True,
)

In [None]:
# Swap every NaN for None; json.dump serialises None as null, whereas NaN
# would be written as the non-standard token NaN.
df_global = df_global.replace(to_replace=[nan], value=[None])

In [None]:
# Columns kept in the exported records, in the order the keys are written.
columns_to_export = [
    "preferredLabel",
    "altLabels",
    "description",
    "hiddenLabels",
    "code",
    "category",
    "broaderConceptPT",
    "conceptUri",
]

# One dict per row of the combined frame, restricted to the exported columns.
json_data = df_global.loc[:, columns_to_export].to_dict(orient="records")

# The result is written next to the raw CSV files of the selected language.
output_path = os.path.join(data_path, LANG, "cleaned_occupations.json")
with open(output_path, mode="w", encoding="utf-8") as json_file:
    json.dump(
        json_data,
        json_file,
        ensure_ascii=False,  # write non-ASCII characters as-is, not as \u escapes
        indent=2,
        separators=(",", ": "),
    )