In [10]:
import json
import os

import pandas as pd
from numpy import nan

In [11]:
# Location of the ESCO data directory, expressed relative to the current
# working directory.
base_path = "../"
data_path = os.path.join(base_path, "data", "esco")

In [12]:
# Language of the ESCO export to process.
# The notebook works for both the English ("en") and the French ("fr")
# versions of ESCO: edit LANG by hand, then re-run the whole notebook.
LANG = "fr"

In [13]:
# Load raw data files.
# LANG is used as-is for the sub-directory name and lower-cased for the
# file-name suffix.
lang_dir = os.path.join(data_path, LANG)
lang_code = LANG.lower()

# Occupations: iscoGroup is read as str instead of letting pandas infer a dtype.
df = pd.read_csv(
    os.path.join(lang_dir, f"occupations_{lang_code}.csv"),
    encoding="utf-8",
    dtype={"iscoGroup": str},
)

# Skills collections shipped next to the occupations file.
FILES = [
    f"researchSkillsCollection_{lang_code}.csv",
    f"transversalSkillsCollection_{lang_code}.csv",
    f"languageSkillsCollection_{lang_code}.csv",
    f"greenSkillsCollection_{lang_code}.csv",
    f"digitalSkillsCollection_{lang_code}.csv",
    f"digCompSkillsCollection_{lang_code}.csv",
]

# Read every collection first and concatenate once. Growing a frame with
# pd.concat inside a loop re-copies all accumulated rows on each pass, and
# seeding it with an empty DataFrame relies on concat behaviour that recent
# pandas versions deprecate. The per-file row indices are kept (no
# ignore_index), exactly as before.
df_others = pd.concat(
    [
        pd.read_csv(os.path.join(lang_dir, file_name), encoding="utf-8")
        for file_name in FILES
    ],
)

In [16]:
# Stack the occupations frame and the skills-collection frame row-wise, and
# renumber the rows from 0 so the combined index contains no duplicates.
df_global = pd.concat(
    [df, df_others],
    axis=0,
    ignore_index=True,
)

In [17]:
# Turn NaN (pandas' missing-value marker) into None so that missing values
# are serialised as JSON null instead of the non-standard NaN token.
df_global = df_global.replace([nan], [None])

In [18]:
# Fields kept for every record of the exported JSON file.
columns_to_export = [
    "preferredLabel",
    "altLabels",
    "description",
    "conceptUri",
    "hiddenLabels",
    "broaderConceptPT",
]

# One dict per row, restricted to the exported columns.
json_data = df_global[columns_to_export].to_dict(orient="records")

# Written next to the raw CSV files of the selected language.
# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 output
# instead of escaping them as \uXXXX sequences.
output_path = os.path.join(data_path, LANG, "cleaned_skills.json")
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(
        json_data,
        json_file,
        indent=2,
        ensure_ascii=False,
        separators=(",", ": "),
    )