In [1]:
import json
import os

import pandas as pd

In [2]:
# Project root relative to the notebook, and the folder holding the ROME files
base_path = "../"
data_path = os.path.join(base_path, "data", "rome")

In [3]:
# Read both raw CSV files, keeping every column as text (dtype=str)
appellations_csv = os.path.join(data_path, "cr_gd_dp_appellations.csv")
texts_csv = os.path.join(data_path, "texte.csv")

df = pd.read_csv(appellations_csv, dtype=str)
df_text = pd.read_csv(texts_csv, dtype=str)

In [4]:
# Forward fill missing hierarchical codes.
# Rows whose four hierarchy fields are all empty inherit them from the closest
# preceding row that carries its own values, so runs of consecutive empty rows
# all receive the same parent values.
#
# The assignment is done on `df` itself: writing to the `row` object yielded by
# `DataFrame.iterrows()` only changes a temporary copy and never reaches the
# frame, which silently turns the forward fill into a no-op.
hierarchy_columns = [
    "code_grand_domaine",
    "code_domaine_professionnel",
    "numero_fiche_rome",
    "intitule",
]
is_blank = df[hierarchy_columns].isna().all(axis=1)

# Positional index of the row each blank row inherits from. It stays NaN for
# blank rows at the very top of the frame: they have no parent and are left
# untouched instead of raising a KeyError on a non-existent previous row.
source_position = (
    pd.Series(range(len(df)), index=df.index).where(~is_blank).ffill()
)
fillable = is_blank & source_position.notna()

if fillable.any():
    parent_positions = source_position[fillable].astype(int).to_numpy()
    df.loc[fillable, hierarchy_columns] = (
        df[hierarchy_columns].iloc[parent_positions].to_numpy()
    )

In [None]:
# Build the composite code: the three hierarchy levels glued together,
# with a missing level contributing an empty string
level_columns = (
    "code_grand_domaine",
    "code_domaine_professionnel",
    "numero_fiche_rome",
)
grand_domaine_part, domaine_part, fiche_part = (
    df[column].fillna("") for column in level_columns
)
df["code"] = grand_domaine_part + domaine_part + fiche_part

In [6]:
# Keep only the rows where all three hierarchy codes and the long job title
# are present; everything else is discarded before grouping
required_columns = [
    "code_grand_domaine",
    "code_domaine_professionnel",
    "numero_fiche_rome",
    "libelle_appellation_long",
]
has_all_required = df[required_columns].notna().all(axis=1)
df_filtered = df[has_all_required]

In [7]:
# One row per composite code: scalar fields take the first value in the group,
# while the long and short job titles are collected into lists
aggregations = {
    "code_grand_domaine": "first",
    "code_domaine_professionnel": "first",
    "intitule": "first",
    "libelle_appellation_long": list,
    "libelle_appellation_court": list,
}
rows_by_code = df_filtered.groupby("code")
df_grouped = rows_by_code.agg(aggregations).reset_index()

In [8]:
# Create the keywords, category and description fields for every grouped row.
#
# Duplicates are removed with dict.fromkeys instead of set(): a set of strings
# has no stable iteration order between interpreter runs, which made the
# exported category/keywords order change from one run to the next.

# Title of the first row carrying each composite code. The parent levels are
# resolved through this lookup instead of rescanning `df` for every row.
# A code missing from the lookup raises KeyError.
first_row_per_code = df.drop_duplicates(subset="code")
intitule_by_code = dict(
    zip(first_row_per_code["code"], first_row_per_code["intitule"]),
)

# Text fragments of each code joined with single spaces, in file order.
# Rows without a code or without text are skipped.
description_by_code = (
    df_text.dropna(subset=["code_rome", "libelle_texte"])
    .groupby("code_rome", sort=False)["libelle_texte"]
    .agg(" ".join)
)

# Long and short job titles merged, first occurrence kept. Missing short titles
# (NaN) are dropped so they do not end up as `NaN` tokens in the JSON export.
df_grouped["keywords"] = [
    list(
        dict.fromkeys(
            label for label in long_labels + short_labels if pd.notna(label)
        ),
    )
    for long_labels, short_labels in zip(
        df_grouped["libelle_appellation_long"],
        df_grouped["libelle_appellation_court"],
    )
]

# Parent titles, grand domaine first, collapsed to one entry when identical.
df_grouped["category"] = [
    ", ".join(
        dict.fromkeys(
            [
                intitule_by_code[grand_domaine],
                intitule_by_code[grand_domaine + domaine_professionnel],
            ],
        ),
    )
    for grand_domaine, domaine_professionnel in zip(
        df_grouped["code_grand_domaine"],
        df_grouped["code_domaine_professionnel"],
    )
]

# Codes without any text get an empty description.
df_grouped["description"] = (
    df_grouped["code"].map(description_by_code).fillna("")
)

In [9]:
# Write the selected fields of every grouped row to cleaned.json as UTF-8,
# keeping non-ASCII characters unescaped
columns_to_export = ["code", "intitule", "category", "description", "keywords"]
json_data = df_grouped[columns_to_export].to_dict(orient="records")

output_path = os.path.join(data_path, "cleaned.json")
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(
        json_data,
        output_file,
        indent=2,
        ensure_ascii=False,
        separators=(",", ": "),
    )