In [1]:
import pandas as pd
import os
import glob

# === Input (raw CSV) and output (bronze Parquet) locations ===
base_input_dir = "FD_csv_EEC"
bronze_output_dir = os.path.join("emploi", "parquet", "bronze")
os.makedirs(bronze_output_dir, exist_ok=True)

# === Variable dictionary: read once, reused for every yearly file ===
metadata_file = os.path.join(base_input_dir, "Varmod_EEC.csv")
metadata_df = pd.read_csv(metadata_file, sep=';', encoding='utf-8')

# One row per distinct (variable code, declared type) pair.
unique_vars = metadata_df.loc[:, ['COD_VAR', 'TYPE_VAR']].drop_duplicates()

# Declared metadata type -> dtype label used by the conversion loop.
type_map = dict(NUM='float', CHAR='string', DATE='datetime')

# Variable code -> target dtype label; any type missing from type_map
# falls back to 'string'.
dtype_mapping = {
    var_code: type_map.get(declared_type, 'string')
    for var_code, declared_type in zip(unique_vars['COD_VAR'], unique_vars['TYPE_VAR'])
}

# === Process each year's CSV ===
# Dtype labels (from dtype_mapping) whose columns must be parsed as text.
# If pandas infers a numeric dtype first, code values lose leading zeros
# ("01" -> 1 -> "1"), code columns with blanks turn into floats ("1" -> "1.0"),
# and integer-looking dates are later read by pd.to_datetime as nanosecond
# offsets from the epoch. Casting to 'string' afterwards cannot undo that.
text_like_dtypes = ('string', 'datetime')

# sorted(): glob order depends on the filesystem; sorting makes the run order
# (and the log output) deterministic.
for file_path in sorted(glob.glob(os.path.join(base_input_dir, "FD_EEC_*.csv"))):
    # Year token = last "_"-separated part of the file name, without extension.
    year_str = os.path.basename(file_path).split("_")[-1].split(".")[0]
    print(f"Processing file for year {year_str}")

    # Read only the header first so the dtype override names just the
    # columns that actually exist in this file.
    header = pd.read_csv(file_path, sep=';', encoding='utf-8', nrows=0).columns
    text_columns = {
        col: 'string'
        for col in header
        if dtype_mapping.get(col) in text_like_dtypes
    }

    df = pd.read_csv(file_path, sep=';', encoding='utf-8', dtype=text_columns)

    # Fall back to the year taken from the file name when the file has no
    # ANNEE column of its own.
    if 'ANNEE' not in df.columns:
        df['ANNEE'] = int(year_str)

    # Apply the declared dtype to every known column present in this file.
    # Conversion is best-effort: a failure is reported and the column is left
    # with whatever dtype it had.
    for col, dtype in dtype_mapping.items():
        if col not in df.columns:
            continue
        try:
            if dtype == 'datetime':
                # Unparseable values become NaT instead of raising.
                df[col] = pd.to_datetime(df[col], errors='coerce')
            else:
                df[col] = df[col].astype(dtype)
        except Exception as exc:
            print(f"Error converting {col} to {dtype}: {exc}")

    # Save as Parquet
    output_parquet = os.path.join(bronze_output_dir, f"df_bronze_emploi_EEC{year_str}.parquet")
    df.to_parquet(output_parquet, index=False)
    print(f"Saved: {output_parquet}")


Processing file for year 2018
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2018.parquet
Processing file for year 2019
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2019.parquet
Processing file for year 2020
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2020.parquet
Processing file for year 2021
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2021.parquet
Processing file for year 2022
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2022.parquet
Processing file for year 2023
Saved: emploi\parquet\bronze\df_bronze_emploi_EEC2023.parquet


In [2]:
import pandas as pd
import os
import glob

# === Bronze (input) and Silver (output) Parquet folders ===
bronze_dir = os.path.join("emploi", "parquet", "bronze")
silver_dir = os.path.join("emploi", "parquet", "silver")
os.makedirs(silver_dir, exist_ok=True)

# === Clean every Bronze file and write its Silver counterpart ===
bronze_pattern = os.path.join(bronze_dir, "df_bronze_emploi_EEC*.parquet")
for bronze_path in glob.glob(bronze_pattern):
    # Year token: the text between the last "EEC" and the following dot.
    file_name = os.path.basename(bronze_path)
    year_str = file_name.split("EEC")[-1].split(".")[0]
    print(f"\nProcessing year: {year_str}")

    # Remove rows that are entirely empty, then exact duplicate rows.
    df = pd.read_parquet(bronze_path).dropna(how='all').drop_duplicates()

    if 'ANNEE' not in df.columns:
        # No year column in the file: take the year from the file name.
        df = df.assign(ANNEE=int(year_str))
    else:
        # Year column present: discard rows where it is missing.
        df = df.dropna(subset=['ANNEE'])

    # Write the cleaned frame to the Silver folder.
    silver_path = os.path.join(silver_dir, f"df_silver_emploi_EEC{year_str}.parquet")
    df.to_parquet(silver_path, index=False)

    print(f"Saved cleaned Silver file: {silver_path} ({df.shape[0]} rows)")



Processing year: 2018
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2018.parquet (423283 rows)

Processing year: 2019
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2019.parquet (415031 rows)

Processing year: 2020
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2020.parquet (318427 rows)

Processing year: 2021
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2021.parquet (343304 rows)

Processing year: 2022
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2022.parquet (348964 rows)

Processing year: 2023
Saved cleaned Silver file: emploi\parquet\silver\df_silver_emploi_EEC2023.parquet (348624 rows)


In [3]:
import pandas as pd
import os
import glob

# === Paths ===
silver_dir = os.path.join("emploi", "parquet", "silver")
gold_dir = os.path.join("emploi", "parquet", "gold")
os.makedirs(gold_dir, exist_ok=True)
summary_output_path = os.path.join(gold_dir, "df_gold_emploi_summary.parquet")

# === Collect data from all Silver files ===
# Columns kept from each Silver file when present.
required_columns = ['AGE6', 'SEXE', 'ACTEU', 'EXTRIAN', 'STATUT', 'STATUTDET', 'TPPRED', 'ANNEE']
# Columns the indicator below cannot be computed without.
essential_columns = ['ANNEE', 'ACTEU', 'EXTRIAN']
df_all = []

# sorted(): deterministic read order regardless of the filesystem.
for silver_path in sorted(glob.glob(os.path.join(silver_dir, "df_silver_emploi_EEC*.parquet"))):
    print(f"Reading {os.path.basename(silver_path)}")
    df = pd.read_parquet(silver_path)

    # Fail early, naming the file, instead of hitting a bare KeyError later.
    missing_cols = [col for col in essential_columns if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"{os.path.basename(silver_path)} is missing required column(s): {missing_cols}"
        )

    # Keep only the required columns that exist, and only rows with a year.
    # .copy() makes the result an independent frame, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    available_cols = [col for col in required_columns if col in df.columns]
    df = df.loc[df['ANNEE'].notna(), available_cols].copy()

    # Normalise the columns used by the computation:
    # - ANNEE as integer (grouping key)
    # - EXTRIAN as numeric; unparseable or missing values count as 0 in the sums
    # - ACTEU as numeric; unparseable values become NaN and match no filter
    df['ANNEE'] = df['ANNEE'].astype(int)
    df['EXTRIAN'] = pd.to_numeric(df['EXTRIAN'], errors='coerce').fillna(0)
    df['ACTEU'] = pd.to_numeric(df['ACTEU'], errors='coerce')

    df_all.append(df)

if not df_all:
    # pd.concat([]) would raise an uninformative "No objects to concatenate".
    raise FileNotFoundError(f"No Silver Parquet files found in {silver_dir}")

# Combine all years
df_all_years = pd.concat(df_all, ignore_index=True)

# === Compute indicators grouped by ANNEE ===
# Numerator rows: ACTEU == 2; denominator rows: ACTEU in {1, 2}.
# NOTE(review): presumably ACTEU 1 = employed and 2 = unemployed, with EXTRIAN
# as the survey weight - confirm against the variable dictionary.
df_chomeurs = df_all_years[df_all_years['ACTEU'] == 2]
df_actifs = df_all_years[df_all_years['ACTEU'].isin([1, 2])]

# Sums of EXTRIAN per year
sum_chomeurs = df_chomeurs.groupby('ANNEE')['EXTRIAN'].sum()
sum_actifs = df_actifs.groupby('ANNEE')['EXTRIAN'].sum()

# Align the numerator on the denominator's years: a year with active rows but
# no ACTEU == 2 rows gets a rate of 0 rather than NaN from index misalignment.
sum_chomeurs = sum_chomeurs.reindex(sum_actifs.index, fill_value=0)

# Rate in percent, rounded to two decimals. A zero denominator yields NaN or
# inf, not an exception.
summary = pd.DataFrame({
    'taux_chomage': (100 * sum_chomeurs / sum_actifs).round(2)
}).reset_index()

# === Save summary ===
summary.to_parquet(summary_output_path, index=False)

print(f"Weighted employment summary saved to: {summary_output_path}")
print(summary)


Reading df_silver_emploi_EEC2018.parquet
Reading df_silver_emploi_EEC2019.parquet
Reading df_silver_emploi_EEC2020.parquet
Reading df_silver_emploi_EEC2021.parquet
Reading df_silver_emploi_EEC2022.parquet
Reading df_silver_emploi_EEC2023.parquet
Weighted employment summary saved to: emploi\parquet\gold\df_gold_emploi_summary.parquet
   ANNEE  taux_chomage
0   2018          9.06
1   2019          8.44
2   2020          8.01
3   2021          7.86
4   2022          7.31
5   2023          7.33
