In [None]:
import pandas as pd
import os
import glob

# --- Bronze stage: convert each raw EEC CSV extract into a typed parquet file ---
# Directory layout, all relative to the notebook location.
base_input_dir = os.path.join("..", "emploi")
bronze_output_dir = os.path.join(base_input_dir, "parquet", "bronze")
metadata_dir = os.path.join(base_input_dir, "metadata")
os.makedirs(bronze_output_dir, exist_ok=True)

# Maps the TYPE_VAR labels read from the metadata files to the dtype applied below.
# Labels that are not listed here fall back to 'string' (see dtype_mapping).
type_map = {
    'NUM': 'float',
    'CHAR': 'string',
    'DATE': 'datetime'
}

for file_path in glob.glob(os.path.join(base_input_dir, "raw", "FD_csv_EEC*.csv")):
    filename = os.path.basename(file_path)
    year_suffix = filename.replace("FD_csv_EEC", "").replace(".csv", "")

    # Guard clause: files whose suffix is not purely numeric are reported and skipped.
    if not year_suffix.isdigit():
        print(f"Could not extract year from filename: {filename}")
        continue

    print(f"Processing file for year {year_suffix}")

    # A four-digit suffix is already a full year; shorter suffixes are expanded to 20xx.
    full_year = year_suffix if len(year_suffix) == 4 else f"20{year_suffix.zfill(2)}"
    metadata_file = os.path.join(metadata_dir, f"Varmod_EEC_{full_year}.csv")
    if not os.path.exists(metadata_file):
        print(f"Metadata file not found: {metadata_file}, skipping.")
        continue

    metadata_df = pd.read_csv(metadata_file, sep=';', encoding='utf-8')
    unique_vars = metadata_df[['COD_VAR', 'TYPE_VAR']].drop_duplicates()

    # Variable code -> target dtype. If a code appears with several TYPE_VAR
    # values, the last one wins (same as the previous row-by-row construction).
    dtype_mapping = {
        cod_var: type_map.get(type_var, 'string')
        for cod_var, type_var in zip(unique_vars['COD_VAR'], unique_vars['TYPE_VAR'])
    }

    # Read only the header first so that dtypes are requested solely for columns
    # that actually exist in this extract.
    csv_columns = set(pd.read_csv(file_path, sep=';', encoding='utf-8', nrows=0).columns)
    string_dtypes = {
        col: 'string'
        for col, dtype in dtype_mapping.items()
        if dtype == 'string' and col in csv_columns
    }

    # String columns must be typed at parse time: letting pandas infer a numeric
    # type first and casting afterwards would turn a code such as "01" into "1"
    # (or "1.0" when the column contains missing values).
    # low_memory=False makes type inference use the whole column instead of
    # per-chunk guesses, which avoids mixed-type object columns.
    df = pd.read_csv(
        file_path,
        sep=';',
        encoding='utf-8',
        dtype=string_dtypes,
        low_memory=False,
    )

    for col, dtype in dtype_mapping.items():
        # Skip variables absent from this extract and those already typed by the parser.
        if col not in df.columns or col in string_dtypes:
            continue
        try:
            if dtype == 'datetime':
                # Unparseable values become NaT rather than raising.
                df[col] = pd.to_datetime(df[col], errors='coerce')
            else:
                df[col] = df[col].astype(dtype)
        except Exception as e:
            # Best effort: a failed conversion is reported and the column keeps
            # the dtype it was read with.
            print(f"Error converting {col} to {dtype}: {e}")

    output_parquet = os.path.join(bronze_output_dir, f"df_bronze_FD_EEC{year_suffix}.parquet")
    df.to_parquet(output_parquet, index=False)
    print(f"Saved: {output_parquet}")


In [None]:
import pandas as pd
import os
import glob

# --- Silver stage: basic cleaning of every bronze parquet file ---
# NOTE: relies on `base_input_dir` defined in the bronze cell above.
bronze_dir = os.path.join(base_input_dir, "parquet", "bronze")
silver_dir = os.path.join(base_input_dir, "parquet", "silver")
os.makedirs(silver_dir, exist_ok=True)

for bronze_path in glob.glob(os.path.join(bronze_dir, "df_bronze_FD_EEC*.parquet")):
    # The year suffix is whatever sits between the last "EEC" and the first "."
    # of the file name, e.g. "df_bronze_FD_EEC19.parquet" -> "19".
    year_str = os.path.basename(bronze_path).split("EEC")[-1].split(".")[0]
    print(f"\nProcessing year: {year_str}")

    # Drop fully empty rows, then exact duplicate rows. Chained (non in-place)
    # calls keep the cell idempotent when it is re-run.
    df = pd.read_parquet(bronze_path).dropna(how='all').drop_duplicates()

    if 'ANNEE' in df.columns:
        # Rows without a survey year cannot be attributed to any year downstream.
        df = df.dropna(subset=['ANNEE'])
    else:
        # Rebuild the year from the file name using the same rule as the bronze
        # stage (four digits kept as is, shorter suffixes expanded to 20xx), so
        # the value is a full year such as 2019 rather than the bare suffix 19.
        # int() raises ValueError if the suffix is not numeric.
        full_year = year_str if len(year_str) == 4 else f"20{year_str.zfill(2)}"
        df = df.assign(ANNEE=int(full_year))

    silver_path = os.path.join(silver_dir, f"df_silver_FD_EEC{year_str}.parquet")
    df.to_parquet(silver_path, index=False)

    print(f"Saved Silver parquet: {silver_path} ({df.shape[0]} rows)")


In [None]:
import pandas as pd
import os
import glob

# --- Gold stage: yearly weighted unemployment rate from the silver files ---
# NOTE: relies on `base_input_dir` defined in the bronze cell above.
silver_dir = os.path.join(base_input_dir, "parquet", "silver")
gold_dir = os.path.join(base_input_dir, "parquet", "gold")
os.makedirs(gold_dir, exist_ok=True)
summary_output_path = os.path.join(gold_dir, "df_gold_FD_EEC.parquet")

# Columns kept from each silver file when present.
required_columns = ['AGE6', 'SEXE', 'ACTEU', 'EXTRIAN', 'STATUT', 'STATUTDET', 'TPPRED', 'ANNEE']
# Columns the rate computation below cannot do without.
essential_columns = ['ANNEE', 'ACTEU', 'EXTRIAN']
df_all = []

for silver_path in glob.glob(os.path.join(silver_dir, "df_silver_FD_EEC*.parquet")):
    print(f"Reading {os.path.basename(silver_path)}")
    df = pd.read_parquet(silver_path)

    # A file lacking one of the essential columns would otherwise fail with a
    # bare KeyError a few lines below; report it and move on instead.
    missing_cols = [col for col in essential_columns if col not in df.columns]
    if missing_cols:
        print(f"Missing columns {missing_cols} in {os.path.basename(silver_path)}, skipping.")
        continue

    available_cols = [col for col in required_columns if col in df.columns]
    # Single .loc selection plus an explicit copy, so the assignments below
    # write to an independent frame (no SettingWithCopyWarning).
    df = df.loc[df['ANNEE'].notna(), available_cols].copy()

    df['ANNEE'] = df['ANNEE'].astype(int)
    # Non-numeric or missing weights count as 0 in the weighted sums.
    df['EXTRIAN'] = pd.to_numeric(df['EXTRIAN'], errors='coerce').fillna(0)

    df_all.append(df)

if not df_all:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(f"No usable silver parquet files found in {silver_dir}")

df_all_years = pd.concat(df_all, ignore_index=True)

# Normalise dtypes once more after concatenation, in case the yearly files
# carried different dtypes for these columns.
df_all_years['EXTRIAN'] = pd.to_numeric(df_all_years['EXTRIAN'], errors='coerce')
df_all_years['ACTEU'] = pd.to_numeric(df_all_years['ACTEU'], errors='coerce')

# ACTEU == 2 rows are treated as unemployed; ACTEU in {1, 2} as the active population.
df_chomeurs = df_all_years[df_all_years['ACTEU'] == 2]
df_actifs = df_all_years[df_all_years['ACTEU'].isin([1, 2])]

sum_actifs = df_actifs.groupby('ANNEE')['EXTRIAN'].sum()
# Align on the years of the active population: a year with active rows but no
# unemployed rows gets a total of 0 (rate 0) instead of NaN from index alignment.
sum_chomeurs = (
    df_chomeurs.groupby('ANNEE')['EXTRIAN'].sum()
    .reindex(sum_actifs.index, fill_value=0)
)

# Weighted unemployment rate in percent, one row per year.
summary = pd.DataFrame({
    'taux_chomage': 100 * sum_chomeurs / sum_actifs
}).reset_index()

summary['taux_chomage'] = summary['taux_chomage'].round(2)

summary.to_parquet(summary_output_path, index=False)

print(f"Gold parquet saved to: {summary_output_path}")
print(summary)
