In [None]:
import pandas as pd
import os

# === Input / output locations ===
# All paths are resolved relative to the parent of the working directory.
base_dir = ".."
platinum_dir = os.path.join(base_dir, "platinum")
os.makedirs(platinum_dir, exist_ok=True)  # no error if the folder already exists

# === Load the three gold tables ===
asso = pd.read_parquet(
    os.path.join(
        base_dir, "association", "parquet", "gold",
        "df_gold_association_2004_2025_departement.parquet",
    )
)
crime = pd.read_parquet(
    os.path.join(
        base_dir, "crime", "parquet", "gold",
        "df_gold_crime_2016_2024_departement.parquet",
    )
)
emploi = pd.read_parquet(
    os.path.join(base_dir, "emploi", "parquet", "gold", "df_gold_FD_EEC.parquet")
)

# === Harmonize the join keys ===
# Upper-case every column label so the key columns are spelled the same
# way in all three tables.
for frame in (asso, crime, emploi):
    frame.columns = frame.columns.str.upper()

# ANNEE is a join key in both merges below: force it to int everywhere.
for frame in (asso, crime, emploi):
    frame["ANNEE"] = frame["ANNEE"].astype(int)

# REGION is only a join key between association and crime, so only those
# two tables are cast to str (emploi's REGION, if any, is left untouched).
# NOTE(review): DEPARTEMENT is also a join key but is not normalized here;
# confirm both tables already store it with the same dtype.
for frame in (asso, crime):
    frame["REGION"] = frame["REGION"].astype(str)

# === Crime x association ===
# Outer join: a (year, departement, region) present in only one of the two
# tables is kept, with the other table's columns left missing.
merged = crime.merge(asso, on=["ANNEE", "DEPARTEMENT", "REGION"], how="outer")

# === Attach emploi by year ===
# Left join on ANNEE alone: every emploi row of a given year is attached to
# every merged row of that year.
# NOTE(review): if emploi holds more than one row per year, this multiplies
# the rows of `merged` - confirm emploi is one row per ANNEE.
merged = merged.merge(emploi, on="ANNEE", how="left")

# === Write the platinum dataset ===
platinum_path = os.path.join(platinum_dir, "df_platinum_features.parquet")
merged.to_parquet(platinum_path, index=False)  # the index is not written

print("Platinum dataset saved to:", platinum_path)