In [None]:
# Bronze step association
import os
import glob
import pandas as pd
import re

# --- Configuration -----------------------------------------------------------
base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
input_dir = os.path.join(base_dir, "association/raw")
valid_dir = os.path.join(base_dir, "association/valid")
parquet_dir = os.path.join(base_dir, "association/parquet/bronze")
expected_fields = 23  # a raw line is kept only if it splits into exactly this many fields
delimiter = ";"

# Alternatives are ordered longest-first on purpose: `re` tries alternatives
# left to right and keeps the first one that matches, so putting `[0-9]{2}`
# first would capture "97" out of "971" and make every three-digit department
# write to the same "..._dpt_97.parquet" file.
rna_filename_pattern = re.compile(
  r'rna_import_(\d{8})_dpt_(97[1-9][0-9]|[0-9]{3}|[0-9]{2}|2A|2B)'
)


def parse_rna_filename(filename):
  """Extract (year, department) from an RNA import file name.

  Args:
    filename: file name containing `rna_import_<8 digits>_dpt_<code>`.

  Returns:
    A `(year, department)` tuple of strings, where `year` is the first four
    of the eight digits, or `None` when the name does not match the pattern.
  """
  match = rna_filename_pattern.search(filename)
  if match is None:
    return None
  return match.group(1)[:4], match.group(2)


os.makedirs(valid_dir, exist_ok=True)
os.makedirs(parquet_dir, exist_ok=True)

# --- Step 1: keep only the lines that have the expected number of fields -----
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

for input_path in csv_files:
  filename = os.path.basename(input_path)
  # splitext only touches the final extension, unlike str.replace which would
  # rewrite every ".csv" occurrence inside the name.
  stem, extension = os.path.splitext(filename)
  output_path = os.path.join(valid_dir, f"{stem}_valid{extension}")

  valid_lines = []
  error_lines = []

  with open(input_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
      stripped = line.strip()

      # Plain split on the delimiter: quoting is not interpreted here.
      if len(stripped.split(delimiter)) == expected_fields:
        valid_lines.append(stripped)
      else:
        error_lines.append((line_number, stripped))

  with open(output_path, "w", encoding="utf-8") as output_file:
    for line in valid_lines:
      output_file.write(line + "\n")

  # Rejected lines are excluded from the output; report them so the loss is
  # visible instead of silent. Details stay available in `error_lines`.
  if error_lines:
    print(
      f"{filename}: skipped {len(error_lines)} line(s) "
      f"without exactly {expected_fields} fields"
    )

# --- Step 2: convert every validated CSV to a bronze parquet file ------------
valid_csv_files = glob.glob(os.path.join(valid_dir, "*.csv"))

for csv_path in valid_csv_files:
  filename = os.path.basename(csv_path)
  print(f"Processing file: {filename}")

  parsed = parse_rna_filename(filename)
  if parsed is None:
    print(f"[⚠️ Ignored file (unknown pattern) : {filename}")
    continue

  year, dpt = parsed
  print(f"Matched year: {year}, department: {dpt}")
  # NOTE(review): only the year is kept in the output name, so two imports of
  # the same department within one year write to the same parquet file.
  parquet_filename = f"df_bronze_association_{year}_dpt_{dpt}.parquet"
  parquet_path = os.path.join(parquet_dir, parquet_filename)

  # dtype=str keeps every column as text (no numeric inference on codes).
  df = pd.read_csv(csv_path, sep=delimiter, dtype=str)
  df.to_parquet(parquet_path, index=False)
  print(f"Converted to parquet: {parquet_path}")


In [None]:
# Silver step association
import pandas as pd
import os

bronze_dir = os.path.join(base_dir, "association/parquet/bronze")
silver_dir = os.path.join(base_dir, "association/parquet/silver")

os.makedirs(silver_dir, exist_ok=True)

# Final column names after cleaning. 'date_publi' and any pre-existing
# 'publication' column both end up as 'creation'.
# NOTE(review): 'date_creat' is dropped below while 'date_publi' is the column
# that gets named 'creation' — confirm this is the intended date.
column_renames = {
  'adrs_codepostal': 'cp',
  'date_publi': 'creation',
  'publication': 'creation',
  'libcom': 'commune',
  'maj_time': 'maj',
  'objet': 'resume',
  'titre': 'nom',
}

# Columns removed from the silver output; a missing one raises KeyError.
dropped_columns = [
  "adr1", "adr2", "adr3",
  "date_creat",
  "dir_civilite",
  "gestion",
  "groupement",
  "id_ex",
  "nature",
  "objet_social1", "objet_social2",
  "observation",
  "position",
  "rup_mi",
  "siret",
  "siteweb",
]

for filename in os.listdir(bronze_dir):
  # Only parquet files are processed; anything else in the folder is skipped.
  if not filename.endswith(".parquet"):
    continue

  file_path = os.path.join(bronze_dir, filename)
  df = pd.read_parquet(file_path)

  # Normalise headers: lower case, spaces turned into underscores.
  df.columns = df.columns.str.lower().str.replace(' ', '_')

  # Deduplicate, discard fully empty rows, then rename and prune columns.
  df = (
    df.drop_duplicates()
      .dropna(how='all')
      .rename(columns=column_renames)
      .drop(columns=dropped_columns)
  )

  # Output keeps the bronze file name with 'df_bronze' swapped for 'df_silver'.
  silver_filename = filename.replace("df_bronze", "df_silver")
  silver_file_path = os.path.join(silver_dir, silver_filename)

  df.to_parquet(silver_file_path, index=False)
  print(f"✅ Silver files saved : {silver_filename}")


In [None]:
# Gold step association
import os
import pandas as pd

base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
silver_dir = os.path.join(base_dir, "association/parquet/silver")
gold_dir = os.path.join(base_dir, "association/parquet/gold")
os.makedirs(gold_dir, exist_ok=True)


def prepare_gold_frame(df):
  """Keep rows with a 5-digit postal code and derive department and year.

  Args:
    df: DataFrame with at least a 'cp' (postal code) and a 'maj' column.

  Returns:
    A new DataFrame (the input is not modified) restricted to rows whose
    stripped 'cp' is exactly five digits, with two added columns:
    'departement' (postal-code prefix) and 'annee' (year parsed from 'maj';
    missing when 'maj' cannot be parsed as a date).
  """
  cleaned = df.assign(cp=df["cp"].astype(str).str.strip())
  # Explicit copy: the columns added below must not be written onto a
  # filtered slice of another frame (SettingWithCopyWarning hazard).
  cleaned = cleaned[cleaned["cp"].str.match(r"^\d{5}$")].copy()

  prefix2 = cleaned["cp"].str[:2]
  # Overseas postal codes (97xxx / 98xxx) carry a three-digit department
  # code; cutting them at two characters would merge them all into "97"/"98".
  # NOTE(review): Corsican codes (20xxx) still yield "20", not "2A"/"2B" —
  # confirm what downstream joins expect.
  is_overseas = prefix2.isin(["97", "98"])
  cleaned["departement"] = prefix2.where(~is_overseas, cleaned["cp"].str[:3])

  cleaned["annee"] = pd.to_datetime(cleaned["maj"], errors="coerce").dt.year
  return cleaned


def aggregate_by_department_year(df_gold):
  """Count rows per (departement, annee) with a running total per department.

  Args:
    df_gold: DataFrame with 'departement' and 'annee' columns.

  Returns:
    DataFrame sorted by departement then annee with columns 'departement',
    'annee', 'nombre' (row count) and 'cumul_global' (cumulative sum of
    'nombre' within each departement). Rows with a missing 'annee' are left
    out by groupby's default handling of missing keys.
  """
  df_agg = df_gold.groupby(["departement", "annee"]).size().reset_index(name="nombre")
  df_agg = df_agg.sort_values(by=["departement", "annee"])
  df_agg["cumul_global"] = df_agg.groupby("departement")["nombre"].cumsum()
  return df_agg


df_list = []

# Load silver parquet files (sorted so the load order is deterministic).
if os.path.isdir(silver_dir):
  for filename in sorted(os.listdir(silver_dir)):
    if filename.endswith(".parquet"):
      file_path = os.path.join(silver_dir, filename)
      df = pd.read_parquet(file_path)
      df_list.append(df)

if df_list:
  df_gold = prepare_gold_frame(pd.concat(df_list, ignore_index=True))
  df_agg = aggregate_by_department_year(df_gold)

  agg_filename = "df_gold_association_2025_aggregated.parquet"
  agg_file_path = os.path.join(gold_dir, agg_filename)
  df_agg.to_parquet(agg_file_path, index=False)
  print(f"📊 Aggregated file saved : {agg_filename}")
else:
  # Make the "nothing to do" case visible instead of ending silently.
  print(f"No silver parquet files found in {silver_dir}; nothing to aggregate")


📊 Fichier agrégé sauvegardé : df_gold_association_2025_aggregated.parquet
