In [1]:
# Bronze step association
import os
import glob
import pandas as pd
import re

# --- Configuration -----------------------------------------------------------
# NOTE: absolute, machine-specific root; every other directory derives from it.
base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
input_dir = os.path.join(base_dir, "association/raw")
valid_dir = os.path.join(base_dir, "association/valid")
parquet_dir = os.path.join(base_dir, "association/parquet/bronze")
expected_fields = 23
delimiter = ";"

# Department alternatives are ordered longest-first and followed by a
# "no further digit" lookahead. With the short `[0-9]{2}` alternative first and
# no trailing anchor, "dpt_971" was captured as "97", so every three-digit
# department ended up in the same "..._dpt_97.parquet" file and overwrote the
# previous one.
rna_filename_pattern = re.compile(
  r'rna_import_(\d{8})_dpt_(97[1-9][0-9]|[0-9]{3}|2A|2B|[0-9]{2})(?![0-9])'
)


def split_valid_lines(lines, expected_fields, delimiter):
  """Split raw text lines into well-formed and rejected ones.

  A line is well-formed when, once stripped of surrounding whitespace, it
  splits into exactly `expected_fields` parts on `delimiter`.

  Args:
    lines: iterable of raw text lines (an open text file works).
    expected_fields: number of delimiter-separated fields a line must have.
    delimiter: field separator.

  Returns:
    (valid, rejected): `valid` is the list of stripped well-formed lines,
    `rejected` is a list of `(line_number, stripped_line)` tuples, with line
    numbers starting at 1.
  """
  # NOTE(review): this is a plain split, so a quoted field that contains the
  # delimiter is counted as several fields, whereas pd.read_csv below honours
  # quoting. Confirm against the raw file format.
  valid, rejected = [], []
  for line_number, raw_line in enumerate(lines, start=1):
    stripped = raw_line.strip()
    if len(stripped.split(delimiter)) == expected_fields:
      valid.append(stripped)
    else:
      rejected.append((line_number, stripped))
  return valid, rejected


def parse_rna_filename(filename):
  """Extract `(year, department)` from an RNA import file name.

  Args:
    filename: base name such as "rna_import_20250501_dpt_01_valid.csv".

  Returns:
    A `(year, department)` tuple of strings, where `year` is the first four
    digits of the 8-digit date in the name, or None when the name does not
    follow the expected pattern.
  """
  match = rna_filename_pattern.search(filename)
  if match is None:
    return None
  import_date, dpt = match.groups()
  return import_date[:4], dpt


os.makedirs(valid_dir, exist_ok=True)
os.makedirs(parquet_dir, exist_ok=True)

# --- Step 1: keep only the lines that have the expected number of fields ------
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

for input_path in csv_files:
  filename = os.path.basename(input_path)
  # splitext only touches the real extension (str.replace would rewrite every
  # ".csv" occurrence in the name and miss an upper-case extension).
  output_path = os.path.join(valid_dir, os.path.splitext(filename)[0] + "_valid.csv")

  with open(input_path, "r", encoding="utf-8") as file:
    valid_lines, error_lines = split_valid_lines(file, expected_fields, delimiter)

  with open(output_path, "w", encoding="utf-8") as output_file:
    output_file.writelines(f"{line}\n" for line in valid_lines)

  # Rejected lines used to be collected and then silently discarded.
  if error_lines:
    print(f"{filename}: {len(error_lines)} line(s) rejected (expected {expected_fields} fields)")

# --- Step 2: convert every validated CSV to a bronze Parquet file ------------
valid_csv_files = glob.glob(os.path.join(valid_dir, "*.csv"))

for csv_path in valid_csv_files:
  filename = os.path.basename(csv_path)
  print(f"Processing file: {filename}")

  parsed = parse_rna_filename(filename)
  if parsed is None:
    print(f"[⚠️ Ignored file (unknown pattern) : {filename}")
    continue

  year, dpt = parsed
  print(f"Matched year: {year}, department: {dpt}")
  # NOTE(review): a "..._dpt_97.parquet" file written by an earlier run (before
  # the pattern fix) is stale and should be deleted from the bronze directory,
  # otherwise the later steps will read it as well.
  parquet_filename = f"df_bronze_association_{year}_dpt_{dpt}.parquet"
  parquet_path = os.path.join(parquet_dir, parquet_filename)

  # Everything is read as text; typing is left to the later steps.
  df = pd.read_csv(csv_path, sep=delimiter, dtype=str)
  df.to_parquet(parquet_path, index=False)
  print(f"Converted to parquet: {parquet_path}")


Processing file: rna_import_20250501_dpt_01_valid.csv
Matched year: 2025, department: 01
Converted to parquet: c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/bronze\df_bronze_association_2025_dpt_01.parquet
Processing file: rna_import_20250501_dpt_02_valid.csv
Matched year: 2025, department: 02
Converted to parquet: c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/bronze\df_bronze_association_2025_dpt_02.parquet
Processing file: rna_import_20250501_dpt_03_valid.csv
Matched year: 2025, department: 03
Converted to parquet: c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/bronze\df_bronze_association_2025_dpt_03.parquet
Processing file: rna_import_20250501_dpt_04_valid.csv
Matched year: 2025, department: 04
Converted to parquet: c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/bronze\df_bronze_association_2025_dpt_04.parquet
Processing file: rna_import_20250501_dpt_05_valid.csv
Matched year: 2025, department: 05
Converted to parquet: c:/Users/

In [2]:
# Silver step association
import pandas as pd
import os

# `base_dir` is expected to be defined already (the bronze cell sets it).
bronze_dir = os.path.join(base_dir, "association/parquet/bronze")
silver_dir = os.path.join(base_dir, "association/parquet/silver")

os.makedirs(silver_dir, exist_ok=True)

# Net effect of the renaming step, expressed as a single mapping.
# 'date_publi' ends up as 'creation'; a pre-existing 'publication' column
# would be renamed to 'creation' as well.
# NOTE(review): 'date_creat' is dropped below while 'date_publi' is exposed as
# 'creation' — confirm this is the intended source for the creation date.
column_renames = {
  'adrs_codepostal': 'cp',
  'date_publi': 'creation',
  'publication': 'creation',
  'libcom': 'commune',
  'maj_time': 'maj',
  'objet': 'resume',
  'titre': 'nom',
}

# Columns that are not carried over to the silver layer. Every one of them must
# exist in the frame: a missing column makes `drop` raise a KeyError.
dropped_columns = [
  "adr1", "adr2", "adr3",
  "date_creat",
  "dir_civilite",
  "gestion",
  "groupement",
  "id_ex",
  "nature",
  "objet_social1", "objet_social2",
  "observation",
  "position",
  "rup_mi",
  "siret",
  "siteweb",
]

for filename in os.listdir(bronze_dir):
  # Only Parquet files are processed; anything else in the folder is skipped.
  if not filename.endswith(".parquet"):
    continue

  file_path = os.path.join(bronze_dir, filename)
  df = pd.read_parquet(file_path)

  # Normalise headers: lower case, spaces replaced by underscores.
  df.columns = df.columns.str.lower().str.replace(' ', '_')

  # Deduplicate, discard fully empty rows, rename, then prune columns —
  # in that order.
  df = (
    df
    .drop_duplicates()
    .dropna(how='all')
    .rename(columns=column_renames)
    .drop(columns=dropped_columns)
  )

  # Same file name as the bronze input, with the layer prefix switched.
  silver_filename = filename.replace("df_bronze", "df_silver")
  silver_file_path = os.path.join(silver_dir, silver_filename)

  df.to_parquet(silver_file_path, index=False)
  print(f"✅ Silver files saved : {silver_filename}")


✅ Silver files saved : df_silver_association_2025_dpt_01.parquet
✅ Silver files saved : df_silver_association_2025_dpt_02.parquet
✅ Silver files saved : df_silver_association_2025_dpt_03.parquet
✅ Silver files saved : df_silver_association_2025_dpt_04.parquet
✅ Silver files saved : df_silver_association_2025_dpt_05.parquet
✅ Silver files saved : df_silver_association_2025_dpt_06.parquet
✅ Silver files saved : df_silver_association_2025_dpt_07.parquet
✅ Silver files saved : df_silver_association_2025_dpt_08.parquet
✅ Silver files saved : df_silver_association_2025_dpt_09.parquet
✅ Silver files saved : df_silver_association_2025_dpt_10.parquet
✅ Silver files saved : df_silver_association_2025_dpt_11.parquet
✅ Silver files saved : df_silver_association_2025_dpt_12.parquet
✅ Silver files saved : df_silver_association_2025_dpt_13.parquet
✅ Silver files saved : df_silver_association_2025_dpt_14.parquet
✅ Silver files saved : df_silver_association_2025_dpt_15.parquet
✅ Silver files saved : df

In [3]:
# Gold step association
import os
import pandas as pd

# NOTE: absolute, machine-specific root; the silver/gold folders derive from it.
base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
silver_dir = os.path.join(base_dir, "association/parquet/silver")
gold_dir = os.path.join(base_dir, "association/parquet/gold")
os.makedirs(gold_dir, exist_ok=True)

df_list = []

# Department code -> region code.
departement_to_region = {
  '01': '84', '02': '32', '03': '84', '04': '93', '05': '93', '06': '93',
  '07': '84', '08': '44', '09': '76', '10': '44', '11': '76', '12': '76',
  '13': '93', '14': '28', '15': '84', '16': '75', '17': '75', '18': '24',
  '19': '75', '2A': '94', '2B': '94', '21': '27', '22': '53', '23': '75',
  '24': '75', '25': '27', '26': '84', '27': '28', '28': '24', '29': '53',
  '30': '76', '31': '76', '32': '76', '33': '75', '34': '76', '35': '53',
  '36': '24', '37': '24', '38': '84', '39': '27', '40': '75', '41': '24',
  '42': '84', '43': '84', '44': '52', '45': '24', '46': '76', '47': '75',
  '48': '76', '49': '52', '50': '28', '51': '44', '52': '44', '53': '52',
  '54': '44', '55': '44', '56': '53', '57': '44', '58': '27', '59': '32',
  '60': '32', '61': '28', '62': '32', '63': '84', '64': '75', '65': '76',
  '66': '76', '67': '44', '68': '44', '69': '84', '70': '27', '71': '27',
  '72': '52', '73': '84', '74': '84', '75': '11', '76': '28', '77': '11',
  '78': '11', '79': '75', '80': '32', '81': '76', '82': '76', '83': '93',
  '84': '93', '85': '52', '86': '75', '87': '75', '88': '44', '89': '27',
  '90': '27', '91': '11', '92': '11', '93': '11', '94': '11', '95': '11',
  '971': '01', '972': '02', '973': '03', '974': '04', '976': '06'
}
valid_department = set(departement_to_region.keys())

columns_order = ["departement", "region", "annee", "nombre_nouvelle_asso", "cumul_global"]

# Corsican postal codes all start with "20"; the third digit tells the two
# departments apart.
corse_du_sud_prefixes = ["200", "201"]
haute_corse_prefixes = ["202", "203", "204", "205", "206"]


def postal_code_to_department(cp):
  """Map 5-digit postal codes to department codes.

  The first two digits are the department, except that:
    * codes starting with "97" use their first three digits ("971", "974", ...);
    * Corsican codes map to "2A" (200xx-201xx) or "2B" (202xx-206xx).

  Taking the first two characters only, as this step used to, can never yield
  "2A", "2B" or "971"-"976", so those departments were filtered out entirely.

  Args:
    cp: pandas Series of postal-code strings.

  Returns:
    A Series of department codes aligned on the index of `cp`. Codes that fit
    no rule keep their two-character prefix and are left for the caller to
    filter.
  """
  prefix2 = cp.str[:2]
  prefix3 = cp.str[:3]
  departement = prefix2.mask(prefix2 == "97", prefix3)
  departement = departement.mask(prefix3.isin(corse_du_sud_prefixes), "2A")
  departement = departement.mask(prefix3.isin(haute_corse_prefixes), "2B")
  return departement


def prepare_gold_rows(df_silver):
  """Keep rows with a usable postal code and add `departement` and `annee`.

  Args:
    df_silver: DataFrame with at least the `cp` and `maj` columns.

  Returns:
    A new DataFrame restricted to rows whose stripped `cp` is exactly five
    digits and whose department is a key of `departement_to_region`, with two
    extra columns: `departement` and `annee` (year parsed from `maj`, NaN when
    the date cannot be parsed).
  """
  # `assign` builds new frames, so no column is ever written on a filtered
  # slice (which triggers pandas' SettingWithCopyWarning).
  rows = df_silver.assign(cp=df_silver["cp"].astype(str).str.strip())
  rows = rows[rows["cp"].str.match(r"^\d{5}$")]

  rows = rows.assign(departement=postal_code_to_department(rows["cp"]))
  rows = rows[rows["departement"].isin(valid_department)]

  # NOTE(review): the year comes from `maj`, while the resulting count is
  # named "nombre_nouvelle_asso"; confirm `maj` rather than `creation` is the
  # intended date.
  return rows.assign(annee=pd.to_datetime(rows["maj"], errors="coerce").dt.year)


def aggregate_by_department_year(df_rows):
  """Count rows per (departement, annee) and add region and running total.

  Args:
    df_rows: DataFrame with `departement` and `annee` columns.

  Returns:
    A DataFrame with the columns listed in `columns_order`, sorted by
    department then year. Rows without a year are not counted.
  """
  counts = (
    df_rows.groupby(["departement", "annee"])
    .size()
    .reset_index(name="nombre_nouvelle_asso")
    .sort_values(by=["departement", "annee"])
  )
  # groupby has already discarded missing years, so the cast is safe; it keeps
  # the year an integer whether or not some dates failed to parse.
  counts["annee"] = counts["annee"].astype("int64")
  counts["region"] = counts["departement"].map(departement_to_region)
  counts["cumul_global"] = counts.groupby("departement")["nombre_nouvelle_asso"].cumsum()
  return counts[columns_order]


# Load the silver Parquet files. A missing folder is handled like an empty one.
silver_files = os.listdir(silver_dir) if os.path.isdir(silver_dir) else []
for filename in silver_files:
  if filename.endswith(".parquet"):
    file_path = os.path.join(silver_dir, filename)
    df = pd.read_parquet(file_path)
    df_list.append(df)

if df_list:
  df_gold = prepare_gold_rows(pd.concat(df_list, ignore_index=True))
  df_agg = aggregate_by_department_year(df_gold)

  agg_filename = "df_gold_association_2004_2025_departement.parquet"
  agg_file_path = os.path.join(gold_dir, agg_filename)
  df_agg.to_parquet(agg_file_path, index=False)
  print(f"📊 Aggregated file saved : {agg_filename}")
else:
  print(f"No silver parquet files found in {silver_dir}; nothing to aggregate.")


📊 Aggregated file saved : df_gold_association_2004_2025_departement.parquet
