In [31]:
# Bronze step association
import os
import glob
import pandas as pd
import re

# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
input_dir = os.path.join(base_dir, "association/raw")
valid_dir = os.path.join(base_dir, "association/valid")
parquet_dir = os.path.join(base_dir, "association/parquet/bronze")
expected_fields = 23  # a line is kept only if it splits into exactly this many fields
delimiter = ";"

# Eight digits (the first four are captured as the year), then "_dpt_", then a
# department code: two or three digits, or 2A / 2B.
# Three-digit codes must be captured whole: matching only their first two
# digits would give every 97x file the same "dpt_97" output name, so each
# conversion would silently overwrite the previous one.
bronze_name_pattern = re.compile(r'(\d{4})\d{4}_dpt_(\d{2,3}|2A|2B)')


def split_valid_lines(input_path, field_count, sep):
  """Split the lines of a UTF-8 text file into well-formed and rejected ones.

  A line is well-formed when, once stripped of surrounding whitespace, it
  splits on ``sep`` into exactly ``field_count`` fields.

  NOTE(review): fields are counted with a plain ``str.split``; a quoted field
  that contains ``sep`` is therefore counted as several fields. Confirm the
  raw files never rely on quoting.

  Args:
    input_path: path of the file to read.
    field_count: number of fields a line must have to be kept.
    sep: field delimiter.

  Returns:
    A ``(valid_lines, error_lines)`` tuple. ``valid_lines`` is a list of
    stripped lines; ``error_lines`` is a list of ``(line_number, stripped_line)``
    tuples, with line numbers starting at 1.
  """
  valid_lines = []
  error_lines = []
  with open(input_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
      stripped = line.strip()
      if len(stripped.split(sep)) == field_count:
        valid_lines.append(stripped)
      else:
        error_lines.append((line_number, stripped))
  return valid_lines, error_lines


def bronze_parquet_name(filename):
  """Build the bronze Parquet file name for a validated CSV file name.

  Args:
    filename: base name of the CSV file (no directory part needed).

  Returns:
    ``"df_bronze_association_<year>_dpt_<code>.parquet"``, or ``None`` when
    ``filename`` does not contain the expected ``<8 digits>_dpt_<code>`` part.
  """
  match = bronze_name_pattern.search(filename)
  if match is None:
    return None
  year, dpt = match.groups()
  return f"df_bronze_association_{year}_dpt_{dpt}.parquet"


os.makedirs(valid_dir, exist_ok=True)
os.makedirs(parquet_dir, exist_ok=True)

# --- Stage 1: keep only the lines that have the expected number of fields ----
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

for input_path in csv_files:
  filename = os.path.basename(input_path)
  output_path = os.path.join(valid_dir, filename.replace(".csv", "_valid.csv"))

  valid_lines, error_lines = split_valid_lines(input_path, expected_fields, delimiter)

  with open(output_path, "w", encoding="utf-8") as output_file:
    output_file.writelines(f"{line}\n" for line in valid_lines)

  # Rejected lines used to be dropped without any trace; report how many were
  # lost so a badly formatted source file does not go unnoticed.
  if error_lines:
    print(f"[⚠️] {filename} : {len(error_lines)} ligne(s) rejetée(s) "
          f"(nombre de champs différent de {expected_fields})")

# --- Stage 2: convert every validated CSV to a bronze Parquet file -----------
valid_csv_files = glob.glob(os.path.join(valid_dir, "*.csv"))

for csv_path in valid_csv_files:
  filename = os.path.basename(csv_path)

  parquet_filename = bronze_parquet_name(filename)
  if parquet_filename is None:
    print(f"[⚠️] Fichier ignoré (pattern non reconnu) : {filename}")
    continue

  parquet_path = os.path.join(parquet_dir, parquet_filename)

  # Every column is read as text; the first kept line serves as the header
  # (read_csv default). A file in which no line passed validation is empty and
  # would abort the whole loop, so it is skipped with a warning instead.
  try:
    df = pd.read_csv(csv_path, sep=delimiter, dtype=str)
  except pd.errors.EmptyDataError:
    print(f"[⚠️] Fichier ignoré (aucune ligne valide) : {filename}")
    continue

  df.to_parquet(parquet_path, index=False)


In [None]:
# Silver step association
import pandas as pd
import os

# Silver stage: every bronze Parquet file is re-read, cleaned, and written to
# the silver directory under the same name with "df_bronze" replaced by
# "df_silver".
base_dir = "c:/Users/darka/Desktop/Projets/Elexxion"
bronze_dir = os.path.join(base_dir, "association/parquet/bronze")
silver_dir = os.path.join(base_dir, "association/parquet/silver")

os.makedirs(silver_dir, exist_ok=True)

for filename in os.listdir(bronze_dir):
  # Guard clause: anything that is not a Parquet file is left untouched.
  if not filename.endswith(".parquet"):
    continue

  # Load the bronze table, drop exact duplicate rows, then drop the rows in
  # which every column is missing.
  df = (
    pd.read_parquet(os.path.join(bronze_dir, filename))
    .drop_duplicates()
    .dropna(how='all')
  )

  # Persist the cleaned table under its silver name.
  silver_filename = filename.replace("df_bronze", "df_silver")
  df.to_parquet(os.path.join(silver_dir, silver_filename), index=False)
  print(f"[✅] Fichier Silver sauvegardé : {silver_filename}")
