In [29]:
# Bronze step association
import os
import glob
import pandas as pd
import re

base_dir = "c:/Users/darka/Desktop/Projets/Elexxion/"
input_dir = os.path.join(base_dir, "association/raw")
valid_dir = os.path.join(base_dir, "association/valid")
parquet_dir = os.path.join(base_dir, "association/parquet/bronze")
expected_fields = 23
delimiter = ";"

os.makedirs(valid_dir, exist_ok=True)
os.makedirs(parquet_dir, exist_ok=True)

csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

for input_path in csv_files:
  filename = os.path.basename(input_path)
  output_path = os.path.join(valid_dir, filename.replace(".csv", "_valid.csv"))

  valid_lines = []
  error_lines = []

  with open(input_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
      fields = line.strip().split(delimiter)

      if len(fields) == expected_fields:
        valid_lines.append(line.strip())
      else:
        error_lines.append((line_number, line.strip()))

  with open(output_path, "w", encoding="utf-8") as output_file:
    for line in valid_lines:
      output_file.write(line + "\n")

valid_csv_files  = glob.glob(os.path.join(valid_dir, "*.csv"))

for csv_path in valid_csv_files :
  filename = os.path.basename(csv_path)

  match = re.search(r'(\d{4})\d{4}_dpt_([0-9]{2}[A-B]?)', filename)
  if match:
    year, dpt = match.group(1), match.group(2)
    parquet_filename = f"df_bronze_association_{year}_dpt_{dpt}.parquet"
  else:
    print(f"[⚠️] Fichier ignoré (pattern non reconnu) : {filename}")
    continue

  parquet_path = os.path.join(parquet_dir, parquet_filename)

  df = pd.read_csv(csv_path, sep=delimiter, dtype=str)
  df.to_parquet(parquet_path, index=False)


[⚠️] Fichier ignoré (pattern non reconnu) : rna_import_20250501_dpt_2A_valid.csv
[⚠️] Fichier ignoré (pattern non reconnu) : rna_import_20250501_dpt_2B_valid.csv


In [30]:
# Silver step association
import pandas as pd
import os

base_dir = "c:/Users/darka/Desktop/Projets/Elexxion"
bronze_dir = os.path.join(base_dir, "association/parquet/bronze")
silver_dir = os.path.join(base_dir, "association/parquet/silver")

os.makedirs(silver_dir, exist_ok=True)

for filename in os.listdir(bronze_dir):
  if filename.endswith(".parquet"):
    file_path = os.path.join(bronze_dir, filename)
    df = pd.read_parquet(file_path)

    # Remove duplicates
    df = df.drop_duplicates()

    # Remove empty rows
    df = df.dropna(how='all')

    # Save as Parquet with '_df_silver' prefix
    silver_filename = filename.replace("df_bronze", "df_silver")
    silver_file_path = os.path.join(silver_dir, silver_filename)

    df.to_parquet(silver_file_path, index=False)
    print(f"[✅] Fichier Silver sauvegardé : {silver_filename}")


[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_01.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_02.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_03.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_04.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_05.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_06.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_07.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_08.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_09.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_10.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_11.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_12.parquet
[✅] Fichier Silver sauvegardé : df_silver_association_2025_dpt_13.parquet
[✅] Fichier Silver sauvegardé : df_sil

In [None]:
import os
import pandas as pd

root_dir = 'c:/Users/darka/Desktop/Projets/Elexxion/association/clean/'
output_dir = 'c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/'

# Create output folder if no exists
os.makedirs(output_dir, exist_ok=True)

# Recursive browsing of all files
for dirpath, _, filenames in os.walk(root_dir):
  for filename in filenames:
    if filename.lower().endswith('.csv'):
      csv_path = os.path.join(dirpath, filename)

      # Read CSV
      try:
        df = pd.read_csv(csv_path)
      except Exception as e:
        print(f"Read error {csv_path} : {e}")
        continue

      # Create output path in .parquet
      relative_path = os.path.relpath(dirpath, root_dir)
      parquet_dir = os.path.join(output_dir, relative_path)
      os.makedirs(parquet_dir, exist_ok=True)

      base_name = os.path.splitext(filename)[0]
      parquet_filename = f'df_bronze_{base_name}.parquet'
      parquet_path = os.path.join(parquet_dir, parquet_filename)

      # Save as Parquet with '_df_bronze' prefix
      try:
        df.to_parquet(parquet_path, index=False)
        print(f"Convert : {csv_path} → {parquet_path}")
      except Exception as e:
        print(f"Write error {parquet_path} : {e}")


In [None]:
import os
import pandas as pd

root_dir = 'c:/Users/darka/Desktop/Projets/Elexxion/association/clean/'
output_dir = 'c:/Users/darka/Desktop/Projets/Elexxion/association/parquet/'

# Create output folder if no exists
os.makedirs(output_dir, exist_ok=True)

# Recursive browsing of all files
for dirpath, _, filenames in os.walk(root_dir):
  for filename in filenames:
    if filename.lower().endswith('.csv'):
      csv_path = os.path.join(dirpath, filename)

      # Read CSV
      try:
        df = pd.read_csv(csv_path)
      except Exception as e:
        print(f"Read error {csv_path} : {e}")
        continue

      # Create output path in .parquet
      relative_path = os.path.relpath(dirpath, root_dir)
      parquet_dir = os.path.join(output_dir, relative_path)
      os.makedirs(parquet_dir, exist_ok=True)

      base_name = os.path.splitext(filename)[0]
      parquet_filename = f'df_bronze_{base_name}.parquet'
      parquet_path = os.path.join(parquet_dir, parquet_filename)

      # Save as Parquet with '_df_bronze' prefix
      try:
        df.to_parquet(parquet_path, index=False)
        print(f"Convert : {csv_path} → {parquet_path}")
      except Exception as e:
        print(f"Write error {parquet_path} : {e}")
