In [None]:
from functools import reduce
from pyspark.sql import functions as F
import re
import os
import pandas as pd

# --- Storage connection (the key lives in the cluster's environment variables) ---
# Both values are read with os.environ[...], so a missing variable raises
# KeyError here instead of failing later on the first storage access.
storage_account_name = os.environ["STORAGE_ACCOUNT_NAME"]
storage_account_key  = os.environ["STORAGE_ACCOUNT_KEY"]

# Store the account key in the Spark session configuration under the
# per-account "fs.azure.account.key.<account>.dfs.core.windows.net" setting.
# NOTE(review): `spark` is not defined in this file; presumably it is the
# SparkSession injected by the notebook runtime -- confirm.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key
)

# --- ABFSS paths ---
# All four directories are on the storage account named above. The raw_*
# directories point into the "raw" container and the bronze_* directories into
# the "bronze" container. Each path ends with "/" because the code below
# appends file names to it by plain string concatenation.
raw_plv_dir     = f"abfss://raw@{storage_account_name}.dfs.core.windows.net/parquet_plv/"
raw_res_dir     = f"abfss://raw@{storage_account_name}.dfs.core.windows.net/parquet_result/"
bronze_plv_dir  = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/plv_bronze/"
bronze_res_dir  = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/result_bronze/"

# --- Metastore / schema ---
# Switch to the hive_metastore catalog, create the `bronze` schema if it does
# not exist yet, then make it the current schema. The order matters: the
# unqualified CREATE SCHEMA runs after USE CATALOG, so it should resolve
# inside hive_metastore.
spark.sql("USE CATALOG hive_metastore")
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
spark.sql("USE SCHEMA bronze")

# Yearly extracts to ingest: 2021 through 2025 inclusive (the range end is exclusive).
years = list(range(2021, 2026))

# ======================= PLV =======================
# One raw Parquet file per year: <raw_plv_dir>dis-plv-<year>.parquet
plv_paths = [f"{raw_plv_dir}dis-plv-{year}.parquet" for year in years]

# Read each yearly file into its own DataFrame, in the same order as `years`.
plv_dfs = [spark.read.parquet(path) for path in plv_paths]

# Fold the yearly DataFrames into a single one. Columns are matched by name,
# and allowMissingColumns=True lets the yearly schemas differ.
plv_df = reduce(
    lambda merged, yearly: merged.unionByName(yearly, allowMissingColumns=True),
    plv_dfs,
)

# Replace the Delta table stored at bronze_plv_dir (data and schema) and
# register it in the metastore as hive_metastore.bronze.plv_bronze.
# The output format and save mode are given through saveAsTable's keyword
# arguments, which apply them to the writer before saving.
plv_df.write.option("overwriteSchema", "true").option(
    "path", bronze_plv_dir  # Delta storage location
).saveAsTable(
    "hive_metastore.bronze.plv_bronze",  # metastore table name
    format="delta",
    mode="overwrite",
)

# ===================== RESULT ======================
# One raw Parquet file per year: <raw_res_dir>dis_result_<year>.parquet
res_paths = [f"{raw_res_dir}dis_result_{year}.parquet" for year in years]

# Read each yearly file into its own DataFrame, in the same order as `years`.
res_dfs = [spark.read.parquet(path) for path in res_paths]

# Fold the yearly DataFrames into a single one. Columns are matched by name,
# and allowMissingColumns=True lets the yearly schemas differ.
res_df = reduce(
    lambda merged, yearly: merged.unionByName(yearly, allowMissingColumns=True),
    res_dfs,
)

# Replace the Delta table stored at bronze_res_dir (data and schema) and
# register it in the metastore as hive_metastore.bronze.result_bronze.
# The output format and save mode are given through saveAsTable's keyword
# arguments, which apply them to the writer before saving.
res_df.write.option("overwriteSchema", "true").option(
    "path", bronze_res_dir  # Delta storage location
).saveAsTable(
    "hive_metastore.bronze.result_bronze",  # metastore table name
    format="delta",
    mode="overwrite",
)
