# In [None]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
import re


# --- Settings to fill in before running ------------------------------------
# The angle-bracket values are placeholders: replace each one with a real
# value. They are string literals (rather than blanks) so the file parses.
account = "<storage-account>"    # your storage account name
container = "<container>"        # your container name
date_folder = "<landing-date>"   # the landing_date where your 6 CSVs live

# ADLS Gen2 locations derived from the settings above.
raw_root    = f"abfss://{container}@{account}.dfs.core.windows.net/raw/online_retail_ii/landing_date={date_folder}"
bronze_root = f"abfss://{container}@{account}.dfs.core.windows.net/bronze"


# file base names (without .csv) → bronze subfolder name
# NOTE(review): the landing folder is described as holding 6 CSVs, but only
# three are mapped here — confirm whether the other three should be added.
tables = {
    "order_products__prior": "order_products__prior",
    "order_products__train": "order_products__train",
    "products": "products",
}

def tidy_columns(df: DataFrame) -> DataFrame:
    """Return *df* with normalised column names.

    Each column name is stripped of leading/trailing whitespace, lower-cased,
    and has every run of internal whitespace replaced by a single underscore.
    Column order, types and data are left untouched.

    Args:
        df: Frame whose column names should be cleaned.

    Returns:
        A new DataFrame with the same columns under their cleaned names.
    """
    cleaned = [re.sub(r"\s+", "_", name.strip().lower()) for name in df.columns]
    # Rename positionally in one step rather than chaining one
    # withColumnRenamed per column: a single projection instead of N, and no
    # dependence on looking columns up by name.
    return df.toDF(*cleaned)

def bronze_write(csv_name: str, bronze_name: str) -> None:
    """Load one landed CSV and write it to the bronze zone as Parquet.

    Reads ``{raw_root}/{csv_name}.csv`` (header row, inferred schema, UTF-8),
    normalises the column names with ``tidy_columns`` and appends three
    lineage columns: ``source_path`` (input file), ``ingest_ts`` (current
    timestamp) and ``landing_date`` (the module-level ``date_folder``). The
    result is written under ``{bronze_root}/{bronze_name}``, partitioned by
    ``landing_date``, and the ingested row count is printed.

    Relies on the notebook-provided ``spark`` session and the module-level
    ``raw_root``, ``bronze_root`` and ``date_folder`` settings.

    Args:
        csv_name: Base name of the CSV in the landing folder (without ``.csv``).
        bronze_name: Sub-folder of ``bronze_root`` to write into.
    """
    src = f"{raw_root}/{csv_name}.csv"
    out = f"{bronze_root}/{bronze_name}"

    raw = (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .option("encoding", "UTF-8")
        .csv(src)
    )

    df = (
        tidy_columns(raw)
        .withColumn("source_path", F.input_file_name())
        .withColumn("ingest_ts", F.current_timestamp())
        .withColumn("landing_date", F.lit(date_folder))
    )

    # The frame feeds two actions (the write and the count). Persist it so the
    # CSV is read and parsed once instead of being re-scanned for the count.
    df.persist()
    try:
        # Write partitioned by landing_date for easy multi-day loads.
        # Dynamic partition overwrite replaces only the landing_date
        # partition(s) present in this frame; the default static mode would
        # delete every previously loaded landing_date under `out`.
        (
            df.repartition(1)
            .write.mode("overwrite")
            .option("partitionOverwriteMode", "dynamic")
            .partitionBy("landing_date")
            .parquet(out)
        )
        cnt = df.count()
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()

    print(f"✅ {csv_name}.csv → {out}  (rows: {cnt})")

# Land every configured CSV in its bronze folder, in the order listed in
# `tables`; any failure propagates and stops the run before the final message.
for csv_name in tables:
    bronze_name = tables[csv_name]
    bronze_write(csv_name, bronze_name)

print("🎉 Bronze complete.")