In [0]:
# =========================
# 03_Run - CSV -> Bronze
# Beginner friendly - no external utils required
# =========================

import json, uuid, datetime, traceback
from pyspark.sql.functions import col, trim, current_timestamp, current_date, lit, to_timestamp, year

# Widgets: every notebook parameter is a free-text widget, registered in the
# order listed below as (widget name, default value).
_WIDGET_DEFAULTS = (
    ("domain", ""),
    ("file_name", ""),            # Sales.Currency or Sales.Currency.csv
    ("column_list", ""),          # JSON array or CSV of headers
    ("year_column", ""),          # optional hint, e.g. ModifiedDate
    ("table_name", ""),           # optional folder name override
    ("batch_name", ""),           # optional
    ("direct_account_key", ""),   # optional storage key
    ("BASE_RAW_PATH", ""),        # e.g. abfss://project@acct.dfs.core.windows.net
    ("BASE_BRONZE_PATH", ""),     # e.g. abfss://bronze@acct.dfs.core.windows.net
    ("include_layer", "false"),   # true to include /Bronze
)
for _widget_name, _widget_default in _WIDGET_DEFAULTS:
    dbutils.widgets.text(_widget_name, _widget_default)

# Read widgets
def _widget_value(name):
    """Return the current text of widget *name* with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


DOMAIN = _widget_value("domain")
FILE_NAME_IN = _widget_value("file_name")
COLUMN_LIST_WIDGET = _widget_value("column_list")
YEAR_COLUMN_WIDGET = _widget_value("year_column")
TABLE_NAME_WIDGET = _widget_value("table_name")
BATCH_NAME_WIDGET = _widget_value("batch_name")
DIRECT_KEY = _widget_value("direct_account_key")
BASE_RAW_WIDGET = _widget_value("BASE_RAW_PATH")
BASE_BRONZE_WIDGET = _widget_value("BASE_BRONZE_PATH")
# Any of these spellings (case-insensitive) switches the /Bronze layer folder on.
INCLUDE_LAYER = _widget_value("include_layer").lower() in ("true", "1", "yes", "y")

# Safe defaults / simple validation: file_name is the only input checked here.
if FILE_NAME_IN == "":
    raise RuntimeError("Provide file_name widget (e.g. Sales.Currency or Sales.Currency.csv)")

# Resolve the storage account, optionally configure its access key, then
# resolve the raw / bronze base paths.
#
# The account name is derived whether or not a key was passed: the default
# base paths below embed it, and leaving it as None would produce
# "abfss://project@None.dfs.core.windows.net".
DEFAULT_STORAGE_ACCOUNT = "scrgvkrmade"  # fallback when no widget path names an account

acct = None
# Prefer the raw path, then the bronze path; only the first value that
# contains "@" is inspected.
for _candidate_path in (BASE_RAW_WIDGET, BASE_BRONZE_WIDGET):
    if _candidate_path and "@" in _candidate_path:
        # abfss://<container>@<account>.dfs.core.windows.net -> <account>
        acct = _candidate_path.split("@", 1)[1].split(".")[0]
        break
if not acct:
    acct = DEFAULT_STORAGE_ACCOUNT

# Configure storage key if passed
if DIRECT_KEY:
    # Tolerate a key pasted with surrounding quotes.
    key = DIRECT_KEY.strip().strip('"').strip("'")
    # NOTE(review): the key is registered for `acct` only; if the raw and
    # bronze paths live on different accounts the second one is not covered.
    spark.conf.set(f"fs.azure.account.key.{acct}.dfs.core.windows.net", key)
    print("Configured storage key for account:", acct)
else:
    print("No storage key passed; relying on cluster identity or mounts.")

# Resolve raw / bronze base paths (widget value wins; otherwise use the
# default container on the resolved account).
if BASE_RAW_WIDGET:
    BASE_RAW_PATH = BASE_RAW_WIDGET.rstrip("/")
else:
    BASE_RAW_PATH = f"abfss://project@{acct}.dfs.core.windows.net"

if BASE_BRONZE_WIDGET:
    BASE_BRONZE_PATH = BASE_BRONZE_WIDGET.rstrip("/")
else:
    BASE_BRONZE_PATH = f"abfss://bronze@{acct}.dfs.core.windows.net"

print("BASE_RAW_PATH:", BASE_RAW_PATH)
print("BASE_BRONZE_PATH:", BASE_BRONZE_PATH)

# Build file & folder names
# Append ".csv" only when the caller did not already supply it (any letter case).
if FILE_NAME_IN.lower().endswith(".csv"):
    raw_filename = FILE_NAME_IN
else:
    raw_filename = f"{FILE_NAME_IN}.csv"
# File name without its 4-character extension; doubles as the default folder name.
file_key_no_ext = raw_filename[:-len(".csv")]
folder_table_name = TABLE_NAME_WIDGET or file_key_no_ext
raw_path = f"{BASE_RAW_PATH}/{raw_filename}"
print("Raw file path:", raw_path)

# Detect delimiter by sampling file head
# Candidates are listed in priority order: max() keeps the first one on a tie,
# so comma wins whenever counts are equal.
_CANDIDATE_SEPARATORS = (",", "|", ";", "\t")
sep = ","
try:
    # Only the first 8 KiB of the file is inspected.
    sample = dbutils.fs.head(raw_path, 8192)
    counts = {}
    for _candidate in _CANDIDATE_SEPARATORS:
        counts[_candidate] = sample.count(_candidate)
    _most_frequent = max(counts, key=counts.get)
    # No candidate present at all -> stay on comma.
    sep = _most_frequent if counts[_most_frequent] > 0 else ","
except Exception as sample_error:
    print("Could not sample file; defaulting to comma. Error:", sample_error)
    sep = ","
print("Detected delimiter:", repr(sep))

# Read CSV robustly (no schema inference)
def _read_raw_csv(path, delimiter):
    """Read *path* as a headerless CSV split on *delimiter*.

    Schema inference is disabled, so every column is read as a string and
    receives Spark's default positional name. Quoted fields may span lines.
    """
    return (spark.read
            .option("header", "false")
            .option("sep", delimiter)
            .option("quote", '"')
            .option("escape", "\\")
            .option("multiLine", "true")
            .option("inferSchema", "false")
            .csv(path))


try:
    df_raw = _read_raw_csv(raw_path, sep)
    # count() forces a full scan, so parse/IO problems surface here.
    print("Raw rows:", df_raw.count())
except Exception as read_error:
    if sep == ",":
        # The fallback would repeat the exact same read; surface the real error.
        raise
    print("Initial read failed; falling back to comma delimiter.")
    print("Initial read error:", repr(read_error))
    df_raw = _read_raw_csv(raw_path, ",")
    print("Fallback rows:", df_raw.count())

# Preview outside the try so a display problem cannot trigger a second read.
display(df_raw.limit(5))

# Parse column_list widget to headers
def _split_header_csv(text):
    """Split comma-separated header text, trimming names and dropping blank ones."""
    return [part.strip() for part in text.split(",") if part.strip()]


cols = []
if COLUMN_LIST_WIDGET:
    txt = COLUMN_LIST_WIDGET.strip()
    # First try JSON; anything that is not a non-empty JSON array is treated
    # as a plain comma-separated list instead.
    try:
        parsed = json.loads(txt)
    except Exception:
        parsed = None
    if isinstance(parsed, list) and parsed:
        cols = [str(item).strip() for item in parsed]
    else:
        cols = _split_header_csv(txt)

if len(cols) == 0:
    raise RuntimeError("column_list parsed empty. Provide valid column_list.")

print("Applying headers:", cols)

# Apply headers to df_raw (rename first N columns, by position)
#
# All names are assigned in one toDF() call. Renaming one column at a time
# can go wrong when a supplied header equals a default name that has not been
# renamed yet (e.g. "_c1"): the later rename would then hit both columns.
_raw_names = df_raw.columns
if len(cols) != len(_raw_names):
    # Extra headers are ignored and extra file columns keep their default
    # names; this is surfaced because it usually signals a wrong column_list
    # or a wrongly detected delimiter.
    print(f"WARNING: column_list has {len(cols)} names but the file has {len(_raw_names)} columns")
_header_count = min(len(cols), len(_raw_names))
df_raw = df_raw.toDF(*(cols[:_header_count] + _raw_names[_header_count:]))

df_named = df_raw
# Trim all string columns
# Done as one select() rather than a withColumn() per column: each withColumn
# adds another projection to the plan, which grows quickly on wide files.
# Column order and names are unchanged; non-string columns pass through as-is.
df_named = df_named.select([
    trim(col(c)).alias(c) if t == "string" else col(c)
    for c, t in df_named.dtypes
])

display(df_named.limit(5))

# Safe year extraction: only try to parse values that look like dates
from pyspark.sql.functions import regexp_replace, when

# Partition year for rows without a usable date. Computed once on the driver;
# timezone-aware replacement for the deprecated datetime.utcnow().
_fallback_year = datetime.datetime.now(datetime.timezone.utc).year

# Pick the date column: the widget hint if it exists, else "ModifiedDate".
year_col_used = None
if YEAR_COLUMN_WIDGET and YEAR_COLUMN_WIDGET in df_named.columns:
    year_col_used = YEAR_COLUMN_WIDGET
elif "ModifiedDate" in df_named.columns:
    year_col_used = "ModifiedDate"

if year_col_used:
    print("Trying to use year column:", year_col_used)
    _looks_like_date = col(year_col_used).rlike(r'^\s*\d{4}[-/]\d{1,2}[-/]\d{1,2}')
    # The guard above accepts "/" separators, but to_timestamp() without a
    # format follows string->timestamp cast rules, which expect "-". Rewrite
    # only the leading date part so any later "/" (e.g. in a zone id) is kept.
    _normalized_date = regexp_replace(
        col(year_col_used),
        r'^(\s*\d{4})[-/](\d{1,2})[-/](\d{1,2})',
        '$1-$2-$3',
    )
    # Values that do not look like dates become null (when() without otherwise()).
    # The source column itself is replaced by the parsed timestamp.
    df_named = df_named.withColumn(
        year_col_used,
        to_timestamp(when(_looks_like_date, _normalized_date)),
    )
    df_named = df_named.withColumn(
        "_year",
        when(col(year_col_used).isNotNull(), year(col(year_col_used))).otherwise(lit(_fallback_year)),
    )
else:
    df_named = df_named.withColumn("_year", lit(_fallback_year))

print("Year column used:", year_col_used)
display(df_named.select("_year").distinct())

# Add audit columns
_run_id = str(uuid.uuid4())  # fresh identifier for this execution
_job_id = ""  # optional best-effort
# (column name, expression) pairs, appended in this order.
_audit_columns = [
    ("_ingestion_ts", current_timestamp()),
    ("_ingestion_date", current_date()),
    ("_source_file", lit(raw_filename)),
    ("_source_path", lit(raw_path)),
    ("_job_id", lit(_job_id)),
    ("_run_id", lit(_run_id)),
    ("_batch_id", lit(BATCH_NAME_WIDGET or "")),
]
df_audited = df_named
for _audit_name, _audit_expr in _audit_columns:
    df_audited = df_audited.withColumn(_audit_name, _audit_expr)

display(df_audited.limit(5))

# Build bronze base path and write parquet (partition by _year)
# Layout: <bronze base>/<domain>[/Bronze]/<table>. Empty segments are dropped
# so a blank (or slash-padded) domain widget cannot put "//" into the path.
_path_segments = [BASE_BRONZE_PATH.rstrip("/"), DOMAIN.strip("/")]
if INCLUDE_LAYER:
    _path_segments.append("Bronze")
bronze_base = "/".join(segment for segment in _path_segments if segment)

out_path = f"{bronze_base}/{folder_table_name}"
print("Writing parquet to:", out_path)

# NOTE(review): mode("overwrite") on a partitioned path replaces everything
# under out_path when Spark's partition overwrite mode is the default
# ("static"), including year partitions absent from this file. Confirm intended.
(df_audited
   .write
   .mode("overwrite")
   .option("compression", "snappy")
   .partitionBy("_year")
   .parquet(out_path))

print("Write complete.")

# Validate written files
# Collect the distinct partition years from the audited frame, then list the
# matching partition folder for each one.
_year_rows = df_audited.select("_year").distinct().collect()
years = [row["_year"] for row in _year_rows]
print("Outputs written for years:", years)
for partition_year in years:
    partition_path = f"{out_path}/_year={partition_year}"
    print("Listing:", partition_path)
    try:
        partition_entries = dbutils.fs.ls(partition_path)
    except Exception as list_error:
        # Best effort only: a listing failure is reported, not raised.
        print("Could not list:", partition_path, list_error)
    else:
        for entry in partition_entries:
            print(" -", entry.path)

print("03_Run finished.")
