In [0]:
# =========================
# 03_Run - ETL runner (delimiter detection + headers + parquet)
# Paste this entire content into a Databricks notebook named "03_Run"
# =========================

import sys, importlib, re, traceback, json
from pyspark.sql.functions import col, trim
from pyspark.sql import SparkSession

# Make utils_etl.py (uploaded under /FileStore) importable by putting the DBFS
# FileStore mount at the front of the module search path.
if "/dbfs/FileStore" not in sys.path:
    sys.path.insert(0, "/dbfs/FileStore")

# reload() re-executes the module, so the copy used below is the one currently
# on disk even if an older version was already imported in this session.
try:
    import utils_etl
    importlib.reload(utils_etl)
except Exception:
    print("Failed to import utils_etl - ensure /FileStore/utils_etl.py exists")
    raise
else:
    print("Imported utils_etl OK.")

# --- Widgets (ADF should pass these) ---
# (widget name, default value) pairs; the widgets are created in this order.
for _widget_name, _widget_default in (
    ("domain", "Finance"),
    ("file_name", ""),             # e.g. Sales.Currency or Sales.Currency.csv
    ("column_list", ""),           # JSON array OR CSV
    ("year_column", ""),           # e.g. ModifiedDate (optional)
    ("table_name", ""),            # folder name override (optional)
    ("direct_account_key", ""),    # required or cluster must have access
    ("BASE_RAW_PATH", ""),         # optional
    ("BASE_BRONZE_PATH", ""),      # optional
    ("include_layer", "false"),    # if you want /<layer> in path
):
    dbutils.widgets.text(_widget_name, _widget_default)


def _read_widget(name):
    """Return the current value of widget ``name`` with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


# read widgets
DOMAIN = _read_widget("domain")
FILE_NAME_IN = _read_widget("file_name")
COLUMN_LIST_WIDGET = _read_widget("column_list")
YEAR_COLUMN_WIDGET = _read_widget("year_column")
TABLE_NAME_WIDGET = _read_widget("table_name")
DIRECT_KEY = _read_widget("direct_account_key")
BASE_RAW_WIDGET = _read_widget("BASE_RAW_PATH")
BASE_BRONZE_WIDGET = _read_widget("BASE_BRONZE_PATH")
# Only these spellings (case-insensitive) turn the flag on; anything else is False.
INCLUDE_LAYER = _read_widget("include_layer").lower() in ("true", "1", "yes", "y")

# storage config
# Azure storage account name; used to build the account-key Spark setting and
# the abfss:// URLs in this notebook.
STORAGE_ACCOUNT = "scrgvkrmade"
# Container used for the default BASE_RAW_PATH and for the key smoke-test listing.
RAW_CONTAINER = "project"
# Container used for the default BASE_BRONZE_PATH.
BRONZE_CONTAINER = "bronze"

# --- helper: detect delimiter by sampling head of file ---
def _outside_quotes(sample):
    """Return ``sample`` with everything inside double-quoted spans removed."""
    kept = []
    in_quotes = False
    prev = ""
    for ch in sample:
        # A quote preceded by a backslash is an escaped quote (the CSV reader in
        # this notebook uses "\\" as its escape character), so it does not
        # open or close a quoted span.
        if ch == '"' and prev != "\\":
            in_quotes = not in_quotes
        elif not in_quotes:
            kept.append(ch)
        prev = ch
    return "".join(kept)


def detect_delimiter(file_path, sample_size=8192):
    """
    Return a best-guess field delimiter for the file at ``file_path``.

    The first ``sample_size`` bytes are read with ``dbutils.fs.head`` and the
    candidates ',', '|', ';' and tab are counted.  Characters inside
    double-quoted fields are ignored, so free-text columns that happen to
    contain commas cannot outvote the real separator.  If nothing is left
    outside quotes (for example because of a stray, unbalanced quote) the raw
    sample is counted instead.

    Ties go to the earlier candidate in the order listed above.  "," is
    returned when the sample cannot be read or contains none of the candidates.
    """
    candidates = (",", "|", ";", "\t")
    try:
        text = dbutils.fs.head(file_path, sample_size)
    except Exception as exc:
        # Best effort: keep going with the default, but say why.
        print("Warning: could not sample", file_path, "for delimiter detection; defaulting to ',':", exc)
        return ","
    text = text or ""

    unquoted = _outside_quotes(text)
    counts = {d: unquoted.count(d) for d in candidates}
    if not any(counts.values()):
        counts = {d: text.count(d) for d in candidates}

    # max() returns the first maximal key, which preserves the candidate priority.
    delim = max(counts, key=counts.get)
    return delim if counts[delim] else ","

# --- set storage key if passed (must do before abfss access) ---
def set_storage_key(k):
    """
    Register the storage account key ``k`` in the Spark session configuration.

    Does nothing when ``k`` is empty.  One pair of matching surrounding quotes
    (single or double) is removed, the result is stripped and must consist of
    20-300 base64 characters; otherwise an Exception is raised.  After the key
    is set, the root of the raw container is listed as a smoke test, and any
    failure of that listing is re-raised.
    """
    if not k:
        return
    for quote_char in ('"', "'"):
        if k.startswith(quote_char) and k.endswith(quote_char):
            k = k[1:-1]
            break
    account_key = k.strip()
    if re.fullmatch(r"[A-Za-z0-9+/=]{20,300}", account_key) is None:
        raise Exception("direct_account_key looks invalid; ensure you're passing raw account key (base64), not connection string")
    conf_name = f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net"
    spark.conf.set(conf_name, account_key)
    print("Set storage key for", STORAGE_ACCOUNT)
    # quick test
    try:
        raw_root = f"abfss://{RAW_CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/"
        display(dbutils.fs.ls(raw_root))
    except Exception:
        print("Warning: test listing raw container failed (invalid key or container).")
        raise

# Without an explicit key the notebook relies on whatever storage access the
# cluster already has.
if not DIRECT_KEY:
    print("No direct_account_key provided; cluster auth must have permissions to read ABFS.")
else:
    set_storage_key(DIRECT_KEY)

# resolve base paths
def task_or_widget(key, widget_name):
    """
    Return the job task value ``key`` or, if it cannot be read, the widget value.

    The task value is looked up with ``taskKey`` and ``key`` both set to
    ``key``; if the result has a ``value`` attribute, that attribute is
    returned.  Any exception during the lookup falls back to the stripped value
    of widget ``widget_name``.
    """
    try:
        task_value = dbutils.jobs.taskValues.get(taskKey=key, key=key)
        return getattr(task_value, "value", task_value)
    except Exception:
        return dbutils.widgets.get(widget_name).strip()

# task_or_widget already falls back to the widget of the same name when no
# task value can be read.
BASE_RAW_TASK = task_or_widget("BASE_RAW_PATH","BASE_RAW_PATH")
BASE_BRONZE_TASK = task_or_widget("BASE_BRONZE_PATH","BASE_BRONZE_PATH")


def _pick_base_path(label, task_value, widget_value, container):
    """
    Choose a base path: task value, then widget value, then the default abfss
    URL for ``container`` (only when a direct account key was supplied).
    Raises Exception when none of these is available.
    """
    for candidate in (task_value, widget_value):
        if candidate:
            return candidate
    if DIRECT_KEY:
        return f"abfss://{container}@{STORAGE_ACCOUNT}.dfs.core.windows.net"
    raise Exception(f"{label} not resolved. Provide direct_account_key or {label} widget or run 01_Config first.")


BASE_RAW_PATH = _pick_base_path("BASE_RAW_PATH", BASE_RAW_TASK, BASE_RAW_WIDGET, RAW_CONTAINER)
BASE_BRONZE_PATH = _pick_base_path("BASE_BRONZE_PATH", BASE_BRONZE_TASK, BASE_BRONZE_WIDGET, BRONZE_CONTAINER)

print("BASE_RAW_PATH:", BASE_RAW_PATH)
print("BASE_BRONZE_PATH:", BASE_BRONZE_PATH)

# --- normalize file name and folder table name ---
if not FILE_NAME_IN:
    raise Exception("file_name widget required (e.g. Sales.Currency or Sales.Currency.csv)")
file_key = FILE_NAME_IN
# Drop a trailing ".csv" (any letter case) to get the bare table key.
if file_key.lower().endswith(".csv"):
    file_key_no_ext = file_key[:-len(".csv")]
else:
    file_key_no_ext = file_key
# An explicit table_name widget overrides the name derived from the file.
folder_table_name = TABLE_NAME_WIDGET or file_key_no_ext

# --- parse column_list (accept JSON array or CSV) ---
def parse_column_list(text):
    """
    Parse a column list given as a JSON array or a comma-separated string.

    Accepted forms:
      * a JSON array, e.g. ``["a","b"]``;
      * a JSON array whose quotes were doubled, e.g. ``[""a"",""b""]``;
      * a bracketed, non-JSON list, e.g. ``[a, b]``;
      * plain CSV, optionally with quoted items, e.g. ``a, "b", 'c'``.

    Returns a list of non-empty, stripped column names.  Empty or ``None``
    input returns ``[]``.
    """
    def _clean(token):
        # Strip again after removing quotes so '" a "' becomes 'a'.
        return token.strip().strip('"').strip("'").strip()

    raw = (text or "").strip()
    if not raw:
        return []
    # Doubled quotes collapsed to single quotes.
    collapsed = raw.replace('""', '"')

    if raw.startswith("[") and raw.endswith("]"):
        # Try the text as given first: collapsing quotes would corrupt valid
        # JSON that contains an empty string ("").  dict.fromkeys drops the
        # duplicate attempt when collapsing changed nothing.
        for candidate in dict.fromkeys((raw, collapsed)):
            try:
                parsed = json.loads(candidate)
            except (ValueError, RecursionError):
                continue
            if isinstance(parsed, list):
                return [str(x).strip() for x in parsed if x and str(x).strip()]
        # Not JSON: treat the bracket contents as CSV.
        collapsed = collapsed.strip("[]")

    cleaned = (_clean(token) for token in collapsed.split(","))
    # Filter after cleaning so items like '""' do not become empty names.
    return [name for name in cleaned if name]

if not COLUMN_LIST_WIDGET:
    raise Exception("column_list missing. Pass column_list from ADF Lookup (JSON array or CSV).")
columns = parse_column_list(COLUMN_LIST_WIDGET)

if not columns:
    raise Exception("Parsed columns list is empty. Provide valid column_list.")

# Year hint: the year_column widget wins; otherwise use the first column whose
# name is "ModifiedDate" in any letter case, if there is one.
year_hint = YEAR_COLUMN_WIDGET or None
if not year_hint:
    for _candidate in columns:
        if _candidate.lower() == "modifieddate":
            year_hint = _candidate
            print("Using ModifiedDate as year_hint")
            break

# --- build raw path and detect delimiter ---
# NOTE(review): file_name is already rejected when empty earlier in this
# notebook, so the ".csv" fallback below looks unreachable - confirm intent.
if FILE_NAME_IN and FILE_NAME_IN.strip() != "":
    raw_read_name = FILE_NAME_IN
else:
    raw_read_name = file_key_no_ext + ".csv"
# Join with exactly one "/" between the base path and the file name.
raw_path = "/".join((BASE_RAW_PATH.rstrip("/"), raw_read_name.lstrip("/")))
print("Raw file path:", raw_path)

# detect delimiter by sampling first bytes
detected_sep = detect_delimiter(raw_path, sample_size=8192)
print("Detected delimiter:", repr(detected_sep))

# --- read CSV using detected delimiter and robust options ---
def _read_headerless_csv(sep):
    """Return a DataFrame over raw_path read with separator ``sep``, no header row and no schema inference."""
    reader = spark.read
    for opt_name, opt_value in (
        ("header", "false"),
        ("sep", sep),
        ("quote", '"'),
        ("escape", "\\"),
        ("multiLine", "true"),
        ("inferSchema", "false"),
    ):
        reader = reader.option(opt_name, opt_value)
    return reader.csv(raw_path)


try:
    df_raw = _read_headerless_csv(detected_sep)
    print("Raw row count:", df_raw.count())
    display(df_raw.limit(5))
except Exception:
    # Second attempt with a plain comma; if that fails too, show the traceback
    # and let the error stop the notebook.
    print("Failed to read CSV with detected delimiter; attempting fallback using comma.")
    try:
        df_raw = _read_headerless_csv(",")
        print("Fallback read succeeded (comma). Row count:", df_raw.count())
        display(df_raw.limit(5))
    except Exception:
        traceback.print_exc()
        raise

# --- apply headers and trim whitespace ---
# utils_etl.add_headers is defined outside this notebook; presumably it applies
# the names in `columns` to the positional columns of df_raw - confirm there.
df_named = utils_etl.add_headers(df_raw, columns)
# Trim every column in one projection. Calling withColumn once per column adds
# a projection each time and can produce very large query plans on wide tables.
df_named = df_named.select([trim(col(c)).alias(c) for c in df_named.columns])
print("After applying headers sample:")
display(df_named.limit(5))

# --- extract year and write parquet by year (folder structure: /<DOMAIN>/<table>/<year>) ---
# extract_year_column returns the DataFrame (carrying a "_year" column, which is
# selected below) together with the name of the column it used.
df_with_year, used_year = utils_etl.extract_year_column(df_named, year_hint)
print("Year column used:", used_year)
print("Distinct years:")
display(df_with_year.select("_year").distinct())

# Output root is <bronze base>/<DOMAIN>, plus a /Bronze level when include_layer is on.
bronze_base = f"{BASE_BRONZE_PATH.rstrip('/')}/{DOMAIN}"
if INCLUDE_LAYER:
    bronze_base += "/Bronze"

print("Writing parquet to:", bronze_base)
utils_etl.write_parquet_by_year(
    df_with_year,
    bronze_base,
    folder_table_name,
    compression="snappy",
    coalesce_out=True,
    write_mode="overwrite",
)

# --- confirm outputs ---
years = [row["_year"] for row in df_with_year.select("_year").distinct().collect()]
print("Outputs written for years:", years)
_table_root = f"{bronze_base.rstrip('/')}/{folder_table_name}"
for year_value in years:
    out_path = f"{_table_root}/{year_value}"
    print("Listing:", out_path)
    # A folder that cannot be listed is reported but does not stop the run.
    try:
        for entry in dbutils.fs.ls(out_path):
            print(" -", entry.path)
    except Exception as list_error:
        print("Could not list:", out_path, list_error)

print("03_Run finished.")
