In [0]:
# ==============================
# 03_Run - ETL runner (end-to-end)
# ==============================
# Requirements:
# - 01_Config must run first in the same Databricks job to set spark.conf and taskValues
# - 02_Utils notebook must be in same folder and will be loaded with %run
#
# This notebook:
#  - reads BASE_RAW_PATH, BASE_BRONZE_PATH, DOMAIN, LAYER and FILE_NAME from taskValues set by 01_Config
#    (falls back to notebook widgets when the taskValues cannot be read)
#  - reads metadata from Azure SQL (jdbc widget inputs)
#  - reads CSV (no header), applies header, extracts year, writes parquet to bronze/<domain>/<layer>/<table>/<year>/
# ==============================
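# Workspace path of the helper notebook:
#   /Users/u2786997@uel.ac.uk/OSBI/DataBricks/ETL-Framework/02_Utils
#
# Error message recorded during setup (note the ".py" suffix in the path):
#   Path (/Users/u2786997@uel.ac.uk/OSBI/DataBricks/ETL-Framework/02_Utils.py) doesn't exist.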
# 0) Include helper functions from 02_Utils.
# %run executes the target notebook inline so that the names it defines become
# available in this notebook's namespace. The helpers called further down
# (read_sql_metadata, metadata_to_map, add_headers, extract_year_column,
# write_parquet_by_year) are not defined in this notebook and are presumably
# provided by 02_Utils.
# NOTE(review): Databricks documents that %run must be in a cell by itself;
# wrapping it in try/except may not behave as intended - confirm.
try:
    # Absolute workspace path to 02_Utils. The commented-out relative form below
    # is the alternative for when both notebooks live in the same folder.
    %run /Users/u2786997@uel.ac.uk/OSBI/DataBricks/ETL-Framework/02_Utils
    #%run ./02_Utils
except Exception as e:
    # If %run fails, re-raise with a clearer message, keeping the original
    # exception as the cause.
    raise Exception("Could not load 02_Utils. Ensure a notebook named '02_Utils' exists in the same folder.") from e

# 1) JDBC connection widgets (ADF is expected to pass these values)
_JDBC_WIDGET_NAMES = (
    "jdbc_hostname",   # ex: sqlsrv-rm-dev.database.windows.net
    "jdbc_database",   # ex: sqldb-rm-dev
    "jdbc_user",       # ex: sqladminuser@sqlsrv-rm-dev
    "jdbc_password",   # pass the password (securely via ADF)
)

# Declare all four text widgets with an empty default before reading any of them.
for _widget_name in _JDBC_WIDGET_NAMES:
    dbutils.widgets.text(_widget_name, "")

# Read the values back in declaration order, stripping surrounding whitespace.
JDBC_HOST, JDBC_DB, JDBC_USER, JDBC_PASS = (
    dbutils.widgets.get(_widget_name).strip() for _widget_name in _JDBC_WIDGET_NAMES
)

# 2) Read values set by 01_Config (taskValues)
def _widget_or_none(name):
    """Return the value of notebook widget ``name``, or None if it cannot be read.

    The lookup is attempted directly with ``dbutils.widgets.get`` and any
    failure (for example the widget not being defined) is mapped to None, so
    the fallback does not depend on enumerating the existing widget names.
    """
    try:
        return dbutils.widgets.get(name)
    except Exception:
        return None


try:
    # NOTE(review): taskKey is documented as the name of the job task that set
    # the value; here it is the same string as the key - confirm that the
    # upstream tasks are really named this way.
    BASE_RAW_PATH = dbutils.jobs.taskValues.get(taskKey="BASE_RAW_PATH", key="BASE_RAW_PATH")
    BASE_BRONZE_PATH = dbutils.jobs.taskValues.get(taskKey="BASE_BRONZE_PATH", key="BASE_BRONZE_PATH")
    DOMAIN = dbutils.jobs.taskValues.get(taskKey="DOMAIN", key="DOMAIN")
    LAYER = dbutils.jobs.taskValues.get(taskKey="LAYER", key="LAYER")
    FILE_NAME = dbutils.jobs.taskValues.get(taskKey="FILE_NAME", key="FILE_NAME")
except Exception:
    # Fallback (e.g. when running interactively): if any taskValue lookup
    # fails, all five settings are taken from widgets instead. A setting whose
    # widget is missing becomes None and is rejected by the checks that follow.
    BASE_RAW_PATH = _widget_or_none("BASE_RAW_PATH")
    BASE_BRONZE_PATH = _widget_or_none("BASE_BRONZE_PATH")
    DOMAIN = _widget_or_none("domain")
    LAYER = _widget_or_none("layer")
    FILE_NAME = _widget_or_none("file_name")

# Safety checks: fail fast, before any SQL or storage access, when a required
# setting is missing or empty.
if not BASE_RAW_PATH or not BASE_BRONZE_PATH:
    raise Exception("BASE_RAW_PATH or BASE_BRONZE_PATH not set. Run 01_Config first (in same job) or pass widgets.")

# DOMAIN and LAYER are interpolated into the bronze output path; without this
# check a missing value would silently produce a ".../None/None" folder.
if not DOMAIN or not LAYER:
    raise Exception("DOMAIN or LAYER not set. Run 01_Config first (in same job) or pass widgets domain and layer.")

if not FILE_NAME:
    raise Exception("FILE_NAME not provided. Pass via 01_Config or as widget file_name.")

if not (JDBC_HOST and JDBC_DB and JDBC_USER and JDBC_PASS):
    raise Exception("JDBC parameters missing. Provide jdbc_hostname, jdbc_database, jdbc_user, jdbc_password as notebook widgets.")

# Echo the effective run configuration, one print call per entry.
# The JDBC password is not among the printed values.
_config_report = (
    ("RUN CONFIGURATION:",),
    (" BASE_RAW_PATH:", BASE_RAW_PATH),
    (" BASE_BRONZE_PATH:", BASE_BRONZE_PATH),
    (" DOMAIN:", DOMAIN, " LAYER:", LAYER),
    (" FILE_NAME:", FILE_NAME),
    (" JDBC_HOST:", JDBC_HOST, " JDBC_DB:", JDBC_DB, " JDBC_USER:", JDBC_USER),
)
for _report_fields in _config_report:
    print(*_report_fields)

# 3) Build JDBC URL (SQL Server on port 1433, encrypted, server certificate not blindly trusted)
jdbc_url = f"jdbc:sqlserver://{JDBC_HOST}:1433;database={JDBC_DB};encrypt=true;trustServerCertificate=false;"

# 4) Load the metadata table from SQL and select the entry for FILE_NAME
print("Reading metadata table dbo.sql_table_metadata from SQL...")
meta_df = read_sql_metadata(jdbc_url, JDBC_USER, JDBC_PASS, table_name="dbo.sql_table_metadata")
_metadata_row_count = meta_df.count()
print("Metadata rows count:", _metadata_row_count)

# The map is looked up by file name below.
meta_map = metadata_to_map(meta_df)
if FILE_NAME in meta_map:
    meta = meta_map[FILE_NAME]
else:
    raise Exception(f"Metadata for file '{FILE_NAME}' not found in dbo.sql_table_metadata. Available keys: {list(meta_map.keys())}")

columns = meta.get("columns", [])
year_hint = meta.get("year_column")

# Use the table name from metadata; when it is missing or empty, fall back to
# the part of the file name before its first dot.
table_name = meta.get("table_name")
if not table_name:
    table_name = FILE_NAME.split(".")[0]

print(f"Metadata for {FILE_NAME}: table_name={table_name} year_hint={year_hint} columns={columns}")

# 5) Build raw file path and read CSV (no header)
# Join base path and file name with exactly one "/" between them.
raw_file_path = "/".join((BASE_RAW_PATH.rstrip("/"), FILE_NAME.lstrip("/")))
print("Reading raw CSV from:", raw_file_path)

# Headerless, comma-separated, no schema inference.
_csv_reader = spark.read
for _opt_name, _opt_value in (("header", "false"), ("sep", ","), ("inferSchema", "false")):
    _csv_reader = _csv_reader.option(_opt_name, _opt_value)
df_raw = _csv_reader.csv(raw_file_path)

print("Raw schema:", df_raw.schema)
print("Raw row sample:")
_raw_sample = df_raw.limit(5)
display(_raw_sample)

# 6) Apply headers from metadata
df_named = add_headers(df_raw, columns)

# `col` is imported here together with `trim`: this notebook does not import it
# anywhere before this point, so relying on it being present would depend on
# whatever names 02_Utils happens to expose.
from pyspark.sql.functions import col, trim

# Trim whitespace on every column for safety. A single select builds one
# projection instead of one per column (chained withColumn calls in a loop can
# generate very large query plans); column order and names are unchanged.
df_named = df_named.select([trim(col(c)).alias(c) for c in df_named.columns])

print("After applying headers. Schema:")
display(df_named.limit(5))

# 7) Extract or compute year
# extract_year_column returns the resulting dataframe together with the year
# column that was used; the "_year" column is selected from the result below.
_year_result = extract_year_column(df_named, year_hint)
df_with_year, used_year = _year_result
print("Used year column:", used_year)
print("Distinct years found:")
_distinct_years_df = df_with_year.select("_year").distinct()
display(_distinct_years_df)

# 8) Write parquet to bronze path by year
_bronze_root = BASE_BRONZE_PATH.rstrip("/")
bronze_base = _bronze_root + f"/{DOMAIN}/{LAYER}"
print("Writing parquet to bronze base:", bronze_base)

# Use coalesce(1) for single file per year (small data). Set coalesce_out=False for big data.
_write_options = dict(compression="snappy", coalesce_out=True, write_mode="overwrite")
write_parquet_by_year(df_with_year, bronze_base, table_name, **_write_options)

# 9) List and show written files (for confirmation)
from pyspark.sql.functions import col as _col

# Collect the distinct "_year" values to the driver; one output folder is
# inspected per value.
_year_rows = df_with_year.select("_year").distinct().collect()
years = [_row["_year"] for _row in _year_rows]

print("Written output folders and files:")
_table_root = f"{bronze_base.rstrip('/')}/{table_name}"
for y in years:
    out_path = f"{_table_root}/{y}"
    try:
        files = dbutils.fs.ls(out_path)
        print(f" Folder: {out_path}")
        for _entry in files:
            print("  -", _entry.path)
    except Exception as e:
        # Best effort only: a folder that cannot be listed is reported, not fatal.
        print("  Could not list", out_path, "Exception:", e)

print("03_Run completed successfully.")
