In [0]:
# RUN ONCE: create utils_etl.py
module_code = r'''
# utils_etl.py
from pyspark.sql import SparkSession
# Module-level session shared by the helpers below (read_sql_metadata reads
# from it); getOrCreate() reuses an already-active session when there is one.
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import DataFrame
from pyspark.sql.functions import trim, col, year, to_timestamp, coalesce, lit
import datetime, json

def read_sql_metadata(jdbc_url: str, jdbc_user: str, jdbc_pass: str, table_name: str = "dbo.sql_table_metadata") -> DataFrame:
    """Load a table over JDBC with the SQL Server driver.

    Args:
        jdbc_url: JDBC connection URL passed as the "url" option.
        jdbc_user: Login passed as the "user" option.
        jdbc_pass: Password passed as the "password" option.
        table_name: Value for the "dbtable" option.

    Returns:
        The DataFrame produced by the JDBC reader's load().
    """
    jdbc_options = (
        ("url", jdbc_url),
        ("dbtable", table_name),
        ("user", jdbc_user),
        ("password", jdbc_pass),
        ("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver"),
    )
    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)
    return reader.load()

def _normalize_columns_from_row(row_dict: dict) -> list:
    if row_dict.get("column_list"):
        cols = [c.strip() for c in row_dict.get("column_list").split(",") if c.strip() != ""]
        if cols:
            return cols
    if row_dict.get("column_names_json"):
        try:
            parsed = json.loads(row_dict.get("column_names_json"))
            if isinstance(parsed, list):
                if all(isinstance(x, str) for x in parsed):
                    return [x.strip() for x in parsed if x and str(x).strip() != ""]
                else:
                    names = []
                    for el in parsed:
                        if isinstance(el, dict) and el.get("name"):
                            names.append(str(el.get("name")).strip())
                    if names:
                        return names
        except Exception:
            pass
    if row_dict.get("column_defs_json"):
        try:
            parsed = json.loads(row_dict.get("column_defs_json"))
            if isinstance(parsed, list):
                names = []
                for el in parsed:
                    if isinstance(el, dict):
                        for key in ("name","column_name","col_name"):
                            if el.get(key):
                                names.append(str(el.get(key)).strip())
                                break
                if names:
                    return names
        except Exception:
            pass
    return []

def metadata_to_map(meta_df: DataFrame) -> dict:
    """Collect the metadata rows and index them by table and file name.

    Each row produces one entry dict with the keys ``columns``,
    ``year_column`` and ``table_name``. The same entry object is stored
    under the stripped table name, the stripped file name, and (when the
    file name contains a dot) the part of the file name before the first dot.

    Args:
        meta_df: Metadata DataFrame; all rows are collected to the driver.

    Returns:
        Dict mapping each lookup key to its entry dict.
    """
    lookup = {}
    for row in meta_df.collect():
        fields = row.asDict()
        table_name = fields.get("table_name") or fields.get("table")
        file_name = fields.get("file_name") or fields.get("filename") or None

        # The table name wins; otherwise use the file name up to its first dot.
        if table_name:
            logical_table = table_name
        elif file_name:
            logical_table = file_name.split(".")[0]
        else:
            logical_table = None

        entry = {
            "columns": _normalize_columns_from_row(fields),
            "year_column": fields.get("year_column") or fields.get("yearcol") or fields.get("date_column") or None,
            "table_name": logical_table,
        }

        keys = []
        if table_name:
            keys.append(str(table_name).strip())
        if file_name:
            keys.append(str(file_name).strip())
            if isinstance(file_name, str) and "." in file_name:
                keys.append(file_name.split(".")[0].strip())
        for key in keys:
            lookup[key] = entry
    return lookup

def add_headers(df: DataFrame, columns: list) -> DataFrame:
    """Rename the DataFrame's columns positionally from *columns*.

    Positions beyond the end of *columns* are named ``_c<index>``; extra
    entries in *columns* are ignored.

    Args:
        df: DataFrame whose columns are renamed.
        columns: New names, in column order.

    Returns:
        The result of ``df.toDF`` with the new names.
    """
    supplied = len(columns)
    renamed = [
        columns[pos] if pos < supplied else f"_c{pos}"
        for pos in range(len(df.columns))
    ]
    return df.toDF(*renamed)

def extract_year_column(df: DataFrame, year_column_hint: str = None):
    """Add an integer ``_year`` column derived from a date-like column.

    The source column is *year_column_hint* when it names an existing
    column; otherwise the first column whose lower-cased name contains
    "modified", "year" or "date". When no source column is found, every
    row gets the current UTC year. Null years are also replaced with the
    current UTC year.

    Args:
        df: Input DataFrame.
        year_column_hint: Preferred source column name, or None.

    Returns:
        A ``(DataFrame, used)`` tuple, where ``used`` is the name of the
        source column or None when the current year was used.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # yields the same year.
    current_year = datetime.datetime.now(datetime.timezone.utc).year
    cols = df.columns

    if year_column_hint and year_column_hint in cols:
        used = year_column_hint
    else:
        markers = ("modified", "year", "date")
        used = next(
            (c for c in cols if any(marker in c.lower() for marker in markers)),
            None,
        )

    if used is None:
        df2 = df.withColumn("_year", lit(current_year))
    else:
        source = col(used)
        # NOTE(review): for an integer column holding plain years (e.g. 2021),
        # to_timestamp() presumably treats the value as epoch seconds, which
        # would give 1970 -- confirm against real data.
        try:
            df2 = df.withColumn("_year", year(to_timestamp(source)))
        except Exception:
            # Best-effort fallback kept from the original: if Spark rejects
            # the timestamp conversion, use a plain integer cast.
            df2 = df.withColumn("_year", source.cast("int"))

    df2 = df2.withColumn("_year", coalesce(col("_year").cast("int"), lit(current_year)))
    return df2, used

def write_parquet_by_year(df_with_year: DataFrame, bronze_base_path: str, table_name: str,
                          compression: str = "snappy", coalesce_out: bool = True, write_mode: str = "overwrite"):
    """Write one Parquet dataset per distinct ``_year`` value.

    Each year's rows are written, without the ``_year`` column, to
    ``<bronze_base_path>/<table_name>/<year>``.

    Args:
        df_with_year: DataFrame containing a ``_year`` column.
        bronze_base_path: Base output path; trailing slashes are stripped.
        table_name: Sub-directory name under the base path.
        compression: Value for the writer's "compression" option.
        coalesce_out: When true, coalesce each year's rows to one partition
            before writing.
        write_mode: Save mode passed to the writer.
    """
    distinct_rows = df_with_year.select("_year").distinct().collect()
    for row in distinct_rows:
        yr = row["_year"]
        out_path = f"{bronze_base_path.rstrip('/')}/{table_name}/{yr}"
        subset = df_with_year.filter(col("_year") == yr).drop("_year")
        if coalesce_out:
            subset = subset.coalesce(1)
        subset.write.mode(write_mode).option("compression", compression).parquet(out_path)
        print(f"Wrote parquet to: {out_path}")
'''
# Write the module source held in `module_code` to /FileStore/utils_etl.py.
# The third argument is presumably dbutils' overwrite flag -- confirm against
# the Databricks utilities reference.
dbutils.fs.put("/FileStore/utils_etl.py", module_code, True)
print("utils_etl.py written to /FileStore")
# Show the /FileStore listing in the cell output.
display(dbutils.fs.ls("/FileStore"))
