In [0]:
# utils_etl.py  (FINAL FIXED VERSION)
# ETL helpers: read file metadata over JDBC, rename DataFrame columns, derive a
# `_year` column, and write one Parquet directory per year.
from pyspark.sql import SparkSession
# Module-level session used by read_sql_metadata. getOrCreate() returns the
# already-active session when there is one instead of building a new one.
spark = SparkSession.builder.getOrCreate()

from pyspark.sql import DataFrame
from pyspark.sql.functions import trim, col, year, to_timestamp, coalesce, lit
import datetime

def read_sql_metadata(jdbc_url: str, jdbc_user: str, jdbc_pass: str, table_name: str = "dbo.sql_table_metadata") -> DataFrame:
    """Load a metadata table over JDBC using the SQL Server driver.

    Args:
        jdbc_url: JDBC connection URL.
        jdbc_user: Login name passed as the ``user`` option.
        jdbc_pass: Password passed as the ``password`` option.
        table_name: Value of the ``dbtable`` option; defaults to
            ``dbo.sql_table_metadata``.

    Returns:
        The DataFrame produced by the JDBC reader's ``load()``.
    """
    # Insertion order is preserved, so the options are applied in the same
    # sequence every time.
    jdbc_options = {
        "url": jdbc_url,
        "dbtable": table_name,
        "user": jdbc_user,
        "password": jdbc_pass,
        "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    }
    reader = spark.read.format("jdbc")
    for key, value in jdbc_options.items():
        reader = reader.option(key, value)
    return reader.load()

def metadata_to_map(meta_df: DataFrame) -> dict:
    """Turn the metadata table into a lookup keyed by file name.

    Every row is read through ``asDict()``. The keys consulted are
    ``file_name``, ``column_list`` (a comma-separated string), and the
    optional ``year_column`` and ``table_name``.

    Args:
        meta_df: Metadata DataFrame. It is fully collected, so it is expected
            to be small.

    Returns:
        A dict mapping each file name to
        ``{"columns": [...], "year_column": ..., "table_name": ...}``.
        ``columns`` holds the stripped, non-empty entries of ``column_list``.
        ``table_name`` falls back to the part of the file name before the
        first ``.`` when the row has no table name. Rows with a missing or
        empty ``file_name`` are skipped.
    """
    mapping = {}
    for row in meta_df.collect():
        record = row.asDict()
        fname = record.get("file_name")
        if not fname:
            # Without a file name the entry cannot be looked up, and several
            # such rows would silently overwrite each other under one key.
            continue

        raw_columns = record.get("column_list") or ""
        columns = [name.strip() for name in raw_columns.split(",") if name.strip()]

        mapping[fname] = {
            "columns": columns,
            "year_column": record.get("year_column"),
            "table_name": record.get("table_name") or fname.split(".")[0],
        }
    return mapping

def add_headers(df: DataFrame, columns: list) -> DataFrame:
    """Rename the DataFrame's columns positionally from ``columns``.

    Args:
        df: DataFrame whose columns are to be renamed.
        columns: New names, applied in order.

    Returns:
        ``df.toDF(...)`` with the new names. Surplus entries in ``columns``
        are ignored; positions without a supplied name are called ``_c<i>``,
        where ``i`` is the zero-based column index.
    """
    width = len(df.columns)
    supplied = len(columns)
    # Names given by the caller first, then placeholders for the remainder.
    renamed = list(columns[:width]) + [f"_c{pos}" for pos in range(supplied, width)]
    return df.toDF(*renamed)

def _with_parsed_year(df, source_col):
    """Add ``_year`` from ``source_col``: timestamp parse first, int cast as fallback."""
    try:
        return df.withColumn("_year", year(to_timestamp(col(source_col))))
    except Exception:
        # Best-effort fallback: if Spark rejects the timestamp expression while
        # the column is being added, treat the column as a plain integer year.
        # NOTE(review): this only covers errors raised by withColumn itself;
        # values that fail to parse when the job runs are not caught here.
        return df.withColumn("_year", col(source_col).cast("int"))


def extract_year_column(df: DataFrame, year_column_hint: str = None):
    """Add an integer ``_year`` column to ``df`` and report its source column.

    The source column is chosen in this order:

    1. ``year_column_hint`` when it is given and present in ``df`` (parsed as
       a timestamp, with an integer cast as fallback).
    2. The first column whose name contains ``year`` (cast to int).
    3. The first column whose name contains ``date`` (parsed like the hint).
    4. No column at all: ``_year`` is the current UTC year for every row.

    Whatever the source, rows whose ``_year`` ends up null receive the
    current UTC year.

    Args:
        df: Input DataFrame.
        year_column_hint: Optional name of the column holding the year or a
            date; ignored when it is not a column of ``df``.

    Returns:
        A tuple ``(df_with_year, used)`` where ``used`` is the name of the
        column ``_year`` was derived from, or ``None`` for case 4.
    """
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    current_year = datetime.datetime.now(datetime.timezone.utc).year
    cols = df.columns
    used = None

    if year_column_hint and year_column_hint in cols:
        used = year_column_hint
        df2 = _with_parsed_year(df, used)
    else:
        year_named = [c for c in cols if "year" in c.lower()]
        date_named = [c for c in cols if "date" in c.lower()]
        if year_named:
            used = year_named[0]
            df2 = df.withColumn("_year", col(used).cast("int"))
        elif date_named:
            used = date_named[0]
            df2 = _with_parsed_year(df, used)
        else:
            df2 = df.withColumn("_year", lit(current_year))

    # Normalise to int and fill any null year with the current year.
    df2 = df2.withColumn("_year", coalesce(col("_year").cast("int"), lit(current_year)))
    return df2, used

def write_parquet_by_year(df_with_year: DataFrame, bronze_base_path: str, table_name: str,
                          compression: str = "snappy", coalesce_out: bool = True, write_mode: str = "overwrite"):
    """Write one Parquet output per distinct ``_year`` value.

    Each year's rows go to ``<bronze_base_path>/<table_name>/<year>`` (trailing
    slashes on the base path are stripped). The ``_year`` column is dropped
    before writing.

    Args:
        df_with_year: DataFrame containing a ``_year`` column.
        bronze_base_path: Root output location.
        table_name: Sub-directory name under the root.
        compression: Value of the writer's ``compression`` option.
        coalesce_out: When true, each year's data is coalesced to a single
            partition before writing.
        write_mode: Save mode passed to the writer.
    """
    distinct_rows = df_with_year.select("_year").distinct().collect()
    for row in distinct_rows:
        yr = row["_year"]
        target = f"{bronze_base_path.rstrip('/')}/{table_name}/{yr}"

        subset = df_with_year.filter(col("_year") == yr).drop("_year")
        if coalesce_out:
            subset = subset.coalesce(1)

        subset.write.mode(write_mode).option("compression", compression).parquet(target)
        print(f"Wrote parquet to: {target}")


In [0]:
# CELL 2 — add DBFS FileStore to sys.path and import utils_etl
import importlib
import sys

dbfs_pkg_path = "/dbfs/FileStore"   # directory put on sys.path; must match where utils_etl.py was written
fs_import_path = "/FileStore"       # path handed to dbutils.fs.ls for the listing below

# Prepend the package directory only once, so re-running the cell adds no duplicates.
if dbfs_pkg_path not in sys.path:
    sys.path.insert(0, dbfs_pkg_path)
    print("Inserted to sys.path:", dbfs_pkg_path)

# Quick check: show what is present in the FileStore.
print("Files in /FileStore:")
for entry in dbutils.fs.ls(fs_import_path):
    print(" -", entry.name)

# Import, then reload so a changed utils_etl.py is picked up in a running session.
try:
    import utils_etl
    utils_etl = importlib.reload(utils_etl)
    print("\nImported utils_etl OK. Functions available:")
    print([name for name in dir(utils_etl) if name[:1] != "_"])
except Exception as err:
    # Show the search path to help diagnose the failure, then re-raise it.
    print("Import failed:", err)
    print("sys.path (first 10):", sys.path[:10])
    raise


In [0]:
# Write the utils_etl source text to /FileStore/utils_etl.py.
# NOTE(review): `module_code` is not defined in any cell shown here; it must
# already hold the module source as a string when this cell runs.
# The third positional argument is presumably the overwrite flag — confirm.
dbutils.fs.put("/FileStore/utils_etl.py", module_code, True)


In [0]:
import importlib
import utils_etl

# Reload so the freshly written utils_etl.py replaces any cached module.
utils_etl = importlib.reload(utils_etl)
print("Imported OK!")
