In [0]:
# 02_Utils - helper functions for metadata-driven CSV -> Parquet ETL
# Mostly pure helpers; note that read_sql_metadata (JDBC read) and
# write_parquet_by_year (Parquet write) perform I/O when they are called.

from pyspark.sql import DataFrame
from pyspark.sql.functions import trim, col, year, to_timestamp, coalesce, lit
import datetime

def read_sql_metadata(jdbc_url: str, jdbc_user: str, jdbc_pass: str, table_name: str = "dbo.sql_table_metadata") -> DataFrame:
    """
    Load the metadata table from Azure SQL over JDBC and return it as a Spark DataFrame.

    Relies on a `spark` session object being available in the enclosing scope
    (it is not defined inside this function).

    Parameters:
      - jdbc_url: JDBC connection URL passed as the "url" option
      - jdbc_user: login passed as the "user" option
      - jdbc_pass: password passed as the "password" option
      - table_name: value of the "dbtable" option (default "dbo.sql_table_metadata")

    Expected columns of the metadata table:
      - file_name (varchar)
      - column_list (comma-separated string)
      - year_column (optional)
      - table_name (optional)
    """
    # Options are applied in this fixed order, one .option() call each.
    jdbc_options = (
        ("url", jdbc_url),
        ("dbtable", table_name),
        ("user", jdbc_user),
        ("password", jdbc_pass),
        ("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver"),
    )
    reader = spark.read.format("jdbc")
    for option_key, option_value in jdbc_options:
        reader = reader.option(option_key, option_value)
    return reader.load()

def metadata_to_map(meta_df: DataFrame) -> dict:
    """
    Collect the metadata DataFrame to the driver and index it by file name:
      { file_name -> {"columns": [...], "year_column": "...", "table_name": "..." } }

    Per row:
      - "columns": column_list split on ',' with each piece stripped; empty pieces dropped
        (a missing/None/empty column_list yields []).
      - "year_column": the row's year_column value, or None when absent.
      - "table_name": the row's table_name when truthy; otherwise the part of file_name
        before its first '.', or None when file_name is empty/None.

    Rows sharing the same file_name overwrite one another (the last collected row wins).
    """
    mapping = {}
    for row in meta_df.collect():
        record = row.asDict()
        file_name = record.get("file_name")

        raw_columns = record.get("column_list") or ""
        stripped = (piece.strip() for piece in raw_columns.split(","))
        columns = [name for name in stripped if name]

        # An explicit table name wins; otherwise derive one from the file name.
        table = record.get("table_name")
        if not table:
            table = file_name.split(".")[0] if file_name else None

        mapping[file_name] = {
            "columns": columns,
            "year_column": record.get("year_column"),
            "table_name": table,
        }
    return mapping

def add_headers(df: DataFrame, columns: list) -> DataFrame:
    """
    Rename the DataFrame's columns positionally from the metadata column list.

    - Position i takes columns[i] while the metadata list has an entry for it.
    - Positions beyond the metadata list get the fallback name _c<i>.
    - Metadata entries beyond the DataFrame's width are ignored.
    """
    width = len(df.columns)
    named = list(columns[:width])
    # Fallback names continue at the index where the metadata list ran out.
    fallback = [f"_c{idx}" for idx in range(len(named), width)]
    return df.toDF(*named, *fallback)

def _with_parsed_year(df: DataFrame, source_column: str) -> DataFrame:
    """
    Add '_year' as year(to_timestamp(source_column)); if building that column
    raises, fall back to casting source_column to int.
    """
    try:
        return df.withColumn("_year", year(to_timestamp(col(source_column))))
    except Exception:
        # Deliberate best-effort fallback kept from the original logic.
        # NOTE(review): this only catches errors raised while the column is being
        # defined; values that fail to parse at execution time presumably surface
        # as nulls (and then get the current year below) - confirm against the
        # cluster's Spark/ANSI settings.
        return df.withColumn("_year", col(source_column).cast("int"))


def extract_year_column(df: DataFrame, year_column_hint: str = None) -> (DataFrame, str):
    """
    Determine a year column and return (df_with__year, used_year_column_name).
    Logic:
      1. If year_column_hint is provided and exists in df.columns (exact match),
         use it: year of its timestamp conversion, with an int-cast fallback.
      2. Else pick the first column whose name contains 'year' (case-insensitive)
         and cast it to int.
      3. Else pick the first column whose name contains 'date' (case-insensitive)
         and extract the year from it.
      4. Else add the current UTC year as _year.
    In every case null '_year' values are replaced by the current UTC year, so the
    returned DataFrame always contains an integer column named '_year'.

    Returns:
      (DataFrame with '_year', name of the source column used or None for case 4)
    """
    # Timezone-aware replacement for datetime.utcnow(), which is deprecated
    # since Python 3.12; the resulting year is the same.
    current_year = datetime.datetime.now(datetime.timezone.utc).year
    columns = df.columns

    year_named = next((c for c in columns if "year" in c.lower()), None)
    date_named = next((c for c in columns if "date" in c.lower()), None)

    if year_column_hint and year_column_hint in columns:
        used = year_column_hint
        result = _with_parsed_year(df, used)
    elif year_named is not None:
        used = year_named
        result = df.withColumn("_year", col(used).cast("int"))
    elif date_named is not None:
        used = date_named
        result = _with_parsed_year(df, used)
    else:
        used = None
        result = df.withColumn("_year", lit(current_year))

    # Normalize to int and fill nulls with the current year.
    result = result.withColumn("_year", coalesce(col("_year").cast("int"), lit(current_year)))
    return result, used

def write_parquet_by_year(df_with_year: DataFrame, bronze_base_path: str, table_name: str,
                          compression: str = "snappy", coalesce_out: bool = True, write_mode: str = "overwrite"):
    """
    Write parquet files grouped by _year. Folder layout:
      <bronze_base_path>/<table_name>/<year>/part-xxxx.snappy.parquet

    The distinct '_year' values are collected to the driver, then each year's rows
    are filtered out, the '_year' column is dropped, and the rows are written to
    that year's folder. One line is printed per folder written.

    Parameters:
      - df_with_year: DataFrame that must contain integer column '_year'
      - bronze_base_path: ABFSS/wasbs base path (trailing slashes are stripped)
      - table_name: logical table name (folder)
      - compression: parquet compression (default snappy)
      - coalesce_out: if True coalesce(1) to produce single file per year (small data). Set False for large data.
      - write_mode: "overwrite" or "append"

    Rows whose '_year' is null are written to the folder named "None" rather than
    being dropped; callers that follow the integer-column contract never hit this.
    """
    year_rows = df_with_year.select("_year").distinct().collect()
    for y in (row["_year"] for row in year_rows):
        out_path = f"{bronze_base_path.rstrip('/')}/{table_name}/{y}"
        # Null-safe equality: with a plain '==' a null year matches no rows, so the
        # null-year rows were silently lost and an empty dataset was written.
        df_year = df_with_year.filter(col("_year").eqNullSafe(y)).drop("_year")
        if coalesce_out:
            df_year = df_year.coalesce(1)
        df_year.write.mode(write_mode).option("compression", compression).parquet(out_path)
        print(f"Wrote parquet to: {out_path}")
