In [0]:
# 02_Utils - all helper functions for CSV → Parquet conversion

from pyspark.sql.functions import trim, col, year, to_timestamp, coalesce, lit

def read_metadata(jdbc_url, user, password):
    """Load the ``dbo.sql_table_metadata`` table over JDBC.

    Uses the ``spark`` session that is expected to exist in the enclosing
    notebook scope and the Microsoft SQL Server JDBC driver.

    Args:
        jdbc_url: JDBC connection URL of the database.
        user: Login name passed to the JDBC source.
        password: Password passed to the JDBC source.

    Returns:
        The DataFrame produced by the JDBC reader's ``load()``.
    """
    jdbc_options = {
        "url": jdbc_url,
        "dbtable": "dbo.sql_table_metadata",
        "user": user,
        "password": password,
        "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    }

    reader = spark.read.format("jdbc")
    # Apply the options one by one, in the order they are declared above.
    for key, value in jdbc_options.items():
        reader = reader.option(key, value)
    return reader.load()


def get_column_list(metadata_df, file_name):
    """Return the column names configured for ``file_name``.

    Looks up the first metadata row whose ``file_name`` matches and splits
    its comma-separated ``column_list`` value into trimmed names.

    Args:
        metadata_df: Metadata DataFrame with ``file_name`` and
            ``column_list`` columns.
        file_name: File whose column list is wanted.

    Returns:
        A list of column names with surrounding whitespace removed.

    Raises:
        ValueError: If no metadata row matches ``file_name``, or the
            matching row has a NULL/blank ``column_list``.
    """
    row = metadata_df.filter(metadata_df["file_name"] == file_name).first()
    # first() yields None when nothing matched; test for that explicitly.
    if row is None:
        raise ValueError(f"No metadata found for file {file_name}")

    raw_columns = row["column_list"]
    # A NULL or blank column_list would otherwise surface as an opaque
    # AttributeError or as a single empty column name.
    if raw_columns is None or not raw_columns.strip():
        raise ValueError(f"No column list defined for file {file_name}")

    return [name.strip() for name in raw_columns.split(",")]


def get_year_column(metadata_df, file_name):
    """Return the configured year column for ``file_name``, if any.

    Args:
        metadata_df: Metadata DataFrame with a ``file_name`` column and,
            optionally, a ``year_column`` column.
        file_name: File whose year column is wanted.

    Returns:
        The ``year_column`` value of the first matching metadata row, or
        ``None`` when no row matches or the metadata has no
        ``year_column`` field.
    """
    row = metadata_df.filter(metadata_df["file_name"] == file_name).first()
    # first() yields None when nothing matched; without this guard the
    # asDict() call below would raise AttributeError.
    if row is None:
        return None
    if "year_column" in row.asDict():
        return row["year_column"]
    return None


def add_headers(df, columns):
    """Rename the columns of ``df`` positionally from ``columns``.

    Positions covered by ``columns`` take the metadata name; any remaining
    positions are named ``_c<index>``. Surplus entries in ``columns``
    beyond the width of ``df`` are ignored.

    Args:
        df: DataFrame whose columns are renamed.
        columns: Ordered column names taken from the metadata.

    Returns:
        The result of ``df.toDF`` called with the new names.
    """
    width = len(df.columns)
    # Metadata names first, then positional placeholders for the rest.
    from_metadata = list(columns[:width])
    placeholders = [f"_c{pos}" for pos in range(len(columns), width)]
    return df.toDF(*(from_metadata + placeholders))


def extract_year(df, year_col):
    """Add an integer ``_year`` column derived from ``year_col``.

    When ``year_col`` names a column of ``df``, each row's year is taken
    from the value parsed as a timestamp; rows where that parse yields
    null fall back to the value cast to ``int``. When ``year_col`` is
    empty or not a column of ``df``, every row gets the sentinel 9999.

    Args:
        df: Input DataFrame.
        year_col: Name of the column holding a date/timestamp or a plain
            year, or a falsy value when there is none.

    Returns:
        ``df`` with an extra ``_year`` column.
    """
    if not year_col or year_col not in df.columns:
        return df.withColumn("_year", lit(9999))  # fallback

    source = col(year_col)
    as_int = source.cast("int")
    try:
        # Column expressions are evaluated lazily, so an unparseable value
        # does not raise here; it becomes null at execution time (with
        # ANSI mode off). coalesce() applies the int-cast fallback per row.
        parsed_year = year(to_timestamp(source))
        return df.withColumn("_year", coalesce(parsed_year, as_int))
    except Exception:
        # Best-effort: if the timestamp expression is rejected when the
        # column is added (e.g. an analysis error), use the plain int cast.
        return df.withColumn("_year", as_int)


def write_parquet(df, output_path):
    """Write ``df`` to ``output_path`` as snappy-compressed Parquet.

    The DataFrame is coalesced to one partition before writing, and the
    write uses ``overwrite`` mode. A confirmation line is printed after
    the write call returns.

    Args:
        df: DataFrame to persist.
        output_path: Destination path handed to the Parquet writer.
    """
    single_partition = df.coalesce(1)
    writer = single_partition.write.mode("overwrite")
    writer = writer.option("compression", "snappy")
    writer.parquet(output_path)
    print("PARQUET WRITTEN:", output_path)
