# In [0]:
# ==============================
# WIDGET
# ==============================
# Declare the "config_path" text widget. Its value is read back further down
# with dbutils.widgets.get("config_path") and used as the path of the JSON
# config file that drives this notebook.
# NOTE(review): the default points into one specific user's workspace folder;
# confirm that this is the intended default for other users / environments.
dbutils.widgets.text(
    name="config_path",
    defaultValue="/Workspace/Users/ud3041@gmail.com/end-to-end-ETL-pipeline/medallion/bronze/yfinance.json",
    label="Config File Path"
)


# =========================
# IMPORTS
# =========================
import json
from pyspark.sql.functions import (
    col,
    input_file_name,
    current_timestamp,
    regexp_extract
)

from utils.logger import get_logger
from utils.sparksession import create_spark_session
from utils.schema import YFINANCE_SCHEMA_MAP

# =========================
# INIT LOGGER & SPARK
# =========================
# Both helpers are imported from the project's utils package. The string
# arguments are presumably the logger name and the Spark application name;
# confirm against utils.logger / utils.sparksession.
# `logger` and `spark` are module-level names used by every later section.
logger = get_logger("ds2b_yfinance_bronze")
spark = create_spark_session("DS2B | YFinance Bronze")

logger.info("Spark session initialised")

# =========================
# LOAD CONFIG
# =========================
# The config file location comes from the "config_path" notebook widget.
config_path = dbutils.widgets.get("config_path")
logger.info(f"Loading config from: {config_path}")

# JSON text is UTF-8 by specification; pin the encoding so decoding does not
# depend on the locale default of the machine running the notebook.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Fail fast with a single message that names every missing key, instead of a
# bare KeyError that only reports the first one encountered.
_missing_keys = [
    key for key in ("catalog", "schema", "base_path", "tables")
    if key not in config
]
if _missing_keys:
    raise KeyError(
        f"Config file {config_path} is missing required key(s): "
        f"{', '.join(_missing_keys)}"
    )

# Target catalog / schema for the Bronze tables, root folder of the source
# files, and the list of per-table settings iterated over further down.
CATALOG = config["catalog"]
SCHEMA = config["schema"]
BASE_PATH = config["base_path"]
TABLES = config["tables"]

logger.info(
    f"Config loaded | Catalog={CATALOG}, Schema={SCHEMA}, BasePath={BASE_PATH}"
)



# =========================
# HELPERS
# =========================

import re

def clean_column_names(df):
    """Return ``df`` with every column renamed to a normalised form.

    Each column name is lower-cased; every space, comma, semicolon, brace,
    parenthesis, newline, tab or equals sign is turned into an underscore;
    runs of underscores are collapsed to a single one; and leading/trailing
    underscores are removed. Columns are renamed positionally through
    ``df.toDF``, so their order is unchanged.
    """
    def _normalise(raw_name):
        # Substitute the listed characters first, then squeeze underscore runs.
        with_underscores = re.sub(r"[ ,;{}()\n\t=]", "_", raw_name.lower())
        return re.sub(r"_+", "_", with_underscores).strip("_")

    return df.toDF(*(_normalise(name) for name in df.columns))
# =========================
# PROCESS TABLES
# =========================
# For every table entry in the config: read its source files with the
# registered schema, add the source file path and a load timestamp to each
# row, and overwrite the matching Bronze Delta table.
for table in TABLES:

    table_name = table["name"]
    relative_path = table["path"]
    # Per-table reader settings from the config. The defaults ("csv", header
    # on) are exactly the values that used to be hard-coded in the reader, so
    # configs that omit these keys behave as before.
    file_format = table.get("format", "csv")
    header = table.get("header", True)
    # The config's "inferSchema" flag is deliberately not applied: an explicit
    # schema is always passed to the reader below.

    logger.info(f"Processing table: {table_name}")

    # Every configured table needs a schema registered under its name; raise
    # a KeyError that says which table is missing rather than a bare key.
    try:
        schema = YFINANCE_SCHEMA_MAP[table_name]
    except KeyError:
        raise KeyError(
            f"No schema registered in YFINANCE_SCHEMA_MAP for table '{table_name}'"
        ) from None

    # Source files are matched by an extension equal to the configured format.
    source_glob = f"{BASE_PATH}/{relative_path}/*.{file_format}"

    df = (
        spark.read
        .format(file_format)
        .schema(schema)
        .option("header", header)
        .option("mode", "PERMISSIVE")
        .load(source_glob)
        # Lineage columns: originating file of each row and the load time.
        .withColumn("file_path", col("_metadata.file_path"))
        .withColumn("last_updated_ts", current_timestamp())
    )

    logger.info(f"Read completed for {table_name}")

    # Full refresh: the table contents are replaced on every run, and
    # overwriteSchema lets the table schema be replaced along with the data.
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"`{CATALOG}`.{SCHEMA}.{table_name}")
    )

    logger.info(f"Written Bronze table: {table_name}")

logger.info("DS2B YFinance Bronze pipeline completed successfully")