In [0]:
# Notebook parameter: path of the JSON config file read by the cells below.
# The widget arguments are gathered in one mapping so they can be inspected
# or tweaked in a single place before the widget is registered.
_config_widget_args = {
    "name": "config_path",
    "defaultValue": "/Workspace/Users/ud3041@gmail.com/end-to-end-ETL-pipeline/databricks/bronze/config.json",
    "label": "Config File Path",
}
dbutils.widgets.text(**_config_widget_args)


In [0]:
import json
from pyspark.sql.types import *
from pyspark.sql.functions import col, regexp_extract, explode, current_timestamp

# =========================
# LOAD CONFIG
# =========================
# The config location is supplied through the "config_path" notebook widget.
config_path = dbutils.widgets.get("config_path")

# JSON is UTF-8 by specification; pin the encoding rather than relying on the
# platform default.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Fail fast, before any catalog/schema DDL runs, with one message that lists
# every missing top-level key instead of a bare KeyError on the first one.
_required_config_keys = ("catalog", "schema", "base_path", "tables")
_missing_config_keys = [key for key in _required_config_keys if key not in config]
if _missing_config_keys:
    raise KeyError(
        f"Config file {config_path} is missing required keys: {_missing_config_keys}"
    )

CATALOG = config["catalog"]
SCHEMA = config["schema"]
BASE_PATH = config["base_path"]

# Both identifiers are backtick-quoted so names containing hyphens or reserved
# words are accepted (previously only the catalog was quoted).
spark.sql(f"CREATE CATALOG IF NOT EXISTS `{CATALOG}`")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{SCHEMA}`")

# =========================
# SCHEMAS MAP
# =========================
def _fields(type_factory, *names):
    """Return one StructField per name, each with its own type_factory() instance."""
    return [StructField(name, type_factory()) for name in names]


# Explicit schema for the "overview" table: string attributes plus two flags.
overview_schema = StructType(
    _fields(
        StringType,
        "company_name",
        "company_number",
        "company_status",
        "date_of_creation",
        "jurisdiction",
        "type",
        "etag",
    )
    + _fields(BooleanType, "has_charges", "has_insolvency_history")
)

# Explicit schema for the "officers" table: a single "items" array of structs.
_officer_item = StructType(
    _fields(StringType, "name", "officer_role", "appointed_on", "nationality")
)
officers_schema = StructType([StructField("items", ArrayType(_officer_item))])

# Explicit schema for the "filing_history" table: a single "items" array of structs.
_filing_item = StructType(
    _fields(StringType, "date", "type", "description", "category")
)
filing_schema = StructType([StructField("items", ArrayType(_filing_item))])

# Lookup from table name to the schema applied when reading its JSON files.
SCHEMA_MAP = dict(
    overview=overview_schema,
    officers=officers_schema,
    filing_history=filing_schema,
)

# =========================
# PROCESS TABLES
# =========================
# For every table entry in the config: read its JSON files with the matching
# explicit schema, tag each row with a company number parsed from the file
# path, optionally explode an array column, stamp a load timestamp, and
# overwrite the corresponding Delta table.
for table in config["tables"]:

    table_name = table["name"]
    file_name = table["file"]
    explode_flag = table.get("explode", False)
    explode_column = table.get("explode_column")

    # Validate the entry before any Spark work so a config mistake produces a
    # readable error instead of a bare KeyError or an opaque Spark failure.
    if table_name not in SCHEMA_MAP:
        raise KeyError(
            f"No schema registered for table '{table_name}'; "
            f"known tables: {sorted(SCHEMA_MAP)}"
        )
    if explode_flag and not explode_column:
        raise ValueError(
            f"Table '{table_name}' sets explode=true but has no explode_column"
        )

    print(f"Processing {table_name}...")

    # NOTE(review): the regex captures the FIRST path segment made only of
    # digits/uppercase letters. If an earlier directory level (e.g. a date
    # partition) also matches, it wins over the company number -- confirm
    # against the actual directory layout under BASE_PATH.
    # withColumn replaces any "company_number" column already in the schema.
    df = (
        spark.read
        .schema(SCHEMA_MAP[table_name])
        .option("multiline", "true")
        .json(f"{BASE_PATH}/*/*/*/*/{file_name}")
        .withColumn("file_path", col("_metadata.file_path"))
        .withColumn(
            "company_number",
            regexp_extract("file_path", r'/([0-9A-Z]+)/', 1)
        )
    )

    if explode_flag:
        # One output row per array element; only the company number and the
        # element's own fields are kept.
        df = df.withColumn("exploded", explode(explode_column)).select(
            "company_number", "exploded.*"
        )

    df = df.withColumn("last_updated_ts", current_timestamp())

    # Full refresh: replace both data and schema of the target table.
    # Every identifier part is backtick-quoted so names with hyphens or
    # reserved words are accepted.
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"`{CATALOG}`.`{SCHEMA}`.`{table_name}`")
    )

print("Metadata-Driven Bronze Pipeline Completed")


In [0]:
# Debug aid: print the raw config file as stored.
# wholetext=True loads the file as a single row; without it Spark returns one
# row per line, so [0][0] would be only the first line of a multi-line JSON
# document.
config_raw = spark.read.text(config_path, wholetext=True).collect()[0][0]
print(config_raw)
