In [0]:
# ==============================
# WIDGET
# ==============================
# Registers the notebook text widget named "config_path", which carries the
# location of the JSON configuration file used by this notebook.
# NOTE(review): the default below is an absolute, user-specific workspace
# path; supply a different value through the widget when running elsewhere.
default_config_path = "/Workspace/Users/ud3041@gmail.com/end-to-end-ETL-pipeline/databricks/silver/config.json"

dbutils.widgets.text(
    name="config_path",
    defaultValue=default_config_path,
    label="Config File Path",
)

In [0]:
import json
from pyspark.sql.functions import *

# =========================
# LOAD CONFIG
# =========================
# The config file location is taken from the "config_path" notebook widget.
config_path = dbutils.widgets.get("config_path")

# JSON text is UTF-8; pin the encoding instead of relying on the locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Fail once, up front, with a message that names the file and every missing
# key, rather than a bare KeyError for whichever key happens to be read first.
# Still raises KeyError, so the failure type is unchanged.
REQUIRED_CONFIG_KEYS = ("catalog", "bronze_schema", "silver_schema")
missing_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config]
if missing_keys:
    raise KeyError(
        f"Config file {config_path!r} is missing required key(s): "
        + ", ".join(missing_keys)
    )

# Catalog and schema names used to build the fully qualified table names below.
CATALOG = config["catalog"]
BRONZE_SCHEMA = config["bronze_schema"]
SILVER_SCHEMA = config["silver_schema"]


# Read the three bronze source tables from `<catalog>`.<bronze schema>.
# NOTE(review): only the catalog is back-quoted in the qualified name; the
# schema is interpolated as-is, so it presumably needs no quoting — confirm.
bronze_namespace = f"`{CATALOG}`.{BRONZE_SCHEMA}"

overview_b = spark.table(f"{bronze_namespace}.overview")
officers_b = spark.table(f"{bronze_namespace}.officers")
filing_b = spark.table(f"{bronze_namespace}.filing_history")


# Clean the bronze overview rows and reduce them to one row per company_number.
overview_c = (
    overview_b
    # Rows without a company_number are discarded.
    .where(col("company_number").isNotNull())
    .withColumn("company_name", trim(col("company_name")))
    .withColumn("date_of_creation", to_date(col("date_of_creation")))
    # initcap normalises the casing of the status text (e.g. "active" -> "Active").
    .withColumn("company_status", initcap(col("company_status")))
    # NOTE(review): no ordering is applied before de-duplicating, so which of
    # several rows for the same company_number is kept is not controlled here.
    .dropDuplicates(["company_number"])
)


# Per-company officer statistics: total officer rows and number of distinct roles.
officer_summary = (
    officers_b
    .groupBy("company_number")
    .agg(
        count("*").alias("officer_count"),
        countDistinct("officer_role").alias("unique_roles"),
    )
)

# Per-company filing statistics: most recent filing date and total filing rows.
# The "date" column is converted with to_date before aggregating.
filing_dated = filing_b.withColumn("date", to_date(col("date")))
filing_summary = (
    filing_dated
    .groupBy("company_number")
    .agg(
        # `max` is the Spark aggregate brought in by the star import of
        # pyspark.sql.functions, not Python's builtin max.
        max("date").alias("last_filing_date"),
        count("*").alias("filing_count"),
    )
)


# Attach the officer and filing aggregates to every cleaned company row.
# Left joins keep companies that have no officer or filing rows at all. For
# those companies the aggregate columns come back NULL, so the count columns
# are set to 0: "no rows" is a count of zero, not an unknown value.
# last_filing_date is deliberately left NULL, as there is no meaningful
# default date for a company that has never filed.
company_master = (
    overview_c
    .join(officer_summary, "company_number", "left")
    .join(filing_summary, "company_number", "left")
    .fillna(0, subset=["officer_count", "unique_roles", "filing_count"])
)


# Derived columns for the company master table.

# Difference between the current calendar year and the creation year.
# NOTE(review): this is a year-number difference, not an anniversary-based
# age; it does not look at month or day.
company_age_expr = year(current_date()) - year(col("date_of_creation"))

# True only when the status is exactly "Active"; any other value, including a
# NULL status, falls through to False.
is_active_expr = when(col("company_status") == "Active", lit(True)).otherwise(lit(False))

company_master = (
    company_master
    .withColumn("company_age", company_age_expr)
    .withColumn("is_active", is_active_expr)
    .withColumn("last_updated_ts", current_timestamp())
    # Keep only rows with a non-negative age.
    # NOTE(review): rows whose company_age is NULL (no usable date_of_creation)
    # do not satisfy this predicate and are dropped as well — confirm intended.
    .where(col("company_age") >= 0)
)


# Persist the result as the Delta table `<catalog>`.<silver schema>.company_master,
# replacing any existing contents and partitioning the data by company_status.
silver_target = f"`{CATALOG}`.{SILVER_SCHEMA}.company_master"

(
    company_master.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("company_status")
    .saveAsTable(silver_target)
)
