In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number


### Clean and Transform `tag.txt` for Silver Layer

Steps:
1. Filter out records with null `version`
2. Drop unneeded columns: `year`, `quarter`
3. Assign a unique, consecutive `tag_id` to each distinct (`tag`, `version`) pair using `row_number` over a window ordered by `tag`, `version`
4. Join this surrogate key back to the original dataset
5. Drop less informative columns: `custom`, `abstract`, `crdr`


In [0]:
# Corrected
def tag_transform(tag_df):
    """Clean the raw tag data and attach a surrogate ``tag_id`` key.

    Steps:
      1. Keep only rows whose ``version`` is not null.
      2. Drop the ``year``, ``quarter``, ``custom``, ``abstract`` and ``crdr``
         columns (columns that are not present are simply ignored by Spark).
      3. Number every distinct (``tag``, ``version``) pair 1..N with
         ``row_number`` over a window ordered by ``tag``, ``version``.
      4. Join that numbering back so every row carries its ``tag_id``.

    Args:
        tag_df: Spark DataFrame containing at least ``tag`` and ``version``.

    Returns:
        A DataFrame whose columns are ``tag_id``, ``tag``, ``version`` followed
        by the remaining input columns in their original order.

    Notes:
        * The join back is null-safe, so rows whose ``tag`` is null still
          receive the id of their (null, version) pair instead of a null id.
        * The window has no ``partitionBy``, so Spark evaluates it on a single
          partition; only the distinct key pairs are numbered to keep that
          step small.
        * Ids reflect the sort position of the pairs present in the input, so
          they may change between runs when the set of pairs changes.
    """
    key_cols = ["tag", "version"]

    # Filter and prune up front so the join below carries fewer columns.
    cleaned = (
        tag_df
        .filter(col("version").isNotNull())
        .drop("year", "quarter", "custom", "abstract", "crdr")
    )

    # Number each distinct key pair. The key columns are renamed so the join
    # condition can reference both sides of this self-join unambiguously.
    key_window = Window.orderBy(*key_cols)
    tag_keys = (
        cleaned
        .select(*key_cols)
        .distinct()
        .withColumn("tag_id", row_number().over(key_window))
        .select(
            col("tag").alias("_key_tag"),
            col("version").alias("_key_version"),
            col("tag_id"),
        )
    )

    # eqNullSafe treats NULL == NULL as a match, unlike a plain equality join.
    same_key = (
        col("tag").eqNullSafe(col("_key_tag"))
        & col("version").eqNullSafe(col("_key_version"))
    )
    keyed = (
        cleaned
        .join(tag_keys, on=same_key, how="left")
        .drop("_key_tag", "_key_version")
    )

    # Surrogate key first, then the natural key, then everything else.
    leading = ["tag_id"] + key_cols
    trailing = [c for c in keyed.columns if c not in leading]
    return keyed.select(*leading, *trailing)


### Load `tag.txt` from Bronze Layer

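We load the raw tag data (originally `tag.txt`) from the Bronze layer; it defines metadata about the financial tags used in filings (e.g., tag name, version, labels, and taxonomy). The cell then applies `tag_transform` and previews the first 20 rows.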


In [0]:
# Read the Bronze tag data from its warehouse path and transform it in one go.
# NOTE(review): no .format() is set, so the session's default data source is
# used; `header` / `inferSchema` are CSV-reader options and presumably have no
# effect unless that default is CSV — confirm.
tag_df = tag_transform(
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/user/hive/warehouse/bronzes.db/tags")
)

# Preview the first 20 transformed rows.
display(tag_df.limit(20))

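### Quick Validations

- Count the distinct (`tag`, `version`) pairs in the transformed data
- Confirm the highest assigned `tag_id` — it should equal that count, since `tag_id` values are assigned consecutively starting at 1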


In [0]:
# Number of distinct (tag, version) pairs in the transformed data.
(
    tag_df
    .select(col("tag"), col("version"))
    .distinct()
    .count()
)

In [0]:
# Show the single row carrying the largest tag_id.
display(
    tag_df
    .orderBy("tag_id", ascending=False)
    .limit(1)
)

In [0]:
%sql
-- Remove any existing silver.tags table before it is written again.
DROP TABLE IF EXISTS silver.tags

### Save Transformed `tag.txt` to Silver Delta Table

Write the cleaned data into the Silver layer as a Delta table for downstream reference and joins, then read the table back and display it to confirm the write.


In [0]:
# Persist the transformed tags as the silver.tags Delta table, replacing any
# existing contents.
(
    tag_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.tags")
)

# Read the table back and display it to check what was written.
tag_df_loaded = (
    spark.read
    .format("delta")
    .table("silver.tags")
)
display(tag_df_loaded)