In [0]:
# Location of the silver-layer flight market data.
SILVER_PATH = "wasbs://silver@flightdatastorage.blob.core.windows.net/flight_market/"

# Read the Delta-format data stored at SILVER_PATH; later cells derive the
# distance-group dimension from this DataFrame.
df_silver = spark.read.format("delta").load(SILVER_PATH)


###### Extract Distance Group Reference Data

In [0]:
from pyspark.sql.functions import col

# Project DISTANCE_GROUP under its dimension column name, discard rows where
# the code is null, and keep exactly one row per distinct code.
df_distance_group = (
    df_silver.select(col("DISTANCE_GROUP").alias("distance_group_code"))
    .where(col("distance_group_code").isNotNull())
    .distinct()
)


###### Generate Distance Group Surrogate Key

In [0]:
from pyspark.sql.functions import sha2, concat_ws, lit

# Build the dimension rows in a single projection:
#   * distance_group_code - carried over unchanged
#   * distance_group_key  - SHA-256 hex digest of the code's string form, so
#                           the same code always yields the same key
#   * description         - string column populated with nulls
df_dim_distance_group = df_distance_group.select(
    "*",
    sha2(col("distance_group_code").cast("string"), 256).alias("distance_group_key"),
    lit(None).cast("string").alias("description"),
)


###### Validate Distance Group Dimension Data Quality

In [0]:
# Data-quality gates for the distance-group dimension. Each check raises
# AssertionError with the offending counts so a failed run is diagnosable
# from the traceback alone.

# Every .count() is a Spark action that launches a job, so compute the total
# row count once and reuse it for both uniqueness comparisons below.
dim_row_count = df_dim_distance_group.count()

# no null natural keys
null_code_count = (
    df_dim_distance_group.filter(col("distance_group_code").isNull()).count()
)
assert null_code_count == 0, (
    f"dim_distance_group has {null_code_count} row(s) with a null distance_group_code"
)

# uniqueness
distinct_code_count = (
    df_dim_distance_group.select("distance_group_code").distinct().count()
)
assert distinct_code_count == dim_row_count, (
    "distance_group_code is not unique: "
    f"{distinct_code_count} distinct value(s) across {dim_row_count} row(s)"
)

# surrogate key uniqueness
distinct_key_count = (
    df_dim_distance_group.select("distance_group_key").distinct().count()
)
assert distinct_key_count == dim_row_count, (
    "distance_group_key is not unique: "
    f"{distinct_key_count} distinct value(s) across {dim_row_count} row(s)"
)


###### Persist Distance Group Dimension

In [0]:
# Destination of the distance-group dimension in the gold container.
GOLD_DIM_DISTANCE_GROUP_PATH = "wasbs://gold@flightdatastorage.blob.core.windows.net/dim_distance_group/"

# Write the dimension as Delta in overwrite mode, so each run replaces the
# previously saved data at this path (labelled "Type 0, deterministic" by the
# original author).
df_dim_distance_group.write.mode("overwrite").format("delta").save(
    GOLD_DIM_DISTANCE_GROUP_PATH
)
